LoopVectorize.cpp (LLVM 8.0.1)
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR.
12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13 // of instructions in order to estimate the profitability of vectorization.
14 //
15 // The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one.
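// For example (an illustrative sketch, not code from this file), a scalar loop
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + K;
//
// is rewritten, assuming a vectorization factor of 4, so that its body operates
// on four elements of A and B at a time and the induction variable is advanced
// by 4 per iteration; any remaining iterations run in a scalar epilogue loop.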
18 //
19 // This pass has four parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 // of the vectorization.
23 // 3. InnerLoopVectorizer - A unit that performs the actual
24 // widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 // of vectorization. It decides on the optimal vector width, which
27 // can be one, if vectorization is not profitable.
28 //
29 // There is a development effort going on to migrate the loop vectorizer to the
30 // VPlan infrastructure and to introduce outer loop vectorization support (see
31 // docs/Proposal/VectorizationPlan.rst and
32 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
33 // purpose, we temporarily introduced the VPlan-native vectorization path: an
34 // alternative vectorization path that is natively implemented on top of the
35 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
36 //
37 //===----------------------------------------------------------------------===//
38 //
39 // The reduction-variable vectorization is based on the paper:
40 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
41 //
42 // Variable uniformity checks are inspired by:
43 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
44 //
45 // The interleaved access vectorization is based on the paper:
46 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
47 // Data for SIMD
48 //
49 // Other ideas/concepts are from:
50 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
51 //
52 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
53 // Vectorizing Compilers.
54 //
55 //===----------------------------------------------------------------------===//
56 
59 #include "VPRecipeBuilder.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/None.h"
69 #include "llvm/ADT/Optional.h"
70 #include "llvm/ADT/STLExtras.h"
71 #include "llvm/ADT/SetVector.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
81 #include "llvm/Analysis/CFG.h"
87 #include "llvm/Analysis/LoopInfo.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugLoc.h"
104 #include "llvm/IR/DerivedTypes.h"
105 #include "llvm/IR/DiagnosticInfo.h"
106 #include "llvm/IR/Dominators.h"
107 #include "llvm/IR/Function.h"
108 #include "llvm/IR/IRBuilder.h"
109 #include "llvm/IR/InstrTypes.h"
110 #include "llvm/IR/Instruction.h"
111 #include "llvm/IR/Instructions.h"
112 #include "llvm/IR/IntrinsicInst.h"
113 #include "llvm/IR/Intrinsics.h"
114 #include "llvm/IR/LLVMContext.h"
115 #include "llvm/IR/Metadata.h"
116 #include "llvm/IR/Module.h"
117 #include "llvm/IR/Operator.h"
118 #include "llvm/IR/Type.h"
119 #include "llvm/IR/Use.h"
120 #include "llvm/IR/User.h"
121 #include "llvm/IR/Value.h"
122 #include "llvm/IR/ValueHandle.h"
123 #include "llvm/IR/Verifier.h"
124 #include "llvm/Pass.h"
125 #include "llvm/Support/Casting.h"
127 #include "llvm/Support/Compiler.h"
128 #include "llvm/Support/Debug.h"
130 #include "llvm/Support/MathExtras.h"
137 #include <algorithm>
138 #include <cassert>
139 #include <cstdint>
140 #include <cstdlib>
141 #include <functional>
142 #include <iterator>
143 #include <limits>
144 #include <memory>
145 #include <string>
146 #include <tuple>
147 #include <utility>
148 #include <vector>
149 
150 using namespace llvm;
151 
152 #define LV_NAME "loop-vectorize"
153 #define DEBUG_TYPE LV_NAME
154 
155 /// @{
156 /// Metadata attribute names
157 static const char *const LLVMLoopVectorizeFollowupAll =
158  "llvm.loop.vectorize.followup_all";
159 static const char *const LLVMLoopVectorizeFollowupVectorized =
160  "llvm.loop.vectorize.followup_vectorized";
161 static const char *const LLVMLoopVectorizeFollowupEpilogue =
162  "llvm.loop.vectorize.followup_epilogue";
163 /// @}
164 
165 STATISTIC(LoopsVectorized, "Number of loops vectorized");
166 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
167 
168 /// Loops with a known constant trip count below this number are vectorized only
169 /// if no scalar iteration overheads are incurred.
171  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
172  cl::desc("Loops with a constant trip count that is smaller than this "
173  "value are vectorized only if no scalar iteration overheads "
174  "are incurred."));
175 
177  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
178  cl::desc("Maximize bandwidth when selecting vectorization factor which "
179  "will be determined by the smallest type in loop."));
180 
182  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
183  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
184 
185 /// An interleave-group may need masking if it resides in a block that needs
186 /// predication, or in order to mask away gaps.
188  "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
189  cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
190 
191 /// We don't interleave loops with a known constant trip count below this
192 /// number.
193 static const unsigned TinyTripCountInterleaveThreshold = 128;
194 
196  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
197  cl::desc("A flag that overrides the target's number of scalar registers."));
198 
200  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
201  cl::desc("A flag that overrides the target's number of vector registers."));
202 
204  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
205  cl::desc("A flag that overrides the target's max interleave factor for "
206  "scalar loops."));
207 
209  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
210  cl::desc("A flag that overrides the target's max interleave factor for "
211  "vectorized loops."));
212 
214  "force-target-instruction-cost", cl::init(0), cl::Hidden,
215  cl::desc("A flag that overrides the target's expected cost for "
216  "an instruction to a single constant value. Mostly "
217  "useful for getting consistent testing."));
218 
220  "small-loop-cost", cl::init(20), cl::Hidden,
221  cl::desc(
222  "The cost of a loop that is considered 'small' by the interleaver."));
223 
225  "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
226  cl::desc("Enable the use of the block frequency analysis to access PGO "
227  "heuristics minimizing code growth in cold regions and being more "
228  "aggressive in hot regions."));
229 
230 // Runtime interleave loops for load/store throughput.
232  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
233  cl::desc(
234  "Enable runtime interleaving until load/store ports are saturated"));
235 
236 /// The number of stores in a loop that are allowed to need predication.
238  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
239  cl::desc("Max number of stores to be predicated behind an if."));
240 
242  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
243  cl::desc("Count the induction variable only once when interleaving"));
244 
246  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
247  cl::desc("Enable if predication of stores during vectorization."));
248 
250  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
251  cl::desc("The maximum interleave count to use when interleaving a scalar "
252  "reduction in a nested loop."));
253 
255  "enable-vplan-native-path", cl::init(false), cl::Hidden,
256  cl::desc("Enable VPlan-native vectorization path with "
257  "support for outer loop vectorization."));
258 
259 // This flag enables the stress testing of the VPlan H-CFG construction in the
260 // VPlan-native vectorization path. It must be used in conjunction with
261 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
262 // verification of the H-CFGs built.
264  "vplan-build-stress-test", cl::init(false), cl::Hidden,
265  cl::desc(
266  "Build VPlan for every supported loop nest in the function and bail "
267  "out right after the build (stress test the VPlan H-CFG construction "
268  "in the VPlan-native vectorization path)."));
269 
270 /// A helper function for converting Scalar types to vector types.
271 /// If the incoming type is void, we return void. If the VF is 1, we return
272 /// the scalar type.
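/// For example, given an i32 scalar type and VF = 4, this returns <4 x i32>;
/// with VF = 1 it returns the scalar i32 type unchanged.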
273 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
274  if (Scalar->isVoidTy() || VF == 1)
275  return Scalar;
276  return VectorType::get(Scalar, VF);
277 }
278 
279 /// A helper function that returns the type of loaded or stored value.
281  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
282  "Expected Load or Store instruction");
283  if (auto *LI = dyn_cast<LoadInst>(I))
284  return LI->getType();
285  return cast<StoreInst>(I)->getValueOperand()->getType();
286 }
287 
288 /// A helper function that returns true if the given type is irregular. The
289 /// type is irregular if its allocated size doesn't equal the store size of an
290 /// element of the corresponding vector type at the given vectorization factor.
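/// For example, under a typical data layout an i1 element is irregular: each
/// scalar i1 is allocated a full byte, but a <4 x i1> vector has a one-byte
/// store size, so an array of i1 is not bitcast-compatible with the vector.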
291 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
292  // Determine if an array of VF elements of type Ty is "bitcast compatible"
293  // with a <VF x Ty> vector.
294  if (VF > 1) {
295  auto *VectorTy = VectorType::get(Ty, VF);
296  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
297  }
298 
299  // If the vectorization factor is one, we just check if an array of type Ty
300  // requires padding between elements.
301  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
302 }
303 
304 /// A helper function that returns the reciprocal of the block probability of
305 /// predicated blocks. If we return X, we are assuming the predicated block
306 /// will execute once for every X iterations of the loop header.
307 ///
308 /// TODO: We should use actual block probability here, if available. Currently,
309 /// we always assume predicated blocks have a 50% chance of executing.
310 static unsigned getReciprocalPredBlockProb() { return 2; }
311 
312 /// A helper function that adds a 'fast' flag to floating-point operations.
314  if (isa<FPMathOperator>(V)) {
315  FastMathFlags Flags;
316  Flags.setFast();
317  cast<Instruction>(V)->setFastMathFlags(Flags);
318  }
319  return V;
320 }
321 
322 /// A helper function that returns an integer or floating-point constant with
323 /// value C.
324 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
325  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
326  : ConstantFP::get(Ty, C);
327 }
328 
329 namespace llvm {
330 
331 /// InnerLoopVectorizer vectorizes loops which contain only one basic
332 /// block to a specified vectorization factor (VF).
333 /// This class performs the widening of scalars into vectors, or multiple
334 /// scalars. This class also implements the following features:
335 /// * It inserts an epilogue loop for handling loops that don't have iteration
336 /// counts that are known to be a multiple of the vectorization factor.
337 /// * It handles the code generation for reduction variables.
338 /// * Scalarization (implementation using scalars) of un-vectorizable
339 /// instructions.
340 /// InnerLoopVectorizer does not perform any vectorization-legality
341 /// checks, and relies on the caller to check for the different legality
342 /// aspects. The InnerLoopVectorizer relies on the
343 /// LoopVectorizationLegality class to provide information about the induction
344 /// and reduction variables that were found to a given vectorization factor.
346 public:
349  const TargetLibraryInfo *TLI,
351  OptimizationRemarkEmitter *ORE, unsigned VecWidth,
352  unsigned UnrollFactor, LoopVectorizationLegality *LVL,
354  : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
355  AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
356  Builder(PSE.getSE()->getContext()),
357  VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
358  virtual ~InnerLoopVectorizer() = default;
359 
360  /// Create a new empty loop. Unlink the old loop and connect the new one.
361  /// Return the pre-header block of the new loop.
363 
364  /// Widen a single instruction within the innermost loop.
366 
367  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
368  void fixVectorizedLoop();
369 
370  // Return true if any runtime check is added.
372 
373  /// A type for vectorized values in the new loop. Each value from the
374  /// original loop, when vectorized, is represented by UF vector values in the
375  /// new unrolled loop, where UF is the unroll factor.
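 /// For example, with UF = 2 and VF = 4, a widened value is represented by two
 /// 4-element vector values, one per unrolled part.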
377 
378  /// Vectorize a single PHINode in a block. This method handles the induction
379  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
380  /// arbitrary length vectors.
381  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
382 
383  /// A helper function to scalarize a single Instruction in the innermost loop.
384  /// Generates a sequence of scalar instances for each lane between \p MinLane
385  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
386  /// inclusive.
387  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
388  bool IfPredicateInstr);
389 
390  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
391  /// is provided, the integer induction variable will first be truncated to
392  /// the corresponding type.
393  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
394 
395  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
396  /// vector or scalar value on-demand if one is not yet available. When
397  /// vectorizing a loop, we visit the definition of an instruction before its
398  /// uses. When visiting the definition, we either vectorize or scalarize the
399  /// instruction, creating an entry for it in the corresponding map. (In some
400  /// cases, such as induction variables, we will create both vector and scalar
401  /// entries.) Then, as we encounter uses of the definition, we derive values
402  /// for each scalar or vector use unless such a value is already available.
403  /// For example, if we scalarize a definition and one of its uses is vector,
404  /// we build the required vector on-demand with an insertelement sequence
405  /// when visiting the use. Otherwise, if the use is scalar, we can use the
406  /// existing scalar definition.
407  ///
408  /// Return a value in the new loop corresponding to \p V from the original
409  /// loop at unroll index \p Part. If the value has already been vectorized,
410  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
411  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
412  /// a new vector value on-demand by inserting the scalar values into a vector
413  /// with an insertelement sequence. If the value has been neither vectorized
414  /// nor scalarized, it must be loop invariant, so we simply broadcast the
415  /// value into a vector.
416  Value *getOrCreateVectorValue(Value *V, unsigned Part);
417 
418  /// Return a value in the new loop corresponding to \p V from the original
419  /// loop at unroll and vector indices \p Instance. If the value has been
420  /// vectorized but not scalarized, the necessary extractelement instruction
421  /// will be generated.
422  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
423 
424  /// Construct the vector value of a scalarized value \p V one lane at a time.
425  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
426 
427  /// Try to vectorize the interleaved access group that \p Instr belongs to,
428  /// optionally masking the vector operations if \p BlockInMask is non-null.
430  VectorParts *BlockInMask = nullptr);
431 
432  /// Vectorize Load and Store instructions, optionally masking the vector
433  /// operations if \p BlockInMask is non-null.
435  VectorParts *BlockInMask = nullptr);
436 
437  /// Set the debug location in the builder using the debug location in
438  /// the instruction.
439  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
440 
441  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
442  void fixNonInductionPHIs(void);
443 
444 protected:
446 
447  /// A small list of PHINodes.
449 
450  /// A type for scalarized values in the new loop. Each value from the
451  /// original loop, when scalarized, is represented by UF x VF scalar values
452  /// in the new unrolled loop, where UF is the unroll factor and VF is the
453  /// vectorization factor.
455 
456  /// Set up the values of the IVs correctly when exiting the vector loop.
457  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
458  Value *CountRoundDown, Value *EndValue,
459  BasicBlock *MiddleBlock);
460 
461  /// Create a new induction variable inside L.
462  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
463  Value *Step, Instruction *DL);
464 
465  /// Handle all cross-iteration phis in the header.
466  void fixCrossIterationPHIs();
467 
468  /// Fix a first-order recurrence. This is the second phase of vectorizing
469  /// this phi node.
470  void fixFirstOrderRecurrence(PHINode *Phi);
471 
472  /// Fix a reduction cross-iteration phi. This is the second phase of
473  /// vectorizing this phi node.
474  void fixReduction(PHINode *Phi);
475 
476  /// The loop exit block may have single-value PHI nodes with some
477  /// incoming value. While vectorizing we only handle values that were
478  /// defined inside the loop; there should be one value for each
479  /// predecessor of its parent basic block. See PR14725.
480  void fixLCSSAPHIs();
481 
482  /// Iteratively sink the scalarized operands of a predicated instruction into
483  /// the block that was created for it.
484  void sinkScalarOperands(Instruction *PredInst);
485 
486  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
487  /// represented as.
489 
490  /// Insert the new loop into the loop hierarchy and pass manager
491  /// and update the analysis passes.
492  void updateAnalysis();
493 
494  /// Create a broadcast instruction. This method generates a broadcast
495  /// instruction (shuffle) for loop invariant values and for the induction
496  /// value. If this is the induction variable, then we extend it to N, N+1, ...;
497  /// this is needed because each iteration in the loop corresponds to a SIMD
498  /// element.
499  virtual Value *getBroadcastInstrs(Value *V);
500 
501  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
502  /// to each vector element of Val. The sequence starts at StartIdx.
503  /// \p Opcode is relevant for FP induction variables.
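 /// For example, with VF = 4, a splat Val of n, StartIdx = 0 and Step = 1, the
 /// result is the vector <n, n+1, n+2, n+3>.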
504  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
505  Instruction::BinaryOps Opcode =
506  Instruction::BinaryOpsEnd);
507 
508  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
509  /// variable on which to base the steps, \p Step is the size of the step, and
510  /// \p EntryVal is the value from the original loop that maps to the steps.
511  /// Note that \p EntryVal doesn't have to be an induction variable - it
512  /// can also be a truncate instruction.
513  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
514  const InductionDescriptor &ID);
515 
516  /// Create a vector induction phi node based on an existing scalar one. \p
517  /// EntryVal is the value from the original loop that maps to the vector phi
518  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
519  /// truncate instruction, instead of widening the original IV, we widen a
520  /// version of the IV truncated to \p EntryVal's type.
522  Value *Step, Instruction *EntryVal);
523 
524  /// Returns true if an instruction \p I should be scalarized instead of
525  /// vectorized for the chosen vectorization factor.
527 
528  /// Returns true if we should generate a scalar version of \p IV.
529  bool needsScalarInduction(Instruction *IV) const;
530 
531  /// If there is a cast involved in the induction variable \p ID, which should
532  /// be ignored in the vectorized loop body, this function records the
533  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
534  /// cast. We had already proved that the casted Phi is equal to the uncasted
535  /// Phi in the vectorized loop (under a runtime guard), and therefore
536  /// there is no need to vectorize the cast - the same value can be used in the
537  /// vector loop for both the Phi and the cast.
538  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
539  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
540  ///
541  /// \p EntryVal is the value from the original loop that maps to the vector
542  /// phi node and is used to distinguish what is the IV currently being
543  /// processed - original one (if \p EntryVal is a phi corresponding to the
544  /// original IV) or the "newly-created" one based on the proof mentioned above
545  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
546  /// latter case \p EntryVal is a TruncInst and we must not record anything for
547  /// that IV, but it's error-prone to expect callers of this routine to care
548  /// about that, hence this explicit parameter.
550  const Instruction *EntryVal,
551  Value *VectorLoopValue,
552  unsigned Part,
553  unsigned Lane = UINT_MAX);
554 
555  /// Generate a shuffle sequence that will reverse the vector Vec.
556  virtual Value *reverseVector(Value *Vec);
557 
558  /// Returns (and creates if needed) the original loop trip count.
559  Value *getOrCreateTripCount(Loop *NewLoop);
560 
561  /// Returns (and creates if needed) the trip count of the widened loop.
563 
564  /// Returns a bitcasted value to the requested vector type.
565  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
567  const DataLayout &DL);
568 
569  /// Emit a bypass check to see if the vector trip count is zero, including if
570  /// it overflows.
572 
573  /// Emit a bypass check to see if all of the SCEV assumptions we've
574  /// had to make are correct.
575  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
576 
577  /// Emit bypass checks to check any memory assumptions we may have made.
578  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
579 
580  /// Compute the transformed value of Index at offset StartValue using step
581  /// StepValue.
582  /// For integer induction, returns StartValue + Index * StepValue.
583  /// For pointer induction, returns StartValue[Index * StepValue].
584  /// FIXME: The newly created binary instructions should contain nsw/nuw
585  /// flags, which can be found from the original scalar operations.
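 /// For example, for an integer induction with StartValue 7 and StepValue 3, an
 /// Index of 4 is transformed to 7 + 4 * 3 = 19.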
587  const DataLayout &DL,
588  const InductionDescriptor &ID) const;
589 
590  /// Add additional metadata to \p To that was not present on \p Orig.
591  ///
592  /// Currently this is used to add the noalias annotations based on the
593  /// inserted memchecks. Use this for instructions that are *cloned* into the
594  /// vector loop.
595  void addNewMetadata(Instruction *To, const Instruction *Orig);
596 
597  /// Add metadata from one instruction to another.
598  ///
599  /// This includes both the original MDs from \p From and additional ones (\see
600  /// addNewMetadata). Use this for *newly created* instructions in the vector
601  /// loop.
603 
604  /// Similar to the previous function but it adds the metadata to a
605  /// vector of instructions.
607 
608  /// The original loop.
610 
611  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
612  /// dynamic knowledge to simplify SCEV expressions and converts them to a
613  /// more usable form.
615 
616  /// Loop Info.
618 
619  /// Dominator Tree.
621 
622  /// Alias Analysis.
624 
625  /// Target Library Info.
627 
628  /// Target Transform Info.
630 
631  /// Assumption Cache.
633 
634  /// Interface to emit optimization remarks.
636 
637  /// LoopVersioning. It's only set up (non-null) if memchecks were
638  /// used.
639  ///
640  /// This is currently only used to add no-alias metadata based on the
641  /// memchecks. The actual versioning is performed manually.
642  std::unique_ptr<LoopVersioning> LVer;
643 
644  /// The vectorization SIMD factor to use. Each vector will have this many
645  /// vector elements.
646  unsigned VF;
647 
648  /// The vectorization unroll factor to use. Each scalar is vectorized to this
649  /// many different vector instructions.
650  unsigned UF;
651 
652  /// The builder that we use
654 
655  // --- Vectorization state ---
656 
657  /// The vector-loop preheader.
659 
660  /// The scalar-loop preheader.
662 
663  /// Middle Block between the vector and the scalar.
665 
666  /// The ExitBlock of the scalar loop.
668 
669  /// The vector loop body.
671 
672  /// The scalar loop body.
674 
675  /// A list of all bypass blocks. The first block is the entry of the loop.
677 
678  /// The new Induction variable which was added to the new block.
679  PHINode *Induction = nullptr;
680 
681  /// The induction variable of the old basic block.
682  PHINode *OldInduction = nullptr;
683 
684  /// Maps values from the original loop to their corresponding values in the
685  /// vectorized loop. A key value can map to either vector values, scalar
686  /// values or both kinds of values, depending on whether the key was
687  /// vectorized and scalarized.
689 
690  /// Store instructions that were predicated.
692 
693  /// Trip count of the original loop.
694  Value *TripCount = nullptr;
695 
696  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
697  Value *VectorTripCount = nullptr;
698 
699  /// The legality analysis.
701 
702  /// The profitability analysis.
704 
705  // Record whether runtime checks are added.
706  bool AddedSafetyChecks = false;
707 
708  // Holds the end values for each induction variable. We save the end values
709  // so we can later fix-up the external users of the induction variables.
711 
712  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
713  // fixed up at the end of vector code generation.
715 };
716 
718 public:
721  const TargetLibraryInfo *TLI,
723  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
726  : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
727  UnrollFactor, LVL, CM) {}
728 
729 private:
730  Value *getBroadcastInstrs(Value *V) override;
731  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
732  Instruction::BinaryOps Opcode =
733  Instruction::BinaryOpsEnd) override;
734  Value *reverseVector(Value *Vec) override;
735 };
736 
737 } // end namespace llvm
738 
739 /// Look for a meaningful debug location on the instruction or its
740 /// operands.
742  if (!I)
743  return I;
744 
745  DebugLoc Empty;
746  if (I->getDebugLoc() != Empty)
747  return I;
748 
749  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
750  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
751  if (OpInst->getDebugLoc() != Empty)
752  return OpInst;
753  }
754 
755  return I;
756 }
757 
759  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
760  const DILocation *DIL = Inst->getDebugLoc();
761  if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
762  !isa<DbgInfoIntrinsic>(Inst)) {
763  auto NewDIL = DIL->cloneWithDuplicationFactor(UF * VF);
764  if (NewDIL)
765  B.SetCurrentDebugLocation(NewDIL.getValue());
766  else
767  LLVM_DEBUG(dbgs()
768  << "Failed to create new discriminator: "
769  << DIL->getFilename() << " Line: " << DIL->getLine());
770  }
771     else
772       B.SetCurrentDebugLocation(DIL);
773   } else
774     B.SetCurrentDebugLocation(DebugLoc());
775 }
776 
777 #ifndef NDEBUG
778 /// \return string containing a file name and a line # for the given loop.
779 static std::string getDebugLocString(const Loop *L) {
780  std::string Result;
781  if (L) {
782  raw_string_ostream OS(Result);
783  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
784  LoopDbgLoc.print(OS);
785  else
786  // Just print the module name.
787  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
788  OS.flush();
789  }
790  return Result;
791 }
792 #endif
793 
795  const Instruction *Orig) {
796  // If the loop was versioned with memchecks, add the corresponding no-alias
797  // metadata.
798  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
799  LVer->annotateInstWithNoAlias(To, Orig);
800 }
801 
803  Instruction *From) {
804  propagateMetadata(To, From);
805  addNewMetadata(To, From);
806 }
807 
809  Instruction *From) {
810  for (Value *V : To) {
811  if (Instruction *I = dyn_cast<Instruction>(V))
812  addMetadata(I, From);
813  }
814 }
815 
816 namespace llvm {
817 
818 /// LoopVectorizationCostModel - estimates the expected speedups due to
819 /// vectorization.
820 /// In many cases vectorization is not profitable. This can happen for a
821 /// number of reasons. In this class we mainly attempt to predict the
822 /// expected speedup/slowdowns due to the supported instruction set. We use the
823 /// TargetTransformInfo to query the different backends for the cost of
824 /// different operations.
826 public:
829  const TargetTransformInfo &TTI,
830  const TargetLibraryInfo *TLI, DemandedBits *DB,
833  const LoopVectorizeHints *Hints,
835  : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
836  AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
837 
838  /// \return An upper bound for the vectorization factor, or None if
839  /// vectorization should be avoided up front.
840  Optional<unsigned> computeMaxVF(bool OptForSize);
841 
842  /// \return The most profitable vectorization factor and the cost of that VF.
843  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
844  /// then this vectorization factor will be selected if vectorization is
845  /// possible.
846  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
847 
848  /// Setup cost-based decisions for user vectorization factor.
849  void selectUserVectorizationFactor(unsigned UserVF) {
850  collectUniformsAndScalars(UserVF);
851  collectInstsToScalarize(UserVF);
852  }
853 
854  /// \return The size (in bits) of the smallest and widest types in the code
855  /// that needs to be vectorized. We ignore values that remain scalar such as
856  /// 64 bit loop indices.
857  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
858 
859  /// \return The desired interleave count.
860  /// If interleave count has been specified by metadata it will be returned.
861  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
862  /// are the selected vectorization factor and the cost of the selected VF.
863  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
864  unsigned LoopCost);
865 
866  /// A memory access instruction may be vectorized in more than one way; the
867  /// form of the instruction after vectorization depends on cost.
868  /// This function makes cost-based decisions for Load/Store instructions
869  /// and collects them in a map. This decision map is used for building
870  /// the lists of loop-uniform and loop-scalar instructions.
871  /// The calculated cost is saved with the widening decision in order to
872  /// avoid redundant calculations.
873  void setCostBasedWideningDecision(unsigned VF);
874 
875  /// A struct that represents some properties of the register usage
876  /// of a loop.
877  struct RegisterUsage {
878  /// Holds the number of loop invariant values that are used in the loop.
880 
881  /// Holds the maximum number of concurrent live intervals in the loop.
882  unsigned MaxLocalUsers;
883  };
884 
885  /// \return Returns information about the register usages of the loop for the
886  /// given vectorization factors.
887  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
888 
889  /// Collect values we want to ignore in the cost model.
890  void collectValuesToIgnore();
891 
892  /// \returns The smallest bitwidth each instruction can be represented with.
893  /// The vector equivalents of these instructions should be truncated to this
894  /// type.
896  return MinBWs;
897  }
898 
899  /// \returns True if it is more profitable to scalarize instruction \p I for
900  /// vectorization factor \p VF.
901  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
902  assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
903 
904  // Cost model is not run in the VPlan-native path - return conservative
905  // result until this changes.
907  return false;
908 
909  auto Scalars = InstsToScalarize.find(VF);
910  assert(Scalars != InstsToScalarize.end() &&
911  "VF not yet analyzed for scalarization profitability");
912  return Scalars->second.find(I) != Scalars->second.end();
913  }
914 
915  /// Returns true if \p I is known to be uniform after vectorization.
916  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
917  if (VF == 1)
918  return true;
919 
920  // Cost model is not run in the VPlan-native path - return conservative
921  // result until this changes.
923  return false;
924 
925  auto UniformsPerVF = Uniforms.find(VF);
926  assert(UniformsPerVF != Uniforms.end() &&
927  "VF not yet analyzed for uniformity");
928  return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
929  }
930 
931  /// Returns true if \p I is known to be scalar after vectorization.
932  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
933  if (VF == 1)
934  return true;
935 
936  // Cost model is not run in the VPlan-native path - return conservative
937  // result until this changes.
939  return false;
940 
941  auto ScalarsPerVF = Scalars.find(VF);
942  assert(ScalarsPerVF != Scalars.end() &&
943  "Scalar values are not calculated for VF");
944  return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
945  }
946 
947  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
948  /// for vectorization factor \p VF.
949  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
950  return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
951  !isProfitableToScalarize(I, VF) &&
952  !isScalarAfterVectorization(I, VF);
953  }
954 
955  /// Decision that was taken during cost calculation for memory instruction.
958  CM_Widen, // For consecutive accesses with stride +1.
959  CM_Widen_Reverse, // For consecutive accesses with stride -1.
962  CM_Scalarize
963  };
964 
965  /// Save vectorization decision \p W and \p Cost taken by the cost model for
966  /// instruction \p I and vector width \p VF.
968  unsigned Cost) {
969  assert(VF >= 2 && "Expected VF >=2");
970  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
971  }
972 
973  /// Save vectorization decision \p W and \p Cost taken by the cost model for
974  /// interleaving group \p Grp and vector width \p VF.
975  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
976  InstWidening W, unsigned Cost) {
977  assert(VF >= 2 && "Expected VF >=2");
978  /// Broadcast this decision to all instructions inside the group.
979  /// But the cost will be assigned to one instruction only.
980  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
981  if (auto *I = Grp->getMember(i)) {
982  if (Grp->getInsertPos() == I)
983  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
984  else
985  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
986  }
987  }
988  }
989 
990  /// Return the cost model decision for the given instruction \p I and vector
991  /// width \p VF. Return CM_Unknown if this instruction did not pass
992  /// through the cost modeling.
994  assert(VF >= 2 && "Expected VF >=2");
995 
996  // Cost model is not run in the VPlan-native path - return conservative
997  // result until this changes.
999  return CM_GatherScatter;
1000 
1001  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1002  auto Itr = WideningDecisions.find(InstOnVF);
1003  if (Itr == WideningDecisions.end())
1004  return CM_Unknown;
1005  return Itr->second.first;
1006  }
1007 
1008  /// Return the vectorization cost for the given instruction \p I and vector
1009  /// width \p VF.
1010  unsigned getWideningCost(Instruction *I, unsigned VF) {
1011  assert(VF >= 2 && "Expected VF >=2");
1012  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1013  assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1014  "The cost is not calculated");
1015  return WideningDecisions[InstOnVF].second;
1016  }
1017 
1018  /// Return True if instruction \p I is an optimizable truncate whose operand
1019  /// is an induction variable. Such a truncate will be removed by adding a new
1020  /// induction variable with the destination type.
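 /// For example, a 'trunc i64 %iv to i32' whose operand is the loop's induction
 /// variable can be replaced by a new i32 induction variable, assuming the
 /// truncate is not already free for the target.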
1021  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1022  // If the instruction is not a truncate, return false.
1023  auto *Trunc = dyn_cast<TruncInst>(I);
1024  if (!Trunc)
1025  return false;
1026 
1027  // Get the source and destination types of the truncate.
1028  Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1029  Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1030 
1031  // If the truncate is free for the given types, return false. Replacing a
1032  // free truncate with an induction variable would add an induction variable
1033  // update instruction to each iteration of the loop. We exclude from this
1034  // check the primary induction variable since it will need an update
1035  // instruction regardless.
1036  Value *Op = Trunc->getOperand(0);
1037  if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1038  return false;
1039 
1040  // If the truncated value is not an induction variable, return false.
1041  return Legal->isInductionPhi(Op);
1042  }
1043 
1044  /// Collects the instructions to scalarize for each predicated instruction in
1045  /// the loop.
1046  void collectInstsToScalarize(unsigned VF);
1047 
1048  /// Collect Uniform and Scalar values for the given \p VF.
1049  /// The sets depend on CM decision for Load/Store instructions
1050  /// that may be vectorized as interleave, gather-scatter or scalarized.
1051  void collectUniformsAndScalars(unsigned VF) {
1052  // Do the analysis once.
1053  if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1054  return;
1055  setCostBasedWideningDecision(VF);
1056  collectLoopUniforms(VF);
1057  collectLoopScalars(VF);
1058  }
1059 
1060  /// Returns true if the target machine supports masked store operation
1061  /// for the given \p DataType and kind of access to \p Ptr.
1063  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1064  }
1065 
1066  /// Returns true if the target machine supports masked load operation
1067  /// for the given \p DataType and kind of access to \p Ptr.
1069  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1070  }
1071 
1072  /// Returns true if the target machine supports masked scatter operation
1073  /// for the given \p DataType.
1075  return TTI.isLegalMaskedScatter(DataType);
1076  }
1077 
1078  /// Returns true if the target machine supports masked gather operation
1079  /// for the given \p DataType.
1081  return TTI.isLegalMaskedGather(DataType);
1082  }
1083 
1084  /// Returns true if the target machine can represent \p V as a masked gather
1085  /// or scatter operation.
1087  bool LI = isa<LoadInst>(V);
1088  bool SI = isa<StoreInst>(V);
1089  if (!LI && !SI)
1090  return false;
1091  auto *Ty = getMemInstValueType(V);
1092  return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1093  }
1094 
1095  /// Returns true if \p I is an instruction that will be scalarized with
1096  /// predication. Such instructions include conditional stores and
1097  /// instructions that may divide by zero.
1098  /// If a non-zero VF has been calculated, we check if I will be scalarized
1099  /// with predication for that VF.
1100  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1101 
1102  // Returns true if \p I is an instruction that will be predicated either
1103  // through scalar predication or masked load/store or masked gather/scatter.
1104  // Superset of instructions that return true for isScalarWithPredication.
1106  if (!blockNeedsPredication(I->getParent()))
1107  return false;
1108  // Loads and stores that need some form of masked operation are predicated
1109  // instructions.
1110  if (isa<LoadInst>(I) || isa<StoreInst>(I))
1111  return Legal->isMaskRequired(I);
1112  return isScalarWithPredication(I);
1113  }
1114 
1115  /// Returns true if \p I is a memory instruction with consecutive memory
1116  /// access that can be widened.
1117  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1118 
1119  /// Returns true if \p I is a memory instruction in an interleaved-group
1120  /// of memory accesses that can be vectorized with wide vector loads/stores
1121  /// and shuffles.
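 /// For example (illustrative), loads of A[2*i] and A[2*i+1] in the same
 /// iteration form an interleaved group with factor 2 that can be widened into
 /// one wide load of A followed by shuffles.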
1122  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1123 
1124  /// Check if \p Instr belongs to any interleaved access group.
1126  return InterleaveInfo.isInterleaved(Instr);
1127  }
1128 
1129  /// Get the interleaved access group that \p Instr belongs to.
1132  return InterleaveInfo.getInterleaveGroup(Instr);
1133  }
1134 
1135  /// Returns true if an interleaved group requires a scalar iteration
1136  /// to handle accesses with gaps, and there is nothing preventing us from
1137  /// creating a scalar epilogue.
1138  bool requiresScalarEpilogue() const {
1139  return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
1140  }
1141 
1142  /// Returns true if a scalar epilogue is not allowed due to optsize.
1143  bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
1144 
1145  /// Returns true if all loop blocks should be masked to fold tail loop.
1146  bool foldTailByMasking() const { return FoldTailByMasking; }
1147 
1149  return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1150  }
1151 
1152 private:
1153  unsigned NumPredStores = 0;
1154 
1155  /// \return An upper bound for the vectorization factor, larger than zero.
1156  /// One is returned if vectorization should best be avoided due to cost.
1157  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
1158 
1159  /// The vectorization cost is a combination of the cost itself and a boolean
1160  /// indicating whether any of the contributing operations will actually
1161  /// operate on vector values after type legalization in the backend. If this
1162  /// latter value is false, then all operations will be scalarized (i.e., no
1163  /// vectorization has actually taken place).
1166  using VectorizationCostTy = std::pair<unsigned, bool>;
1167 
1168  /// Returns the expected execution cost. The unit of the cost does
1169  /// not matter because we use the 'cost' units to compare different
1170  /// vector widths. The cost that is returned is *not* normalized by
1171  /// the factor width.
1172  VectorizationCostTy expectedCost(unsigned VF);
1173 
1174  /// Returns the execution time cost of an instruction for a given vector
1175  /// width. Vector width of one means scalar.
1176  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1177 
1178  /// The cost-computation logic from getInstructionCost which provides
1179  /// the vector type as an output parameter.
1180  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1181 
1182  /// Calculate vectorization cost of memory instruction \p I.
1183  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1184 
1185  /// The cost computation for scalarized memory instruction.
1186  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1187 
1188  /// The cost computation for interleaving group of memory instructions.
1189  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1190 
1191  /// The cost computation for Gather/Scatter instruction.
1192  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1193 
1194  /// The cost computation for widening instruction \p I with consecutive
1195  /// memory access.
1196  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1197 
1198  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1199  /// Load: scalar load + broadcast.
1200  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1201  /// element)
1202  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1203 
1204  /// Returns whether the instruction is a load or store and will be emitted
1205  /// as a vector operation.
1206  bool isConsecutiveLoadOrStore(Instruction *I);
1207 
1208  /// Returns true if an artificially high cost for emulated masked memrefs
1209  /// should be used.
1210  bool useEmulatedMaskMemRefHack(Instruction *I);
1211 
1212  /// Create an analysis remark that explains why vectorization failed
1213  ///
1214  /// \p RemarkName is the identifier for the remark. \return the remark object
1215  /// that can be streamed to.
1216  OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
1217  return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1218  RemarkName, TheLoop);
1219  }
1220 
1221  /// Map of scalar integer values to the smallest bitwidth they can be legally
1222  /// represented as. The vector equivalents of these values should be truncated
1223  /// to this type.
1225 
1226  /// A type representing the costs for instructions if they were to be
1227  /// scalarized rather than vectorized. The entries are Instruction-Cost
1228  /// pairs.
1230 
1231  /// A set containing all BasicBlocks that are known to be present after
1232  /// vectorization as predicated blocks.
1233  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1234 
1235  /// Records whether it is allowed to have the original scalar loop execute at
1236  /// least once. This may be needed as a fallback loop in case runtime
1237  /// aliasing/dependence checks fail, or to handle the tail/remainder
1238  /// iterations when the trip count is unknown or is not divisible by the VF,
1239  /// or as a peel-loop to handle gaps in interleave-groups.
1240  /// Under optsize and when the trip count is very small we don't allow any
1241  /// iterations to execute in the scalar loop.
1242  bool IsScalarEpilogueAllowed = true;
1243 
1244  /// All blocks of loop are to be masked to fold tail of scalar iterations.
1245  bool FoldTailByMasking = false;
1246 
1247  /// A map holding scalar costs for different vectorization factors. The
1248  /// presence of a cost for an instruction in the mapping indicates that the
1249  /// instruction will be scalarized when vectorizing with the associated
1250  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1251  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1252 
1253  /// Holds the instructions known to be uniform after vectorization.
1254  /// The data is collected per VF.
1256 
1257  /// Holds the instructions known to be scalar after vectorization.
1258  /// The data is collected per VF.
1260 
1261  /// Holds the instructions (address computations) that are forced to be
1262  /// scalarized.
1264 
1265  /// Returns the expected difference in cost from scalarizing the expression
1266  /// feeding a predicated instruction \p PredInst. The instructions to
1267  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1268  /// non-negative return value implies the expression will be scalarized.
1269  /// Currently, only single-use chains are considered for scalarization.
1270  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1271  unsigned VF);
1272 
1273  /// Collect the instructions that are uniform after vectorization. An
1274  /// instruction is uniform if we represent it with a single scalar value in
1275  /// the vectorized loop corresponding to each vector iteration. Examples of
1276  /// uniform instructions include pointer operands of consecutive or
1277  /// interleaved memory accesses. Note that although uniformity implies an
1278  /// instruction will be scalar, the reverse is not true. In general, a
1279  /// scalarized instruction will be represented by VF scalar values in the
1280  /// vectorized loop, each corresponding to an iteration of the original
1281  /// scalar loop.
1282  void collectLoopUniforms(unsigned VF);
1283 
1284  /// Collect the instructions that are scalar after vectorization. An
1285  /// instruction is scalar if it is known to be uniform or will be scalarized
1286  /// during vectorization. Non-uniform scalarized instructions will be
1287  /// represented by VF values in the vectorized loop, each corresponding to an
1288  /// iteration of the original scalar loop.
1289  void collectLoopScalars(unsigned VF);
1290 
1291  /// Keeps cost model vectorization decision and cost for instructions.
1292  /// Right now it is used for memory instructions only.
1294  std::pair<InstWidening, unsigned>>;
1295 
1296  DecisionList WideningDecisions;
1297 
1298 public:
1299  /// The loop that we evaluate.
1301 
1302  /// Predicated scalar evolution analysis.
1304 
1305  /// Loop Info analysis.
1307 
1308  /// Vectorization legality.
1310 
1311  /// Vector target information.
1313 
1314  /// Target Library Info.
1316 
1317  /// Demanded bits analysis.
1319 
1320  /// Assumption cache.
1322 
1323  /// Interface to emit optimization remarks.
1325 
1327 
1328  /// Loop Vectorize Hint.
1330 
1331  /// The interleave access information contains groups of interleaved accesses
1332  /// with the same stride and close to each other.
1334 
1335  /// Values to ignore in the cost model.
1337 
1338  /// Values to ignore in the cost model when VF > 1.
1340 };
1341 
1342 } // end namespace llvm
1343 
1344 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1345 // vectorization. The loop needs to be annotated with #pragma omp simd
1346 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1347 // vector length information is not provided, vectorization is not considered
1348 // explicit. Interleave hints are not allowed either. These limitations will be
1349 // relaxed in the future.
1350 // Please note that we are currently forced to abuse the pragma 'clang
1351 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1352 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1353 // provides *explicit vectorization hints* (LV can bypass legal checks and
1354 // assume that vectorization is legal). However, both hints are implemented
1355 // using the same metadata (llvm.loop.vectorize, processed by
1356 // LoopVectorizeHints). This will be fixed in the future when the native IR
1357 // representation for pragma 'omp simd' is introduced.
1358 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1360  assert(!OuterLp->empty() && "This is not an outer loop");
1361  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1362 
1363  // Only outer loops with an explicit vectorization hint are supported.
1364  // Unannotated outer loops are ignored.
1365  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1366  return false;
1367 
1368  Function *Fn = OuterLp->getHeader()->getParent();
1369  if (!Hints.allowVectorization(Fn, OuterLp,
1370  true /*VectorizeOnlyWhenForced*/)) {
1371  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1372  return false;
1373  }
1374 
1375  if (!Hints.getWidth()) {
1376  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
1377  Hints.emitRemarkWithHints();
1378  return false;
1379  }
1380 
1381  if (Hints.getInterleave() > 1) {
1382  // TODO: Interleave support is future work.
1383  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1384  "outer loops.\n");
1385  Hints.emitRemarkWithHints();
1386  return false;
1387  }
1388 
1389  return true;
1390 }
1391 
1395  // Collect inner loops and outer loops without irreducible control flow. For
1396  // now, only collect outer loops that have explicit vectorization hints. If we
1397  // are stress testing the VPlan H-CFG construction, we collect the outermost
1398  // loop of every loop nest.
1399  if (L.empty() || VPlanBuildStressTest ||
1401  LoopBlocksRPO RPOT(&L);
1402  RPOT.perform(LI);
1403  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1404  V.push_back(&L);
1405  // TODO: Collect inner loops inside marked outer loops in case
1406  // vectorization fails for the outer loop. Do not invoke
1407  // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1408  // already known to be reducible. We can use an inherited attribute for
1409  // that.
1410  return;
1411  }
1412  }
1413  for (Loop *InnerL : L)
1414  collectSupportedLoops(*InnerL, LI, ORE, V);
1415 }
1416 
1417 namespace {
1418 
1419 /// The LoopVectorize Pass.
1420 struct LoopVectorize : public FunctionPass {
1421  /// Pass identification, replacement for typeid
1422  static char ID;
1423 
1424  LoopVectorizePass Impl;
1425 
1426  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1427  bool VectorizeOnlyWhenForced = false)
1428  : FunctionPass(ID) {
1429  Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1430  Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1432  }
1433 
1434  bool runOnFunction(Function &F) override {
1435  if (skipFunction(F))
1436  return false;
1437 
1438  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1439  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1440  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1441  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1442  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1443  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1444  auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1445  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1446  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1447  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1448  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1449  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1450 
1451  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1452  [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1453 
1454  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1455  GetLAA, *ORE);
1456  }
1457 
1458  void getAnalysisUsage(AnalysisUsage &AU) const override {
1459  AU.addRequired<AssumptionCacheTracker>();
1460  AU.addRequired<BlockFrequencyInfoWrapperPass>();
1461  AU.addRequired<DominatorTreeWrapperPass>();
1462  AU.addRequired<LoopInfoWrapperPass>();
1463  AU.addRequired<ScalarEvolutionWrapperPass>();
1464  AU.addRequired<TargetTransformInfoWrapperPass>();
1465  AU.addRequired<AAResultsWrapperPass>();
1466  AU.addRequired<LoopAccessLegacyAnalysis>();
1467  AU.addRequired<DemandedBitsWrapperPass>();
1468  AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1469 
1470  // We currently do not preserve loopinfo/dominator analyses with outer loop
1471  // vectorization. Until this is addressed, mark these analyses as preserved
1472  // only for non-VPlan-native path.
1473  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1474  if (!EnableVPlanNativePath) {
1475  AU.addPreserved<LoopInfoWrapperPass>();
1476  AU.addPreserved<DominatorTreeWrapperPass>();
1477  }
1478 
1479  AU.addPreserved<BasicAAWrapperPass>();
1480  AU.addPreserved<GlobalsAAWrapperPass>();
1481  }
1482 };
1483 
1484 } // end anonymous namespace
1485 
1486 //===----------------------------------------------------------------------===//
1487 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1488 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1489 //===----------------------------------------------------------------------===//
1490 
1491 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1492  // We need to place the broadcast of invariant variables outside the loop,
1493  // but only if it's proven safe to do so. Else, broadcast will be inside
1494  // vector loop body.
1495  Instruction *Instr = dyn_cast<Instruction>(V);
1496  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1497  (!Instr ||
1498  DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1499  // Place the code for broadcasting invariant variables in the new preheader.
1500  IRBuilder<>::InsertPointGuard Guard(Builder);
1501  if (SafeToHoist)
1502  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1503 
1504  // Broadcast the scalar into all locations in the vector.
1505  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1506 
1507  return Shuf;
1508 }
1509 
1510 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1511  const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1512  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1513  "Expected either an induction phi-node or a truncate of it!");
1514  Value *Start = II.getStartValue();
1515 
1516  // Construct the initial value of the vector IV in the vector loop preheader
1517  auto CurrIP = Builder.saveIP();
1518  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1519  if (isa<TruncInst>(EntryVal)) {
1520  assert(Start->getType()->isIntegerTy() &&
1521  "Truncation requires an integer type");
1522  auto *TruncType = cast<IntegerType>(EntryVal->getType());
1523  Step = Builder.CreateTrunc(Step, TruncType);
1524  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1525  }
1526  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1527  Value *SteppedStart =
1528  getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1529 
1530  // We create vector phi nodes for both integer and floating-point induction
1531  // variables. Here, we determine the kind of arithmetic we will perform.
1532  Instruction::BinaryOps AddOp;
1533  Instruction::BinaryOps MulOp;
1534  if (Step->getType()->isIntegerTy()) {
1535  AddOp = Instruction::Add;
1536  MulOp = Instruction::Mul;
1537  } else {
1538  AddOp = II.getInductionOpcode();
1539  MulOp = Instruction::FMul;
1540  }
1541 
1542  // Multiply the vectorization factor by the step using integer or
1543  // floating-point arithmetic as appropriate.
1544  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1545  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1546 
1547  // Create a vector splat to use in the induction update.
1548  //
1549  // FIXME: If the step is non-constant, we create the vector splat with
1550  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1551  // handle a constant vector splat.
1552  Value *SplatVF = isa<Constant>(Mul)
1553  ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1554  : Builder.CreateVectorSplat(VF, Mul);
1555  Builder.restoreIP(CurrIP);
1556 
1557  // We may need to add the step a number of times, depending on the unroll
1558  // factor. The last of those goes into the PHI.
1559  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1560  &*LoopVectorBody->getFirstInsertionPt());
1561  VecInd->setDebugLoc(EntryVal->getDebugLoc());
1562  Instruction *LastInduction = VecInd;
1563  for (unsigned Part = 0; Part < UF; ++Part) {
1564  VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1565 
1566  if (isa<TruncInst>(EntryVal))
1567  addMetadata(LastInduction, EntryVal);
1568  recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1569 
1570  LastInduction = cast<Instruction>(addFastMathFlag(
1571  Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1572  LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1573  }
1574 
1575  // Move the last step to the end of the latch block. This ensures consistent
1576  // placement of all induction updates.
1577  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1578  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1579  auto *ICmp = cast<Instruction>(Br->getCondition());
1580  LastInduction->moveBefore(ICmp);
1581  LastInduction->setName("vec.ind.next");
1582 
1583  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1584  VecInd->addIncoming(LastInduction, LoopVectorLatch);
1585 }
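// As an illustration (the IR names below are only indicative), for an i32
// induction with step 1, VF = 4 and UF = 2 the code above produces roughly:
//
//   vector.body:
//     %vec.ind      = phi <4 x i32> [ <0, 1, 2, 3>, %vector.ph ],
//                                   [ %vec.ind.next, %vector.body ]
//     ...                                             ; part 0 uses %vec.ind
//     %step.add     = add <4 x i32> %vec.ind, <4, 4, 4, 4>       ; part 1
//     %vec.ind.next = add <4 x i32> %step.add, <4, 4, 4, 4>
//
// i.e. each unroll part adds a splat of VF * Step to the previous part, and
// the last add is moved to the latch and fed back into the phi.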
1586 
1587 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1588  return Cost->isScalarAfterVectorization(I, VF) ||
1589  Cost->isProfitableToScalarize(I, VF);
1590 }
1591 
1592 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1593  if (shouldScalarizeInstruction(IV))
1594  return true;
1595  auto isScalarInst = [&](User *U) -> bool {
1596  auto *I = cast<Instruction>(U);
1597  return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1598  };
1599  return llvm::any_of(IV->users(), isScalarInst);
1600 }
1601 
1602 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1603  const InductionDescriptor &ID, const Instruction *EntryVal,
1604  Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1605  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1606  "Expected either an induction phi-node or a truncate of it!");
1607 
1608  // This induction variable is not the phi from the original loop but the
1609  // newly-created IV based on the proof that casted Phi is equal to the
1610  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1611  // re-uses the same InductionDescriptor that original IV uses but we don't
1612  // have to do any recording in this case - that is done when original IV is
1613  // processed.
1614  if (isa<TruncInst>(EntryVal))
1615  return;
1616 
1617  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1618  if (Casts.empty())
1619  return;
1620  // Only the first Cast instruction in the Casts vector is of interest.
1621  // The rest of the Casts (if exist) have no uses outside the
1622  // induction update chain itself.
1623  Instruction *CastInst = *Casts.begin();
1624  if (Lane < UINT_MAX)
1625  VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1626  else
1627  VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1628 }
1629 
1630 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1631  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1632  "Primary induction variable must have an integer type");
1633 
1634  auto II = Legal->getInductionVars()->find(IV);
1635  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1636 
1637  auto ID = II->second;
1638  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1639 
1640  // The scalar value to broadcast. This will be derived from the canonical
1641  // induction variable.
1642  Value *ScalarIV = nullptr;
1643 
1644  // The value from the original loop to which we are mapping the new induction
1645  // variable.
1646  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1647 
1648  // True if we have vectorized the induction variable.
1649  auto VectorizedIV = false;
1650 
1651  // Determine if we want a scalar version of the induction variable. This is
1652  // true if the induction variable itself is not widened, or if it has at
1653  // least one user in the loop that is not widened.
1654  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1655 
1656  // Generate code for the induction step. Note that induction steps are
1657  // required to be loop-invariant
1658  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1659  "Induction step should be loop invariant");
1660  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1661  Value *Step = nullptr;
1662  if (PSE.getSE()->isSCEVable(IV->getType())) {
1663  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1664  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1665  &*Builder.GetInsertPoint());
1666  } else {
1667  Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1668  }
1669 
1670  // Try to create a new independent vector induction variable. If we can't
1671  // create the phi node, we will splat the scalar induction variable in each
1672  // loop iteration.
1673  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1674  createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1675  VectorizedIV = true;
1676  }
1677 
1678  // If we haven't yet vectorized the induction variable, or if we will create
1679  // a scalar one, we need to define the scalar induction variable and step
1680  // values. If we were given a truncation type, truncate the canonical
1681  // induction variable and step. Otherwise, derive these values from the
1682  // induction descriptor.
1683  if (!VectorizedIV || NeedsScalarIV) {
1684  ScalarIV = Induction;
1685  if (IV != OldInduction) {
1686  ScalarIV = IV->getType()->isIntegerTy()
1687  ? Induction
1688  : Builder.CreateCast(Instruction::SIToFP, Induction,
1689  IV->getType());
1690  ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1691  ScalarIV->setName("offset.idx");
1692  }
1693  if (Trunc) {
1694  auto *TruncType = cast<IntegerType>(Trunc->getType());
1695  assert(Step->getType()->isIntegerTy() &&
1696  "Truncation requires an integer step");
1697  ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1698  Step = Builder.CreateTrunc(Step, TruncType);
1699  }
1700  }
1701 
1702  // If we haven't yet vectorized the induction variable, splat the scalar
1703  // induction variable, and build the necessary step vectors.
1704  // TODO: Don't do it unless the vectorized IV is really required.
1705  if (!VectorizedIV) {
1706  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1707  for (unsigned Part = 0; Part < UF; ++Part) {
1708  Value *EntryPart =
1709  getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1710  VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1711  if (Trunc)
1712  addMetadata(EntryPart, Trunc);
1713  recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1714  }
1715  }
1716 
1717  // If an induction variable is only used for counting loop iterations or
1718  // calculating addresses, it doesn't need to be widened. Create scalar steps
1719  // that can be used by instructions we will later scalarize. Note that the
1720  // addition of the scalar steps will not increase the number of instructions
1721  // in the loop in the common case prior to InstCombine. We will be trading
1722  // one vector extract for each scalar step.
1723  if (NeedsScalarIV)
1724  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1725 }
1726 
1727 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1728  Instruction::BinaryOps BinOp) {
1729  // Create and check the types.
1730  assert(Val->getType()->isVectorTy() && "Must be a vector");
1731  int VLen = Val->getType()->getVectorNumElements();
1732 
1733  Type *STy = Val->getType()->getScalarType();
1734  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1735  "Induction Step must be an integer or FP");
1736  assert(Step->getType() == STy && "Step has wrong type");
1737 
1738  SmallVector<Constant *, 8> Indices;
1739 
1740  if (STy->isIntegerTy()) {
1741  // Create a vector of consecutive numbers from zero to VF.
1742  for (int i = 0; i < VLen; ++i)
1743  Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1744 
1745  // Add the consecutive indices to the vector value.
1746  Constant *Cv = ConstantVector::get(Indices);
1747  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1748  Step = Builder.CreateVectorSplat(VLen, Step);
1749  assert(Step->getType() == Val->getType() && "Invalid step vec");
1750  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1751  // which can be found from the original scalar operations.
1752  Step = Builder.CreateMul(Cv, Step);
1753  return Builder.CreateAdd(Val, Step, "induction");
1754  }
1755 
1756  // Floating point induction.
1757  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1758  "Binary Opcode should be specified for FP induction");
1759  // Create a vector of consecutive numbers from zero to VF.
1760  for (int i = 0; i < VLen; ++i)
1761  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1762 
1763  // Add the consecutive indices to the vector value.
1764  Constant *Cv = ConstantVector::get(Indices);
1765 
1766  Step = Builder.CreateVectorSplat(VLen, Step);
1767 
1768  // Floating point operations had to be 'fast' to enable the induction.
1769  FastMathFlags Flags;
1770  Flags.setFast();
1771 
1772  Value *MulOp = Builder.CreateFMul(Cv, Step);
1773  if (isa<Instruction>(MulOp))
1774  // Have to check, MulOp may be a constant
1775  cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1776 
1777  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1778  if (isa<Instruction>(BOp))
1779  cast<Instruction>(BOp)->setFastMathFlags(Flags);
1780  return BOp;
1781 }
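// For example, assuming VF = 4, a broadcast input Val = <v, v, v, v>,
// StartIdx = 4 and an integer step S, the sequence built above is roughly:
//
//   %cv   = <4, 5, 6, 7>              ; consecutive indices from StartIdx
//   %step = <S, S, S, S>              ; splat of the step
//   %mul  = mul <4 x i32> %cv, %step
//   %induction = add <4 x i32> %val, %mul
//
// so lane i holds v + (StartIdx + i) * S. The FP path has the same shape but
// uses fmul and the recorded FAdd/FSub opcode with fast-math flags set.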
1782 
1783 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1784  Instruction *EntryVal,
1785  const InductionDescriptor &ID) {
1786  // We shouldn't have to build scalar steps if we aren't vectorizing.
1787  assert(VF > 1 && "VF should be greater than one");
1788 
1789  // Get the value type and ensure it and the step have the same integer type.
1790  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1791  assert(ScalarIVTy == Step->getType() &&
1792  "Val and Step should have the same type");
1793 
1794  // We build scalar steps for both integer and floating-point induction
1795  // variables. Here, we determine the kind of arithmetic we will perform.
1796  Instruction::BinaryOps AddOp;
1797  Instruction::BinaryOps MulOp;
1798  if (ScalarIVTy->isIntegerTy()) {
1799  AddOp = Instruction::Add;
1800  MulOp = Instruction::Mul;
1801  } else {
1802  AddOp = ID.getInductionOpcode();
1803  MulOp = Instruction::FMul;
1804  }
1805 
1806  // Determine the number of scalars we need to generate for each unroll
1807  // iteration. If EntryVal is uniform, we only need to generate the first
1808  // lane. Otherwise, we generate all VF values.
1809  unsigned Lanes =
1810  Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1811  : VF;
1812  // Compute the scalar steps and save the results in VectorLoopValueMap.
1813  for (unsigned Part = 0; Part < UF; ++Part) {
1814  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1815  auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1816  auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1817  auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1818  VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1819  recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1820  }
1821  }
1822 }
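// A small worked example, assuming VF = 4, UF = 2 and a non-uniform EntryVal:
// the loop above emits, for each part P in {0, 1} and lane L in {0..3},
//
//   scalar.step(P, L) = ScalarIV + (VF * P + L) * Step
//
// i.e. eight scalar adds per widened iteration (offsets 0*Step .. 7*Step),
// which later-scalarized users pick up from VectorLoopValueMap instead of
// paying for a vector extract.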
1823 
1824 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1825  assert(V != Induction && "The new induction variable should not be used.");
1826  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1827  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1828 
1829  // If we have a stride that is replaced by one, do it here. Defer this for
1830  // the VPlan-native path until we start running Legal checks in that path.
1831  if (!EnableVPlanNativePath && Legal->hasStride(V))
1832  V = ConstantInt::get(V->getType(), 1);
1833 
1834  // If we have a vector mapped to this value, return it.
1835  if (VectorLoopValueMap.hasVectorValue(V, Part))
1836  return VectorLoopValueMap.getVectorValue(V, Part);
1837 
1838  // If the value has not been vectorized, check if it has been scalarized
1839  // instead. If it has been scalarized, and we actually need the value in
1840  // vector form, we will construct the vector values on demand.
1841  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1842  Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1843 
1844  // If we've scalarized a value, that value should be an instruction.
1845  auto *I = cast<Instruction>(V);
1846 
1847  // If we aren't vectorizing, we can just copy the scalar map values over to
1848  // the vector map.
1849  if (VF == 1) {
1850  VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1851  return ScalarValue;
1852  }
1853 
1854  // Get the last scalar instruction we generated for V and Part. If the value
1855  // is known to be uniform after vectorization, this corresponds to lane zero
1856  // of the Part unroll iteration. Otherwise, the last instruction is the one
1857  // we created for the last vector lane of the Part unroll iteration.
1858  unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1859  auto *LastInst = cast<Instruction>(
1860  VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1861 
1862  // Set the insert point after the last scalarized instruction. This ensures
1863  // the insertelement sequence will directly follow the scalar definitions.
1864  auto OldIP = Builder.saveIP();
1865  auto NewIP = std::next(BasicBlock::iterator(LastInst));
1866  Builder.SetInsertPoint(&*NewIP);
1867 
1868  // However, if we are vectorizing, we need to construct the vector values.
1869  // If the value is known to be uniform after vectorization, we can just
1870  // broadcast the scalar value corresponding to lane zero for each unroll
1871  // iteration. Otherwise, we construct the vector values using insertelement
1872  // instructions. Since the resulting vectors are stored in
1873  // VectorLoopValueMap, we will only generate the insertelements once.
1874  Value *VectorValue = nullptr;
1875  if (Cost->isUniformAfterVectorization(I, VF)) {
1876  VectorValue = getBroadcastInstrs(ScalarValue);
1877  VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
1878  } else {
1879  // Initialize packing with insertelements to start from undef.
1880  Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
1881  VectorLoopValueMap.setVectorValue(V, Part, Undef);
1882  for (unsigned Lane = 0; Lane < VF; ++Lane)
1883  packScalarIntoVectorValue(V, {Part, Lane});
1884  VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
1885  }
1886  Builder.restoreIP(OldIP);
1887  return VectorValue;
1888  }
1889 
1890  // If this scalar is unknown, assume that it is a constant or that it is
1891  // loop invariant. Broadcast V and save the value for future uses.
1892  Value *B = getBroadcastInstrs(V);
1893  VectorLoopValueMap.setVectorValue(V, Part, B);
1894  return B;
1895 }
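// For instance, suppose V was scalarized with VF = 4 and is not uniform after
// vectorization: the packing path above effectively materializes, right after
// the last scalar definition, something like
//
//   %pack0 = insertelement <4 x i32> undef,  %v0, i32 0
//   %pack1 = insertelement <4 x i32> %pack0, %v1, i32 1
//   %pack2 = insertelement <4 x i32> %pack1, %v2, i32 2
//   %vec   = insertelement <4 x i32> %pack2, %v3, i32 3
//
// whereas a value known to be uniform is simply re-broadcast from its
// lane-zero scalar.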
1896 
1897 Value *
1898 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
1899  const VPIteration &Instance) {
1900  // If the value is not an instruction contained in the loop, it should
1901  // already be scalar.
1902  if (OrigLoop->isLoopInvariant(V))
1903  return V;
1904 
1905  assert(Instance.Lane > 0
1906  ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
1907  : true && "Uniform values only have lane zero");
1908 
1909  // If the value from the original loop has not been vectorized, it is
1910  // represented by UF x VF scalar values in the new loop. Return the requested
1911  // scalar value.
1912  if (VectorLoopValueMap.hasScalarValue(V, Instance))
1913  return VectorLoopValueMap.getScalarValue(V, Instance);
1914 
1915  // If the value has not been scalarized, get its entry in VectorLoopValueMap
1916  // for the given unroll part. If this entry is not a vector type (i.e., the
1917  // vectorization factor is one), there is no need to generate an
1918  // extractelement instruction.
1919  auto *U = getOrCreateVectorValue(V, Instance.Part);
1920  if (!U->getType()->isVectorTy()) {
1921  assert(VF == 1 && "Value not scalarized has non-vector type");
1922  return U;
1923  }
1924 
1925  // Otherwise, the value from the original loop has been vectorized and is
1926  // represented by UF vector values. Extract and return the requested scalar
1927  // value from the appropriate vector lane.
1928  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
1929 }
1930 
1931 void InnerLoopVectorizer::packScalarIntoVectorValue(
1932  Value *V, const VPIteration &Instance) {
1933  assert(V != Induction && "The new induction variable should not be used.");
1934  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
1935  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1936 
1937  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
1938  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
1939  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
1940  Builder.getInt32(Instance.Lane));
1941  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
1942 }
1943 
1944 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
1945  assert(Vec->getType()->isVectorTy() && "Invalid type");
1946  SmallVector<Constant *, 8> ShuffleMask;
1947  for (unsigned i = 0; i < VF; ++i)
1948  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
1949 
1950  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
1951  ConstantVector::get(ShuffleMask),
1952  "reverse");
1953 }
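// E.g. for VF = 4 the mask built above is <3, 2, 1, 0>, so
//   reverseVector(<a, b, c, d>)  ==>  <d, c, b, a>
// which is what the reverse-consecutive memory access paths below rely on.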
1954 
1955 // Return whether we allow using masked interleave-groups (for dealing with
1956 // strided loads/stores that reside in predicated blocks, or for dealing
1957 // with gaps).
1958 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
1959  // If an override option has been passed in for interleaved accesses, use it.
1960  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
1961  return EnableMaskedInterleavedMemAccesses;
1962 
1963  return TTI.enableMaskedInterleavedAccessVectorization();
1964 }
1965 
1966 // Try to vectorize the interleave group that \p Instr belongs to.
1967 //
1968 // E.g. Translate following interleaved load group (factor = 3):
1969 // for (i = 0; i < N; i+=3) {
1970 // R = Pic[i]; // Member of index 0
1971 // G = Pic[i+1]; // Member of index 1
1972 // B = Pic[i+2]; // Member of index 2
1973 // ... // do something to R, G, B
1974 // }
1975 // To:
1976 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
1977 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
1978 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
1979 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
1980 //
1981 // Or translate following interleaved store group (factor = 3):
1982 // for (i = 0; i < N; i+=3) {
1983 // ... do something to R, G, B
1984 // Pic[i] = R; // Member of index 0
1985 // Pic[i+1] = G; // Member of index 1
1986 // Pic[i+2] = B; // Member of index 2
1987 // }
1988 // To:
1989 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
1990 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
1991 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
1992 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
1993 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
1994 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
1995  VectorParts *BlockInMask) {
1996  const InterleaveGroup<Instruction> *Group =
1997  Cost->getInterleavedAccessGroup(Instr);
1998  assert(Group && "Fail to get an interleaved access group.");
1999 
2000  // Skip if current instruction is not the insert position.
2001  if (Instr != Group->getInsertPos())
2002  return;
2003 
2004  const DataLayout &DL = Instr->getModule()->getDataLayout();
2005  Value *Ptr = getLoadStorePointerOperand(Instr);
2006 
2007  // Prepare for the vector type of the interleaved load/store.
2008  Type *ScalarTy = getMemInstValueType(Instr);
2009  unsigned InterleaveFactor = Group->getFactor();
2010  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2011  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2012 
2013  // Prepare for the new pointers.
2014  setDebugLocFromInst(Builder, Ptr);
2015  SmallVector<Value *, 2> NewPtrs;
2016  unsigned Index = Group->getIndex(Instr);
2017 
2018  VectorParts Mask;
2019  bool IsMaskForCondRequired = BlockInMask;
2020  if (IsMaskForCondRequired) {
2021  Mask = *BlockInMask;
2022  // TODO: extend the masked interleaved-group support to reversed access.
2023  assert(!Group->isReverse() && "Reversed masked interleave-group "
2024  "not supported.");
2025  }
2026 
2027  // If the group is reverse, adjust the index to refer to the last vector lane
2028  // instead of the first. We adjust the index from the first vector lane,
2029  // rather than directly getting the pointer for lane VF - 1, because the
2030  // pointer operand of the interleaved access is supposed to be uniform. For
2031  // uniform instructions, we're only required to generate a value for the
2032  // first vector lane in each unroll iteration.
2033  if (Group->isReverse())
2034  Index += (VF - 1) * Group->getFactor();
2035 
2036  bool InBounds = false;
2037  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2038  InBounds = gep->isInBounds();
2039 
2040  for (unsigned Part = 0; Part < UF; Part++) {
2041  Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2042 
2043  // Note that the current instruction could be at any index in the group. We
2044  // need to adjust the address so that it points to the member of index 0.
2045  //
2046  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2047  // b = A[i]; // Member of index 0
2048  // Current pointer is pointed to A[i+1], adjust it to A[i].
2049  //
2050  // E.g. A[i+1] = a; // Member of index 1
2051  // A[i] = b; // Member of index 0
2052  // A[i+2] = c; // Member of index 2 (Current instruction)
2053  // Current pointer is pointed to A[i+2], adjust it to A[i].
2054  NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
2055  if (InBounds)
2056  cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2057 
2058  // Cast to the vector pointer type.
2059  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2060  }
2061 
2062  setDebugLocFromInst(Builder, Instr);
2063  Value *UndefVec = UndefValue::get(VecTy);
2064 
2065  Value *MaskForGaps = nullptr;
2066  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2067  MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2068  assert(MaskForGaps && "Mask for Gaps is required but it is null");
2069  }
2070 
2071  // Vectorize the interleaved load group.
2072  if (isa<LoadInst>(Instr)) {
2073  // For each unroll part, create a wide load for the group.
2074  SmallVector<Value *, 2> NewLoads;
2075  for (unsigned Part = 0; Part < UF; Part++) {
2076  Instruction *NewLoad;
2077  if (IsMaskForCondRequired || MaskForGaps) {
2078  assert(useMaskedInterleavedAccesses(*TTI) &&
2079  "masked interleaved groups are not allowed.");
2080  Value *GroupMask = MaskForGaps;
2081  if (IsMaskForCondRequired) {
2082  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2083  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2084  Value *ShuffledMask = Builder.CreateShuffleVector(
2085  Mask[Part], Undefs, RepMask, "interleaved.mask");
2086  GroupMask = MaskForGaps
2087  ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2088  MaskForGaps)
2089  : ShuffledMask;
2090  }
2091  NewLoad =
2092  Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2093  GroupMask, UndefVec, "wide.masked.vec");
2094  }
2095  else
2096  NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part],
2097  Group->getAlignment(), "wide.vec");
2098  Group->addMetadata(NewLoad);
2099  NewLoads.push_back(NewLoad);
2100  }
2101 
2102  // For each member in the group, shuffle out the appropriate data from the
2103  // wide loads.
2104  for (unsigned I = 0; I < InterleaveFactor; ++I) {
2105  Instruction *Member = Group->getMember(I);
2106 
2107  // Skip the gaps in the group.
2108  if (!Member)
2109  continue;
2110 
2111  Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2112  for (unsigned Part = 0; Part < UF; Part++) {
2113  Value *StridedVec = Builder.CreateShuffleVector(
2114  NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2115 
2116  // If this member has a different type, cast the result to that type.
2117  if (Member->getType() != ScalarTy) {
2118  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2119  StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2120  }
2121 
2122  if (Group->isReverse())
2123  StridedVec = reverseVector(StridedVec);
2124 
2125  VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2126  }
2127  }
2128  return;
2129  }
2130 
2131  // The sub vector type for current instruction.
2132  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2133 
2134  // Vectorize the interleaved store group.
2135  for (unsigned Part = 0; Part < UF; Part++) {
2136  // Collect the stored vector from each member.
2137  SmallVector<Value *, 4> StoredVecs;
2138  for (unsigned i = 0; i < InterleaveFactor; i++) {
2139  // Interleaved store group doesn't allow a gap, so each index has a member
2140  Instruction *Member = Group->getMember(i);
2141  assert(Member && "Fail to get a member from an interleaved store group");
2142 
2143  Value *StoredVec = getOrCreateVectorValue(
2144  cast<StoreInst>(Member)->getValueOperand(), Part);
2145  if (Group->isReverse())
2146  StoredVec = reverseVector(StoredVec);
2147 
2148  // If this member has a different type, cast it to the unified type.
2149 
2150  if (StoredVec->getType() != SubVT)
2151  StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2152 
2153  StoredVecs.push_back(StoredVec);
2154  }
2155 
2156  // Concatenate all vectors into a wide vector.
2157  Value *WideVec = concatenateVectors(Builder, StoredVecs);
2158 
2159  // Interleave the elements in the wide vector.
2160  Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2161  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2162  "interleaved.vec");
2163 
2164  Instruction *NewStoreInstr;
2165  if (IsMaskForCondRequired) {
2166  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2167  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2168  Value *ShuffledMask = Builder.CreateShuffleVector(
2169  Mask[Part], Undefs, RepMask, "interleaved.mask");
2170  NewStoreInstr = Builder.CreateMaskedStore(
2171  IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2172  }
2173  else
2174  NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2175  Group->getAlignment());
2176 
2177  Group->addMetadata(NewStoreInstr);
2178  }
2179 }
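// A note on the masked case above (a sketch; the mask names are only
// illustrative): when the group sits in a predicated block with VF = 4 and
// factor = 2, the per-lane condition mask <m0, m1, m2, m3> is replicated per
// member,
//
//   %interleaved.mask = <m0, m0, m1, m1, m2, m2, m3, m3>
//
// and, if the group also has gaps, it is AND'ed with the gap mask so that
// only lanes belonging to real members are loaded or stored.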
2180 
2181 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2182  VectorParts *BlockInMask) {
2183  // Attempt to issue a wide load.
2184  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2185  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2186 
2187  assert((LI || SI) && "Invalid Load/Store instruction");
2188 
2189  LoopVectorizationCostModel::InstWidening Decision =
2190  Cost->getWideningDecision(Instr, VF);
2191  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2192  "CM decision should be taken at this point");
2193  if (Decision == LoopVectorizationCostModel::CM_Interleave)
2194  return vectorizeInterleaveGroup(Instr);
2195 
2196  Type *ScalarDataTy = getMemInstValueType(Instr);
2197  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2198  Value *Ptr = getLoadStorePointerOperand(Instr);
2199  unsigned Alignment = getLoadStoreAlignment(Instr);
2200  // An alignment of 0 means target abi alignment. We need to use the scalar's
2201  // target abi alignment in such a case.
2202  const DataLayout &DL = Instr->getModule()->getDataLayout();
2203  if (!Alignment)
2204  Alignment = DL.getABITypeAlignment(ScalarDataTy);
2205  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2206 
2207  // Determine if the pointer operand of the access is either consecutive or
2208  // reverse consecutive.
2209  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2210  bool ConsecutiveStride =
2211  Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2212  bool CreateGatherScatter =
2213  (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2214 
2215  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2216  // gather/scatter. Otherwise Decision should have been to Scalarize.
2217  assert((ConsecutiveStride || CreateGatherScatter) &&
2218  "The instruction should be scalarized");
2219 
2220  // Handle consecutive loads/stores.
2221  if (ConsecutiveStride)
2222  Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2223 
2224  VectorParts Mask;
2225  bool isMaskRequired = BlockInMask;
2226  if (isMaskRequired)
2227  Mask = *BlockInMask;
2228 
2229  bool InBounds = false;
2230  if (auto *gep = dyn_cast<GetElementPtrInst>(
2231  getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2232  InBounds = gep->isInBounds();
2233 
2234  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2235  // Calculate the pointer for the specific unroll-part.
2236  GetElementPtrInst *PartPtr = nullptr;
2237 
2238  if (Reverse) {
2239  // If the address is consecutive but reversed, then the
2240  // wide store needs to start at the last vector element.
2241  PartPtr = cast<GetElementPtrInst>(
2242  Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)));
2243  PartPtr->setIsInBounds(InBounds);
2244  PartPtr = cast<GetElementPtrInst>(
2245  Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)));
2246  PartPtr->setIsInBounds(InBounds);
2247  if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2248  Mask[Part] = reverseVector(Mask[Part]);
2249  } else {
2250  PartPtr = cast<GetElementPtrInst>(
2251  Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)));
2252  PartPtr->setIsInBounds(InBounds);
2253  }
2254 
2255  return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2256  };
2257 
2258  // Handle Stores:
2259  if (SI) {
2260  setDebugLocFromInst(Builder, SI);
2261 
2262  for (unsigned Part = 0; Part < UF; ++Part) {
2263  Instruction *NewSI = nullptr;
2264  Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2265  if (CreateGatherScatter) {
2266  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2267  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2268  NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2269  MaskPart);
2270  } else {
2271  if (Reverse) {
2272  // If we store to reverse consecutive memory locations, then we need
2273  // to reverse the order of elements in the stored value.
2274  StoredVal = reverseVector(StoredVal);
2275  // We don't want to update the value in the map as it might be used in
2276  // another expression. So don't call resetVectorValue(StoredVal).
2277  }
2278  auto *VecPtr = CreateVecPtr(Part, Ptr);
2279  if (isMaskRequired)
2280  NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2281  Mask[Part]);
2282  else
2283  NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2284  }
2285  addMetadata(NewSI, SI);
2286  }
2287  return;
2288  }
2289 
2290  // Handle loads.
2291  assert(LI && "Must have a load instruction");
2292  setDebugLocFromInst(Builder, LI);
2293  for (unsigned Part = 0; Part < UF; ++Part) {
2294  Value *NewLI;
2295  if (CreateGatherScatter) {
2296  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2297  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2298  NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2299  nullptr, "wide.masked.gather");
2300  addMetadata(NewLI, LI);
2301  } else {
2302  auto *VecPtr = CreateVecPtr(Part, Ptr);
2303  if (isMaskRequired)
2304  NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2305  UndefValue::get(DataTy),
2306  "wide.masked.load");
2307  else
2308  NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
2309 
2310  // Add metadata to the load, but setVectorValue to the reverse shuffle.
2311  addMetadata(NewLI, LI);
2312  if (Reverse)
2313  NewLI = reverseVector(NewLI);
2314  }
2315  VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2316  }
2317 }
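// To make the consecutive/reverse addressing above concrete: with VF = 4 and
// a consecutive base pointer %p, part P of a forward access starts at
// %p + 4 * P, while a reverse access starts part P at %p - 4 * P - 3 (the two
// GEPs apply -Part * VF and then 1 - VF) and the loaded vector is reversed,
// so that each lane again lines up with the scalar iteration it replaces.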
2318 
2319 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2320  const VPIteration &Instance,
2321  bool IfPredicateInstr) {
2322  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2323 
2324  setDebugLocFromInst(Builder, Instr);
2325 
2326  // Does this instruction return a value ?
2327  bool IsVoidRetTy = Instr->getType()->isVoidTy();
2328 
2329  Instruction *Cloned = Instr->clone();
2330  if (!IsVoidRetTy)
2331  Cloned->setName(Instr->getName() + ".cloned");
2332 
2333  // Replace the operands of the cloned instructions with their scalar
2334  // equivalents in the new loop.
2335  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2336  auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2337  Cloned->setOperand(op, NewOp);
2338  }
2339  addNewMetadata(Cloned, Instr);
2340 
2341  // Place the cloned scalar in the new loop.
2342  Builder.Insert(Cloned);
2343 
2344  // Add the cloned scalar to the scalar map entry.
2345  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2346 
2347  // If we just cloned a new assumption, add it the assumption cache.
2348  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2349  if (II->getIntrinsicID() == Intrinsic::assume)
2350  AC->registerAssumption(II);
2351 
2352  // End if-block.
2353  if (IfPredicateInstr)
2354  PredicatedInstructions.push_back(Cloned);
2355 }
2356 
2357 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2358  Value *End, Value *Step,
2359  Instruction *DL) {
2360  BasicBlock *Header = L->getHeader();
2361  BasicBlock *Latch = L->getLoopLatch();
2362  // As we're just creating this loop, it's possible no latch exists
2363  // yet. If so, use the header as this will be a single block loop.
2364  if (!Latch)
2365  Latch = Header;
2366 
2367  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2368  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2369  setDebugLocFromInst(Builder, OldInst);
2370  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2371 
2372  Builder.SetInsertPoint(Latch->getTerminator());
2373  setDebugLocFromInst(Builder, OldInst);
2374 
2375  // Create i+1 and fill the PHINode.
2376  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2377  Induction->addIncoming(Start, L->getLoopPreheader());
2378  Induction->addIncoming(Next, Latch);
2379  // Create the compare.
2380  Value *ICmp = Builder.CreateICmpEQ(Next, End);
2381  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2382 
2383  // Now we have two terminators. Remove the old one from the block.
2384  Latch->getTerminator()->eraseFromParent();
2385 
2386  return Induction;
2387 }
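// The generated control skeleton for the new canonical IV is, schematically
// (block and value names are illustrative):
//
//   header:
//     %index      = phi [ %Start, %preheader ], [ %index.next, %latch ]
//     ...
//   latch:
//     %index.next = add %index, %Step
//     %cmp        = icmp eq %index.next, %End
//     br %cmp, %exit, %header
//
// For the vector loop, Step is VF * UF.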
2388 
2389 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2390  if (TripCount)
2391  return TripCount;
2392 
2393  assert(L && "Create Trip Count for null loop.");
2394  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2395  // Find the loop boundaries.
2396  ScalarEvolution *SE = PSE.getSE();
2397  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2398  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2399  "Invalid loop count");
2400 
2401  Type *IdxTy = Legal->getWidestInductionType();
2402  assert(IdxTy && "No type for induction");
2403 
2404  // The exit count might have the type of i64 while the phi is i32. This can
2405  // happen if we have an induction variable that is sign extended before the
2406  // compare. The only way that we get a backedge taken count is that the
2407  // induction variable was signed and as such will not overflow. In such a case
2408  // truncation is legal.
2409  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2410  IdxTy->getPrimitiveSizeInBits())
2411  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2412  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2413 
2414  // Get the total trip count from the count by adding 1.
2415  const SCEV *ExitCount = SE->getAddExpr(
2416  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2417 
2418  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2419 
2420  // Expand the trip count and place the new instructions in the preheader.
2421  // Notice that the pre-header does not change, only the loop body.
2422  SCEVExpander Exp(*SE, DL, "induction");
2423 
2424  // Count holds the overall loop count (N).
2425  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2426  L->getLoopPreheader()->getTerminator());
2427 
2428  if (TripCount->getType()->isPointerTy())
2429  TripCount =
2430  CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2431  L->getLoopPreheader()->getTerminator());
2432 
2433  return TripCount;
2434 }
2435 
2436 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2437  if (VectorTripCount)
2438  return VectorTripCount;
2439 
2440  Value *TC = getOrCreateTripCount(L);
2441  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2442 
2443  Type *Ty = TC->getType();
2444  Constant *Step = ConstantInt::get(Ty, VF * UF);
2445 
2446  // If the tail is to be folded by masking, round the number of iterations N
2447  // up to a multiple of Step instead of rounding down. This is done by first
2448  // adding Step-1 and then rounding down. Note that it's ok if this addition
2449  // overflows: the vector induction variable will eventually wrap to zero given
2450  // that it starts at zero and its Step is a power of two; the loop will then
2451  // exit, with the last early-exit vector comparison also producing all-true.
2452  if (Cost->foldTailByMasking()) {
2453  assert(isPowerOf2_32(VF * UF) &&
2454  "VF*UF must be a power of 2 when folding tail by masking");
2455  TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2456  }
2457 
2458  // Now we need to generate the expression for the part of the loop that the
2459  // vectorized body will execute. This is equal to N - (N % Step) if scalar
2460  // iterations are not required for correctness, or N - Step, otherwise. Step
2461  // is equal to the vectorization factor (number of SIMD elements) times the
2462  // unroll factor (number of SIMD instructions).
2463  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2464 
2465  // If there is a non-reversed interleaved group that may speculatively access
2466  // memory out-of-bounds, we need to ensure that there will be at least one
2467  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2468  // the trip count, we set the remainder to be equal to the step. If the step
2469  // does not evenly divide the trip count, no adjustment is necessary since
2470  // there will already be scalar iterations. Note that the minimum iterations
2471  // check ensures that N >= Step.
2472  if (VF > 1 && Cost->requiresScalarEpilogue()) {
2473  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2474  R = Builder.CreateSelect(IsZero, Step, R);
2475  }
2476 
2477  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2478 
2479  return VectorTripCount;
2480 }
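// A minimal standalone sketch of the rounding performed above (the names here
// are illustrative and not part of this pass):
//
//   uint64_t vectorTripCount(uint64_t N, uint64_t Step, // Step = VF * UF
//                            bool FoldTail, bool NeedScalarIters) {
//     if (FoldTail)                    // round N *up* to a multiple of Step
//       N += Step - 1;
//     uint64_t Rem = N % Step;         // iterations left for the scalar loop
//     if (NeedScalarIters && Rem == 0)
//       Rem = Step;                    // force at least one scalar iteration
//     return N - Rem;                  // iterations run by the vector loop
//   }
//
// E.g. N = 10, Step = 4 gives a vector trip count of 8 and a 2-iteration
// scalar remainder; with a group that requires a scalar epilogue, N = 12
// still leaves Rem = 4 so the epilogue is guaranteed to run.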
2481 
2483  const DataLayout &DL) {
2484  // Verify that V is a vector type with same number of elements as DstVTy.
2485  unsigned VF = DstVTy->getNumElements();
2486  VectorType *SrcVecTy = cast<VectorType>(V->getType());
2487  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2488  Type *SrcElemTy = SrcVecTy->getElementType();
2489  Type *DstElemTy = DstVTy->getElementType();
2490  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2491  "Vector elements must have same size");
2492 
2493  // Do a direct cast if element types are castable.
2494  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2495  return Builder.CreateBitOrPointerCast(V, DstVTy);
2496  }
2497  // V cannot be directly casted to desired vector type.
2498  // May happen when V is a floating point vector but DstVTy is a vector of
2499  // pointers or vice-versa. Handle this using a two-step bitcast using an
2500  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2501  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2502  "Only one type should be a pointer type");
2503  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2504  "Only one type should be a floating point type");
2505  Type *IntTy =
2507  VectorType *VecIntTy = VectorType::get(IntTy, VF);
2508  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2509  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2510 }
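// For example, on a target with 64-bit pointers a <4 x double> cannot be
// bitcast directly to a <4 x i8*>; the helper above instead goes through an
// integer vector of the same element width, roughly:
//
//   <4 x double>  --bitcast-->  <4 x i64>  --inttoptr-->  <4 x i8*>
//
// (and the mirror-image ptrtoint/bitcast sequence in the other direction).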
2511 
2512 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2513  BasicBlock *Bypass) {
2514  Value *Count = getOrCreateTripCount(L);
2515  BasicBlock *BB = L->getLoopPreheader();
2516  IRBuilder<> Builder(BB->getTerminator());
2517 
2518  // Generate code to check if the loop's trip count is less than VF * UF, or
2519  // equal to it in case a scalar epilogue is required; this implies that the
2520  // vector trip count is zero. This check also covers the case where adding one
2521  // to the backedge-taken count overflowed leading to an incorrect trip count
2522  // of zero. In this case we will also jump to the scalar loop.
2523  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2524  : ICmpInst::ICMP_ULT;
2525 
2526  // If tail is to be folded, vector loop takes care of all iterations.
2527  Value *CheckMinIters = Builder.getFalse();
2528  if (!Cost->foldTailByMasking())
2529  CheckMinIters = Builder.CreateICmp(
2530  P, Count, ConstantInt::get(Count->getType(), VF * UF),
2531  "min.iters.check");
2532 
2533  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2534  // Update dominator tree immediately if the generated block is a
2535  // LoopBypassBlock because SCEV expansions to generate loop bypass
2536  // checks may query it before the current function is finished.
2537  DT->addNewBlock(NewBB, BB);
2538  if (L->getParentLoop())
2539  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2540  ReplaceInstWithInst(BB->getTerminator(),
2541  BranchInst::Create(Bypass, NewBB, CheckMinIters));
2542  LoopBypassBlocks.push_back(BB);
2543 }
2544 
2545 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2546  BasicBlock *BB = L->getLoopPreheader();
2547 
2548  // Generate the code that checks the SCEV assumptions we made.
2549  // We want the new basic block to start at the first instruction in a
2550  // sequence of instructions that form a check.
2551  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2552  "scev.check");
2553  Value *SCEVCheck =
2554  Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2555 
2556  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2557  if (C->isZero())
2558  return;
2559 
2560  assert(!Cost->foldTailByMasking() &&
2561  "Cannot SCEV check stride or overflow when folding tail");
2562  // Create a new block containing the stride check.
2563  BB->setName("vector.scevcheck");
2564  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2565  // Update dominator tree immediately if the generated block is a
2566  // LoopBypassBlock because SCEV expansions to generate loop bypass
2567  // checks may query it before the current function is finished.
2568  DT->addNewBlock(NewBB, BB);
2569  if (L->getParentLoop())
2570  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2571  ReplaceInstWithInst(BB->getTerminator(),
2572  BranchInst::Create(Bypass, NewBB, SCEVCheck));
2573  LoopBypassBlocks.push_back(BB);
2574  AddedSafetyChecks = true;
2575 }
2576 
2577 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2578  // VPlan-native path does not do any analysis for runtime checks currently.
2579  if (EnableVPlanNativePath)
2580  return;
2581 
2582  BasicBlock *BB = L->getLoopPreheader();
2583 
2584  // Generate the code that checks in runtime if arrays overlap. We put the
2585  // checks into a separate block to make the more common case of few elements
2586  // faster.
2587  Instruction *FirstCheckInst;
2588  Instruction *MemRuntimeCheck;
2589  std::tie(FirstCheckInst, MemRuntimeCheck) =
2590  Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2591  if (!MemRuntimeCheck)
2592  return;
2593 
2594  assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
2595  // Create a new block containing the memory check.
2596  BB->setName("vector.memcheck");
2597  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2598  // Update dominator tree immediately if the generated block is a
2599  // LoopBypassBlock because SCEV expansions to generate loop bypass
2600  // checks may query it before the current function is finished.
2601  DT->addNewBlock(NewBB, BB);
2602  if (L->getParentLoop())
2603  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2604  ReplaceInstWithInst(BB->getTerminator(),
2605  BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2606  LoopBypassBlocks.push_back(BB);
2607  AddedSafetyChecks = true;
2608 
2609  // We currently don't use LoopVersioning for the actual loop cloning but we
2610  // still use it to add the noalias metadata.
2611  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2612  PSE.getSE());
2613  LVer->prepareNoAliasMetadata();
2614 }
2615 
2616 Value *InnerLoopVectorizer::emitTransformedIndex(
2617  IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2618  const InductionDescriptor &ID) const {
2619 
2620  SCEVExpander Exp(*SE, DL, "induction");
2621  auto Step = ID.getStep();
2622  auto StartValue = ID.getStartValue();
2623  assert(Index->getType() == Step->getType() &&
2624  "Index type does not match StepValue type");
2625 
2626  // Note: the IR at this point is broken. We cannot use SE to create any new
2627  // SCEV and then expand it, hoping that SCEV's simplification will give us
2628  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2629  // lead to various SCEV crashes. So all we can do is to use builder and rely
2630  // on InstCombine for future simplifications. Here we handle some trivial
2631  // cases only.
2632  auto CreateAdd = [&B](Value *X, Value *Y) {
2633  assert(X->getType() == Y->getType() && "Types don't match!");
2634  if (auto *CX = dyn_cast<ConstantInt>(X))
2635  if (CX->isZero())
2636  return Y;
2637  if (auto *CY = dyn_cast<ConstantInt>(Y))
2638  if (CY->isZero())
2639  return X;
2640  return B.CreateAdd(X, Y);
2641  };
2642 
2643  auto CreateMul = [&B](Value *X, Value *Y) {
2644  assert(X->getType() == Y->getType() && "Types don't match!");
2645  if (auto *CX = dyn_cast<ConstantInt>(X))
2646  if (CX->isOne())
2647  return Y;
2648  if (auto *CY = dyn_cast<ConstantInt>(Y))
2649  if (CY->isOne())
2650  return X;
2651  return B.CreateMul(X, Y);
2652  };
2653 
2654  switch (ID.getKind()) {
2655  case InductionDescriptor::IK_IntInduction: {
2656  assert(Index->getType() == StartValue->getType() &&
2657  "Index type does not match StartValue type");
2658  if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2659  return B.CreateSub(StartValue, Index);
2660  auto *Offset = CreateMul(
2661  Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2662  return CreateAdd(StartValue, Offset);
2663  }
2664  case InductionDescriptor::IK_PtrInduction: {
2665  assert(isa<SCEVConstant>(Step) &&
2666  "Expected constant step for pointer induction");
2667  return B.CreateGEP(
2668  nullptr, StartValue,
2669  CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2670  &*B.GetInsertPoint())));
2671  }
2672  case InductionDescriptor::IK_FpInduction: {
2673  assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2674  auto InductionBinOp = ID.getInductionBinOp();
2675  assert(InductionBinOp &&
2676  (InductionBinOp->getOpcode() == Instruction::FAdd ||
2677  InductionBinOp->getOpcode() == Instruction::FSub) &&
2678  "Original bin op should be defined for FP induction");
2679 
2680  Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2681 
2682  // Floating point operations had to be 'fast' to enable the induction.
2683  FastMathFlags Flags;
2684  Flags.setFast();
2685 
2686  Value *MulExp = B.CreateFMul(StepValue, Index);
2687  if (isa<Instruction>(MulExp))
2688  // We have to check, the MulExp may be a constant.
2689  cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2690 
2691  Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2692  "induction");
2693  if (isa<Instruction>(BOp))
2694  cast<Instruction>(BOp)->setFastMathFlags(Flags);
2695 
2696  return BOp;
2697  }
2698  case InductionDescriptor::IK_NoInduction:
2699  return nullptr;
2700  }
2701  llvm_unreachable("invalid enum");
2702 }
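// Summarizing the three interesting cases above (a rough sketch):
//
//   integer induction:  result = StartValue + Index * Step
//                       (or StartValue - Index when the step is -1)
//   pointer induction:  result = getelementptr StartValue, Index * Step
//   FP induction:       result = StartValue fadd/fsub (Index fmul StepValue)
//
// with the multiplies and adds folded away when Index or Step is the constant
// 0 or 1, since SCEV cannot be used for simplification at this point.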
2703 
2704 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2705  /*
2706  In this function we generate a new loop. The new loop will contain
2707  the vectorized instructions while the old loop will continue to run the
2708  scalar remainder.
2709 
2710  [ ] <-- loop iteration number check.
2711  / |
2712  / v
2713  | [ ] <-- vector loop bypass (may consist of multiple blocks).
2714  | / |
2715  | / v
2716  || [ ] <-- vector pre header.
2717  |/ |
2718  | v
2719  | [ ] \
2720  | [ ]_| <-- vector loop.
2721  | |
2722  | v
2723  | -[ ] <--- middle-block.
2724  | / |
2725  | / v
2726  -|- >[ ] <--- new preheader.
2727  | |
2728  | v
2729  | [ ] \
2730  | [ ]_| <-- old scalar loop to handle remainder.
2731  \ |
2732  \ v
2733  >[ ] <-- exit block.
2734  ...
2735  */
2736 
2737  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2738  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2739  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2740  MDNode *OrigLoopID = OrigLoop->getLoopID();
2741  assert(VectorPH && "Invalid loop structure");
2742  assert(ExitBlock && "Must have an exit block");
2743 
2744  // Some loops have a single integer induction variable, while other loops
2745  // don't. One example is C++ iterators that often have multiple pointer
2746  // induction variables. In the code below we also support a case where we
2747  // don't have a single induction variable.
2748  //
2749  // We try to obtain an induction variable from the original loop as hard
2750  // as possible. However if we don't find one that:
2751  // - is an integer
2752  // - counts from zero, stepping by one
2753  // - is the size of the widest induction variable type
2754  // then we create a new one.
2755  OldInduction = Legal->getPrimaryInduction();
2756  Type *IdxTy = Legal->getWidestInductionType();
2757 
2758  // Split the single block loop into the two loop structure described above.
2759  BasicBlock *VecBody =
2760  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2761  BasicBlock *MiddleBlock =
2762  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2763  BasicBlock *ScalarPH =
2764  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2765 
2766  // Create and register the new vector loop.
2767  Loop *Lp = LI->AllocateLoop();
2768  Loop *ParentLoop = OrigLoop->getParentLoop();
2769 
2770  // Insert the new loop into the loop nest and register the new basic blocks
2771  // before calling any utilities such as SCEV that require valid LoopInfo.
2772  if (ParentLoop) {
2773  ParentLoop->addChildLoop(Lp);
2774  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2775  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2776  } else {
2777  LI->addTopLevelLoop(Lp);
2778  }
2779  Lp->addBasicBlockToLoop(VecBody, *LI);
2780 
2781  // Find the loop boundaries.
2782  Value *Count = getOrCreateTripCount(Lp);
2783 
2784  Value *StartIdx = ConstantInt::get(IdxTy, 0);
2785 
2786  // Now, compare the new count to zero. If it is zero skip the vector loop and
2787  // jump to the scalar loop. This check also covers the case where the
2788  // backedge-taken count is uint##_max: adding one to it will overflow leading
2789  // to an incorrect trip count of zero. In this (rare) case we will also jump
2790  // to the scalar loop.
2791  emitMinimumIterationCountCheck(Lp, ScalarPH);
2792 
2793  // Generate the code to check any assumptions that we've made for SCEV
2794  // expressions.
2795  emitSCEVChecks(Lp, ScalarPH);
2796 
2797  // Generate the code that checks in runtime if arrays overlap. We put the
2798  // checks into a separate block to make the more common case of few elements
2799  // faster.
2800  emitMemRuntimeChecks(Lp, ScalarPH);
2801 
2802  // Generate the induction variable.
2803  // The loop step is equal to the vectorization factor (num of SIMD elements)
2804  // times the unroll factor (num of SIMD instructions).
2805  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2806  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2807  Induction =
2808  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2809  getDebugLocFromInstOrOperands(OldInduction));
2810 
2811  // We are going to resume the execution of the scalar loop.
2812  // Go over all of the induction variables that we found and fix the
2813  // PHIs that are left in the scalar version of the loop.
2814  // The starting values of PHI nodes depend on the counter of the last
2815  // iteration in the vectorized loop.
2816  // If we come from a bypass edge then we need to start from the original
2817  // start value.
2818 
2819  // This variable saves the new starting index for the scalar loop. It is used
2820  // to test if there are any tail iterations left once the vector loop has
2821  // completed.
2822  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2823  for (auto &InductionEntry : *List) {
2824  PHINode *OrigPhi = InductionEntry.first;
2825  InductionDescriptor II = InductionEntry.second;
2826 
2827  // Create phi nodes to merge from the backedge-taken check block.
2828  PHINode *BCResumeVal = PHINode::Create(
2829  OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2830  // Copy original phi DL over to the new one.
2831  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2832  Value *&EndValue = IVEndValues[OrigPhi];
2833  if (OrigPhi == OldInduction) {
2834  // We know what the end value is.
2835  EndValue = CountRoundDown;
2836  } else {
2837  IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2838  Type *StepType = II.getStep()->getType();
2839  Instruction::CastOps CastOp =
2840  CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2841  Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2842  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2843  EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2844  EndValue->setName("ind.end");
2845  }
2846 
2847  // The new PHI merges the original incoming value, in case of a bypass,
2848  // or the value at the end of the vectorized loop.
2849  BCResumeVal->addIncoming(EndValue, MiddleBlock);
2850 
2851  // Fix the scalar body counter (PHI node).
2852  unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
2853 
2854  // The old induction's phi node in the scalar body needs the truncated
2855  // value.
2856  for (BasicBlock *BB : LoopBypassBlocks)
2857  BCResumeVal->addIncoming(II.getStartValue(), BB);
2858  OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
2859  }
2860 
2861  // Add a check in the middle block to see if we have completed
2862  // all of the iterations in the first vector loop.
2863  // If (N - N%VF) == N, then we *don't* need to run the remainder.
2864  // If tail is to be folded, we know we don't need to run the remainder.
2865  Value *CmpN = Builder.getTrue();
2866  if (!Cost->foldTailByMasking())
2867  CmpN =
2868  CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
2869  CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
2870  ReplaceInstWithInst(MiddleBlock->getTerminator(),
2871  BranchInst::Create(ExitBlock, ScalarPH, CmpN));
2872 
2873  // Get ready to start creating new instructions into the vectorized body.
2874  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
2875 
2876  // Save the state.
2877  LoopVectorPreHeader = Lp->getLoopPreheader();
2878  LoopScalarPreHeader = ScalarPH;
2879  LoopMiddleBlock = MiddleBlock;
2880  LoopExitBlock = ExitBlock;
2881  LoopVectorBody = VecBody;
2882  LoopScalarBody = OldBasicBlock;
2883 
2884  Optional<MDNode *> VectorizedLoopID =
2885  makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
2886  LLVMLoopVectorizeFollowupVectorized});
2887  if (VectorizedLoopID.hasValue()) {
2888  Lp->setLoopID(VectorizedLoopID.getValue());
2889 
2890  // Do not setAlreadyVectorized if loop attributes have been defined
2891  // explicitly.
2892  return LoopVectorPreHeader;
2893  }
2894 
2895  // Keep all loop hints from the original loop on the vector loop (we'll
2896  // replace the vectorizer-specific hints below).
2897  if (MDNode *LID = OrigLoop->getLoopID())
2898  Lp->setLoopID(LID);
2899 
2900  LoopVectorizeHints Hints(Lp, true, *ORE);
2901  Hints.setAlreadyVectorized();
2902 
2903  return LoopVectorPreHeader;
2904 }
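// --- Illustrative sketch (added for exposition; not part of the original
// LLVM source). It models, in plain C++, the middle-block check emitted
// above: the vector loop covers CountRoundDown = Count - Count % (VF * UF)
// iterations, and the scalar remainder runs only when that is not the whole
// trip count (and is skipped entirely when the tail is folded by masking).
// The helper name is hypothetical.
static bool exampleNeedsScalarRemainder(unsigned Count, unsigned VF,
                                        unsigned UF) {
  unsigned Step = VF * UF;                        // width of one vector iteration
  unsigned CountRoundDown = Count - Count % Step; // trip count of the vector loop
  // cmp.n: if (N - N%VF) == N there is no tail to run.
  return CountRoundDown != Count;
}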
2905 
2906 // Fix up external users of the induction variable. At this point, we are
2907 // in LCSSA form, with all external PHIs that use the IV having one input value,
2908 // coming from the remainder loop. We need those PHIs to also have a correct
2909 // value for the IV when arriving directly from the middle block.
2910 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2911  const InductionDescriptor &II,
2912  Value *CountRoundDown, Value *EndValue,
2913  BasicBlock *MiddleBlock) {
2914  // There are two kinds of external IV usages - those that use the value
2915  // computed in the last iteration (the PHI) and those that use the penultimate
2916  // value (the value that feeds into the phi from the loop latch).
2917  // We allow both, but they, obviously, have different values.
2918 
2919  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
2920 
2921  DenseMap<Value *, Value *> MissingVals;
2922 
2923  // An external user of the last iteration's value should see the value that
2924  // the remainder loop uses to initialize its own IV.
2925  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2926  for (User *U : PostInc->users()) {
2927  Instruction *UI = cast<Instruction>(U);
2928  if (!OrigLoop->contains(UI)) {
2929  assert(isa<PHINode>(UI) && "Expected LCSSA form");
2930  MissingVals[UI] = EndValue;
2931  }
2932  }
2933 
2934  // An external user of the penultimate value needs to see EndValue - Step.
2935  // The simplest way to get this is to recompute it from the constituent SCEVs,
2936  // that is Start + (Step * (CRD - 1)).
2937  for (User *U : OrigPhi->users()) {
2938  auto *UI = cast<Instruction>(U);
2939  if (!OrigLoop->contains(UI)) {
2940  const DataLayout &DL =
2941  OrigLoop->getHeader()->getModule()->getDataLayout();
2942  assert(isa<PHINode>(UI) && "Expected LCSSA form");
2943 
2944  IRBuilder<> B(MiddleBlock->getTerminator());
2945  Value *CountMinusOne = B.CreateSub(
2946  CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
2947  Value *CMO =
2948  !II.getStep()->getType()->isIntegerTy()
2949  ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
2950  II.getStep()->getType())
2951  : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
2952  CMO->setName("cast.cmo");
2953  Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
2954  Escape->setName("ind.escape");
2955  MissingVals[UI] = Escape;
2956  }
2957  }
2958 
2959  for (auto &I : MissingVals) {
2960  PHINode *PHI = cast<PHINode>(I.first);
2961  // One corner case we have to handle is two IVs "chasing" each-other,
2962  // that is %IV2 = phi [...], [ %IV1, %latch ]
2963  // In this case, if IV1 has an external use, we need to avoid adding both
2964  // "last value of IV1" and "penultimate value of IV2". So, verify that we
2965  // don't already have an incoming value for the middle block.
2966  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2967  PHI->addIncoming(I.second, MiddleBlock);
2968  }
2969 }
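// --- Illustrative sketch (added for exposition; not part of the original
// LLVM source). An external user of the penultimate IV value receives
// Start + Step * (CountRoundDown - 1), which is what the code above rebuilds
// from the induction descriptor and emits in the middle block. The helper
// below shows the same arithmetic for a simple integer induction; the name
// and types are hypothetical.
static long long exampleEscapeValue(long long Start, long long Step,
                                    long long CountRoundDown) {
  // Value the original phi held on entry to the last vector-covered iteration.
  return Start + Step * (CountRoundDown - 1);
}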
2970 
2971 namespace {
2972 
2973 struct CSEDenseMapInfo {
2974  static bool canHandle(const Instruction *I) {
2975  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2976  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2977  }
2978 
2979  static inline Instruction *getEmptyKey() {
2980  return DenseMapInfo<Instruction *>::getEmptyKey();
2981  }
2982 
2983  static inline Instruction *getTombstoneKey() {
2984  return DenseMapInfo<Instruction *>::getTombstoneKey();
2985  }
2986 
2987  static unsigned getHashValue(const Instruction *I) {
2988  assert(canHandle(I) && "Unknown instruction!");
2989  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2990  I->value_op_end()));
2991  }
2992 
2993  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2994  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2995  LHS == getTombstoneKey() || RHS == getTombstoneKey())
2996  return LHS == RHS;
2997  return LHS->isIdenticalTo(RHS);
2998  }
2999 };
3000 
3001 } // end anonymous namespace
3002 
3003 /// Perform CSE of induction variable instructions.
3004 static void cse(BasicBlock *BB) {
3005  // Perform simple cse.
3006  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3007  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3008  Instruction *In = &*I++;
3009 
3010  if (!CSEDenseMapInfo::canHandle(In))
3011  continue;
3012 
3013  // Check if we can replace this instruction with any of the
3014  // visited instructions.
3015  if (Instruction *V = CSEMap.lookup(In)) {
3016  In->replaceAllUsesWith(V);
3017  In->eraseFromParent();
3018  continue;
3019  }
3020 
3021  CSEMap[In] = In;
3022  }
3023 }
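// --- Illustrative sketch (added for exposition; not part of the original
// LLVM source). The simple CSE above keys a map on instruction identity
// (opcode plus operands) and replaces later duplicates with the first
// occurrence, e.g. two identical extractelements collapse into one. The
// stand-alone model below applies the same idea to plain integer keys,
// using linear search instead of a hash map so it needs no extra headers;
// the name is hypothetical.
static int exampleCSESurvivors(const int *Keys, int N) {
  int Unique = 0;
  for (int I = 0; I < N; ++I) {
    bool Seen = false;
    for (int J = 0; J < I; ++J)
      if (Keys[J] == Keys[I])
        Seen = true; // the real pass would RAUW Keys[I] and erase it
    if (!Seen)
      ++Unique; // first occurrence is kept, later duplicates fold away
  }
  return Unique;
}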
3024 
3025 /// Estimate the overhead of scalarizing an instruction. This is a
3026 /// convenience wrapper for the type-based getScalarizationOverhead API.
3027 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
3028  const TargetTransformInfo &TTI) {
3029  if (VF == 1)
3030  return 0;
3031 
3032  unsigned Cost = 0;
3033  Type *RetTy = ToVectorTy(I->getType(), VF);
3034  if (!RetTy->isVoidTy() &&
3035  (!isa<LoadInst>(I) ||
3036  !TTI.supportsEfficientVectorElementLoadStore()))
3037  Cost += TTI.getScalarizationOverhead(RetTy, true, false);
3038 
3039  // Some targets keep addresses scalar.
3040  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
3041  return Cost;
3042 
3043  if (CallInst *CI = dyn_cast<CallInst>(I)) {
3044  SmallVector<const Value *, 4> Operands(CI->arg_operands());
3045  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3046  }
3047  else if (!isa<StoreInst>(I) ||
3048  !TTI.supportsEfficientVectorElementLoadStore()) {
3049  SmallVector<const Value *, 4> Operands(I->operand_values());
3050  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3051  }
3052 
3053  return Cost;
3054 }
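// --- Illustrative sketch (added for exposition; not part of the original
// LLVM source). The scalarization overhead computed above is, roughly, the
// cost of inserting the VF scalar results back into a vector plus the cost
// of extracting each operand lane that feeds the VF scalar copies. The
// helper below shows that shape with hypothetical per-element costs in
// place of the TTI queries.
static unsigned exampleScalarizationOverhead(unsigned VF, unsigned NumOperands,
                                             unsigned InsertCost,
                                             unsigned ExtractCost) {
  unsigned Cost = VF * InsertCost;        // pack the VF scalar results
  Cost += VF * NumOperands * ExtractCost; // unpack every operand lane
  return Cost;
}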
3055 
3056 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
3057 // Return the cost of the instruction, including scalarization overhead if it's
3058 // needed. The flag NeedToScalarize shows if the call needs to be scalarized -
3059 // i.e. either vector version isn't available, or is too expensive.
3060 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
3061  const TargetTransformInfo &TTI,
3062  const TargetLibraryInfo *TLI,
3063  bool &NeedToScalarize) {
3064  Function *F = CI->getCalledFunction();
3065  StringRef FnName = CI->getCalledFunction()->getName();
3066  Type *ScalarRetTy = CI->getType();
3067  SmallVector<Type *, 4> Tys, ScalarTys;
3068  for (auto &ArgOp : CI->arg_operands())
3069  ScalarTys.push_back(ArgOp->getType());
3070 
3071  // Estimate cost of scalarized vector call. The source operands are assumed
3072  // to be vectors, so we need to extract individual elements from there,
3073  // execute VF scalar calls, and then gather the result into the vector return
3074  // value.
3075  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3076  if (VF == 1)
3077  return ScalarCallCost;
3078 
3079  // Compute corresponding vector type for return value and arguments.
3080  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3081  for (Type *ScalarTy : ScalarTys)
3082  Tys.push_back(ToVectorTy(ScalarTy, VF));
3083 
3084  // Compute costs of unpacking argument values for the scalar calls and
3085  // packing the return values to a vector.
3086  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
3087 
3088  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3089 
3090  // If we can't emit a vector call for this function, then the currently found
3091  // cost is the cost we need to return.
3092  NeedToScalarize = true;
3093  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3094  return Cost;
3095 
3096  // If the corresponding vector cost is cheaper, return its cost.
3097  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3098  if (VectorCallCost < Cost) {
3099  NeedToScalarize = false;
3100  return VectorCallCost;
3101  }
3102  return Cost;
3103 }
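// --- Illustrative sketch (added for exposition; not part of the original
// LLVM source). The decision made by getVectorCallCost reduces to comparing
// "VF scalar calls plus packing/unpacking" against one vector library call,
// when such a call exists. All cost inputs below are hypothetical stand-ins
// for the TTI/TLI queries used above.
static unsigned exampleChooseCallCost(unsigned VF, unsigned ScalarCallCost,
                                      unsigned ScalarizationCost,
                                      unsigned VectorCallCost,
                                      bool HasVectorVariant,
                                      bool &NeedToScalarize) {
  unsigned ScalarizedCost = ScalarCallCost * VF + ScalarizationCost;
  NeedToScalarize = true;
  if (HasVectorVariant && VectorCallCost < ScalarizedCost) {
    NeedToScalarize = false; // the vector variant is strictly cheaper
    return VectorCallCost;
  }
  return ScalarizedCost;
}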
3104 
3105 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
3106 // factor VF. Return the cost of the instruction, including scalarization
3107 // overhead if it's needed.
3108 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3109  const TargetTransformInfo &TTI,
3110  const TargetLibraryInfo *TLI) {
3111  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3112  assert(ID && "Expected intrinsic call!");
3113 
3114  FastMathFlags FMF;
3115  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3116  FMF = FPMO->getFastMathFlags();
3117 
3118  SmallVector<Value *, 4> Operands(CI->arg_operands());
3119  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3120 }
3121 
3122 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3123  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3124  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3125  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3126 }
3127 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3128  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3129  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3130  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3131 }
3132 
3133 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3134  // For every instruction `I` in MinBWs, truncate the operands, create a
3135  // truncated version of `I` and reextend its result. InstCombine runs
3136  // later and will remove any ext/trunc pairs.
3137  SmallPtrSet<Value *, 4> Erased;
3138  for (const auto &KV : Cost->getMinimalBitwidths()) {
3139  // If the value wasn't vectorized, we must maintain the original scalar
3140  // type. The absence of the value from VectorLoopValueMap indicates that it
3141  // wasn't vectorized.
3142  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3143  continue;
3144  for (unsigned Part = 0; Part < UF; ++Part) {
3145  Value *I = getOrCreateVectorValue(KV.first, Part);
3146  if (Erased.find(I) != Erased.end() || I->use_empty() ||
3147  !isa<Instruction>(I))
3148  continue;
3149  Type *OriginalTy = I->getType();
3150  Type *ScalarTruncatedTy =
3151  IntegerType::get(OriginalTy->getContext(), KV.second);
3152  Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3153  OriginalTy->getVectorNumElements());
3154  if (TruncatedTy == OriginalTy)
3155  continue;
3156 
3157  IRBuilder<> B(cast<Instruction>(I));
3158  auto ShrinkOperand = [&](Value *V) -> Value * {
3159  if (auto *ZI = dyn_cast<ZExtInst>(V))
3160  if (ZI->getSrcTy() == TruncatedTy)
3161  return ZI->getOperand(0);
3162  return B.CreateZExtOrTrunc(V, TruncatedTy);
3163  };
3164 
3165  // The actual instruction modification depends on the instruction type,
3166  // unfortunately.
3167  Value *NewI = nullptr;
3168  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3169  NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3170  ShrinkOperand(BO->getOperand(1)));
3171 
3172  // Any wrapping introduced by shrinking this operation shouldn't be
3173  // considered undefined behavior. So, we can't unconditionally copy
3174  // arithmetic wrapping flags to NewI.
3175  cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3176  } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3177  NewI =
3178  B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3179  ShrinkOperand(CI->getOperand(1)));
3180  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3181  NewI = B.CreateSelect(SI->getCondition(),
3182  ShrinkOperand(SI->getTrueValue()),
3183  ShrinkOperand(SI->getFalseValue()));
3184  } else if (auto *CI = dyn_cast<CastInst>(I)) {
3185  switch (CI->getOpcode()) {
3186  default:
3187  llvm_unreachable("Unhandled cast!");
3188  case Instruction::Trunc:
3189  NewI = ShrinkOperand(CI->getOperand(0));
3190  break;
3191  case Instruction::SExt:
3192  NewI = B.CreateSExtOrTrunc(
3193  CI->getOperand(0),
3194  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3195  break;
3196  case Instruction::ZExt:
3197  NewI = B.CreateZExtOrTrunc(
3198  CI->getOperand(0),
3199  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3200  break;
3201  }
3202  } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3203  auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3204  auto *O0 = B.CreateZExtOrTrunc(
3205  SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3206  auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3207  auto *O1 = B.CreateZExtOrTrunc(
3208  SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3209 
3210  NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3211  } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3212  // Don't do anything with the operands, just extend the result.
3213  continue;
3214  } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3215  auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3216  auto *O0 = B.CreateZExtOrTrunc(
3217  IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3218  auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3219  NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3220  } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3221  auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3222  auto *O0 = B.CreateZExtOrTrunc(
3223  EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3224  NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3225  } else {
3226  // If we don't know what to do, be conservative and don't do anything.
3227  continue;
3228  }
3229 
3230  // Lastly, extend the result.
3231  NewI->takeName(cast<Instruction>(I));
3232  Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3233  I->replaceAllUsesWith(Res);
3234  cast<Instruction>(I)->eraseFromParent();
3235  Erased.insert(I);
3236  VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3237  }
3238  }
3239 
3240  // We'll have created a bunch of ZExts that are now parentless. Clean up.
3241  for (const auto &KV : Cost->getMinimalBitwidths()) {
3242  // If the value wasn't vectorized, we must maintain the original scalar
3243  // type. The absence of the value from VectorLoopValueMap indicates that it
3244  // wasn't vectorized.
3245  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3246  continue;
3247  for (unsigned Part = 0; Part < UF; ++Part) {
3248  Value *I = getOrCreateVectorValue(KV.first, Part);
3249  ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3250  if (Inst && Inst->use_empty()) {
3251  Value *NewI = Inst->getOperand(0);
3252  Inst->eraseFromParent();
3253  VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3254  }
3255  }
3256  }
3257 }
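// --- Illustrative sketch (added for exposition; not part of the original
// LLVM source). The rewrite above computes in a narrower type and then
// extends the result; the scalar analogue for a value whose minimal
// bitwidth is 8 is shown below. InstCombine later removes the redundant
// trunc/zext pairs. The helper name is hypothetical.
static unsigned exampleShrunkAdd(unsigned A, unsigned B) {
  unsigned char NarrowA = (unsigned char)A;                     // trunc i32 -> i8
  unsigned char NarrowB = (unsigned char)B;
  unsigned char NarrowSum = (unsigned char)(NarrowA + NarrowB); // i8 add
  return (unsigned)NarrowSum;                                   // zext i8 -> i32
}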
3258 
3259 void InnerLoopVectorizer::fixVectorizedLoop() {
3260  // Insert truncates and extends for any truncated instructions as hints to
3261  // InstCombine.
3262  if (VF > 1)
3263  truncateToMinimalBitwidths();
3264 
3265  // Fix widened non-induction PHIs by setting up the PHI operands.
3266  if (OrigPHIsToFix.size()) {
3268  "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3269  fixNonInductionPHIs();
3270  }
3271 
3272  // At this point every instruction in the original loop is widened to a
3273  // vector form. Now we need to fix the recurrences in the loop. These PHI
3274  // nodes are currently empty because we did not want to introduce cycles.
3275  // This is the second stage of vectorizing recurrences.
3276  fixCrossIterationPHIs();
3277 
3278  // Update the dominator tree.
3279  //
3280  // FIXME: After creating the structure of the new loop, the dominator tree is
3281  // no longer up-to-date, and it remains that way until we update it
3282  // here. An out-of-date dominator tree is problematic for SCEV,
3283  // because SCEVExpander uses it to guide code generation. The
3284  // vectorizer uses SCEVExpanders in several places. Instead, we should
3285  // keep the dominator tree up-to-date as we go.
3286  updateAnalysis();
3287 
3288  // Fix-up external users of the induction variables.
3289  for (auto &Entry : *Legal->getInductionVars())
3290  fixupIVUsers(Entry.first, Entry.second,
3292  IVEndValues[Entry.first], LoopMiddleBlock);
3293 
3294  fixLCSSAPHIs();
3295  for (Instruction *PI : PredicatedInstructions)
3296  sinkScalarOperands(&*PI);
3297 
3298  // Remove redundant induction instructions.
3299  cse(LoopVectorBody);
3300 }
3301 
3302 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3303  // In order to support recurrences we need to be able to vectorize Phi nodes.
3304  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3305  // stage #2: We now need to fix the recurrences by adding incoming edges to
3306  // the currently empty PHI nodes. At this point every instruction in the
3307  // original loop is widened to a vector form so we can use them to construct
3308  // the incoming edges.
3309  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3310  // Handle first-order recurrences and reductions that need to be fixed.
3311  if (Legal->isFirstOrderRecurrence(&Phi))
3312  fixFirstOrderRecurrence(&Phi);
3313  else if (Legal->isReductionVariable(&Phi))
3314  fixReduction(&Phi);
3315  }
3316 }
3317 
3318 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3319  // This is the second phase of vectorizing first-order recurrences. An
3320  // overview of the transformation is described below. Suppose we have the
3321  // following loop.
3322  //
3323  // for (int i = 0; i < n; ++i)
3324  // b[i] = a[i] - a[i - 1];
3325  //
3326  // There is a first-order recurrence on "a". For this loop, the shorthand
3327  // scalar IR looks like:
3328  //
3329  // scalar.ph:
3330  // s_init = a[-1]
3331  // br scalar.body
3332  //
3333  // scalar.body:
3334  // i = phi [0, scalar.ph], [i+1, scalar.body]
3335  // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3336  // s2 = a[i]
3337  // b[i] = s2 - s1
3338  // br cond, scalar.body, ...
3339  //
3340  // In this example, s1 is a recurrence because its value depends on the
3341  // previous iteration. In the first phase of vectorization, we created a
3342  // temporary value for s1. We now complete the vectorization and produce the
3343  // shorthand vector IR shown below (for VF = 4, UF = 1).
3344  //
3345  // vector.ph:
3346  // v_init = vector(..., ..., ..., a[-1])
3347  // br vector.body
3348  //
3349  // vector.body
3350  // i = phi [0, vector.ph], [i+4, vector.body]
3351  // v1 = phi [v_init, vector.ph], [v2, vector.body]
3352  // v2 = a[i, i+1, i+2, i+3];
3353  // v3 = vector(v1(3), v2(0, 1, 2))
3354  // b[i, i+1, i+2, i+3] = v2 - v3
3355  // br cond, vector.body, middle.block
3356  //
3357  // middle.block:
3358  // x = v2(3)
3359  // br scalar.ph
3360  //
3361  // scalar.ph:
3362  // s_init = phi [x, middle.block], [a[-1], otherwise]
3363  // br scalar.body
3364  //
3365  // After execution completes the vector loop, we extract the next value of
3366  // the recurrence (x) to use as the initial value in the scalar loop.
3367 
3368  // Get the original loop preheader and single loop latch.
3369  auto *Preheader = OrigLoop->getLoopPreheader();
3370  auto *Latch = OrigLoop->getLoopLatch();
3371 
3372  // Get the initial and previous values of the scalar recurrence.
3373  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3374  auto *Previous = Phi->getIncomingValueForBlock(Latch);
3375 
3376  // Create a vector from the initial value.
3377  auto *VectorInit = ScalarInit;
3378  if (VF > 1) {
3380  VectorInit = Builder.CreateInsertElement(
3381  UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3382  Builder.getInt32(VF - 1), "vector.recur.init");
3383  }
3384 
3385  // We constructed a temporary phi node in the first phase of vectorization.
3386  // This phi node will eventually be deleted.
3387  Builder.SetInsertPoint(
3388  cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3389 
3390  // Create a phi node for the new recurrence. The current value will either be
3391  // the initial value inserted into a vector or loop-varying vector value.
3392  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3393  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3394 
3395  // Get the vectorized previous value of the last part UF - 1. It appears last
3396  // among all unrolled iterations, due to the order of their construction.
3397  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3398 
3399  // Set the insertion point after the previous value if it is an instruction.
3400  // Note that the previous value may have been constant-folded so it is not
3401  // guaranteed to be an instruction in the vector loop. Also, if the previous
3402  // value is a phi node, we should insert after all the phi nodes to avoid
3403  // breaking basic block verification.
3404  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3405  isa<PHINode>(PreviousLastPart))
3406  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3407  else
3408  Builder.SetInsertPoint(
3409  &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3410 
3411  // We will construct a vector for the recurrence by combining the values for
3412  // the current and previous iterations. This is the required shuffle mask.
3413  SmallVector<Constant *, 8> ShuffleMask(VF);
3414  ShuffleMask[0] = Builder.getInt32(VF - 1);
3415  for (unsigned I = 1; I < VF; ++I)
3416  ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3417 
3418  // The vector from which to take the initial value for the current iteration
3419  // (actual or unrolled). Initially, this is the vector phi node.
3420  Value *Incoming = VecPhi;
3421 
3422  // Shuffle the current and previous vector and update the vector parts.
3423  for (unsigned Part = 0; Part < UF; ++Part) {
3424  Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3425  Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3426  auto *Shuffle =
3427  VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3428  ConstantVector::get(ShuffleMask))
3429  : Incoming;
3430  PhiPart->replaceAllUsesWith(Shuffle);
3431  cast<Instruction>(PhiPart)->eraseFromParent();
3432  VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3433  Incoming = PreviousPart;
3434  }
3435 
3436  // Fix the latch value of the new recurrence in the vector loop.
3437  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3438 
3439  // Extract the last vector element in the middle block. This will be the
3440  // initial value for the recurrence when jumping to the scalar loop.
3441  auto *ExtractForScalar = Incoming;
3442  if (VF > 1) {
3443  Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3444  ExtractForScalar = Builder.CreateExtractElement(
3445  ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3446  }
3447  // Extract the second last element in the middle block if the
3448  // Phi is used outside the loop. We need to extract the phi itself
3449  // and not the last element (the phi update in the current iteration). This
3450  // will be the value when jumping to the exit block from the LoopMiddleBlock,
3451  // when the scalar loop is not run at all.
3452  Value *ExtractForPhiUsedOutsideLoop = nullptr;
3453  if (VF > 1)
3454  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3455  Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3456  // When loop is unrolled without vectorizing, initialize
3457  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
3458  // `Incoming`. This is analogous to the vectorized case above: extracting the
3459  // second last element when VF > 1.
3460  else if (UF > 1)
3461  ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3462 
3463  // Fix the initial value of the original recurrence in the scalar loop.
3465  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3466  for (auto *BB : predecessors(LoopScalarPreHeader)) {
3467  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3468  Start->addIncoming(Incoming, BB);
3469  }
3470 
3472  Phi->setName("scalar.recur");
3473 
3474  // Finally, fix users of the recurrence outside the loop. The users will need
3475  // either the last value of the scalar recurrence or the last value of the
3476  // vector recurrence we extracted in the middle block. Since the loop is in
3477  // LCSSA form, we just need to find all the phi nodes for the original scalar
3478  // recurrence in the exit block, and then add an edge for the middle block.
3479  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3480  if (LCSSAPhi.getIncomingValue(0) == Phi) {
3481  LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3482  }
3483  }
3484 }
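// --- Illustrative sketch (added for exposition; not part of the original
// LLVM source). The shuffle built above splices the previous and current
// vectors of the recurrence: element 0 takes the last lane of the previous
// vector and the rest take the leading lanes of the current one, so for
// VF = 4 the mask is <3, 4, 5, 6>. A hypothetical helper producing those
// indices:
static void exampleRecurrenceSpliceMask(unsigned VF, unsigned *Mask) {
  // Mask must point to VF entries. Index VF-1 selects the last lane of the
  // first shuffle operand; indices VF..2*VF-2 select lanes 0..VF-2 of the
  // second operand.
  for (unsigned I = 0; I < VF; ++I)
    Mask[I] = I + VF - 1;
}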
3485 
3486 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3487  Constant *Zero = Builder.getInt32(0);
3488 
3489  // Get its reduction variable descriptor.
3491  "Unable to find the reduction variable");
3492  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3493 
3494  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3495  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3496  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3497  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3498  RdxDesc.getMinMaxRecurrenceKind();
3499  setDebugLocFromInst(Builder, ReductionStartValue);
3500 
3501  // We need to generate a reduction vector from the incoming scalar.
3502  // To do so, we need to generate the 'identity' vector and override
3503  // one of the elements with the incoming scalar reduction. We need
3504  // to do it in the vector-loop preheader.
3506 
3507  // This is the vector-clone of the value that leaves the loop.
3508  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3509 
3510  // Find the reduction identity variable. Zero for addition, or, xor,
3511  // one for multiplication, -1 for And.
3512  Value *Identity;
3513  Value *VectorStart;
3514  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3515  RK == RecurrenceDescriptor::RK_FloatMinMax) {
3516  // MinMax reductions have the start value as their identity.
3517  if (VF == 1) {
3518  VectorStart = Identity = ReductionStartValue;
3519  } else {
3520  VectorStart = Identity =
3521  Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3522  }
3523  } else {
3524  // Handle other reduction kinds:
3525  Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3526  RK, VecTy->getScalarType());
3527  if (VF == 1) {
3528  Identity = Iden;
3529  // This vector is the Identity vector where the first element is the
3530  // incoming scalar reduction.
3531  VectorStart = ReductionStartValue;
3532  } else {
3533  Identity = ConstantVector::getSplat(VF, Iden);
3534 
3535  // This vector is the Identity vector where the first element is the
3536  // incoming scalar reduction.
3537  VectorStart =
3538  Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3539  }
3540  }
3541 
3542  // Fix the vector-loop phi.
3543 
3544  // Reductions do not have to start at zero. They can start with
3545  // any loop invariant values.
3546  BasicBlock *Latch = OrigLoop->getLoopLatch();
3547  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3548  for (unsigned Part = 0; Part < UF; ++Part) {
3549  Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3550  Value *Val = getOrCreateVectorValue(LoopVal, Part);
3551  // Make sure to add the reduction start value only to the
3552  // first unroll part.
3553  Value *StartVal = (Part == 0) ? VectorStart : Identity;
3554  cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3555  cast<PHINode>(VecRdxPhi)
3556  ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3557  }
3558 
3559  // Before each round, move the insertion point right between
3560  // the PHIs and the values we are going to write.
3561  // This allows us to write both PHINodes and the extractelement
3562  // instructions.
3564 
3565  setDebugLocFromInst(Builder, LoopExitInst);
3566 
3567  // If the vector reduction can be performed in a smaller type, we truncate
3568  // then extend the loop exit value to enable InstCombine to evaluate the
3569  // entire expression in the smaller type.
3570  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3571  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3574  VectorParts RdxParts(UF);
3575  for (unsigned Part = 0; Part < UF; ++Part) {
3576  RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3577  Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3578  Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3579  : Builder.CreateZExt(Trunc, VecTy);
3580  for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3581  UI != RdxParts[Part]->user_end();)
3582  if (*UI != Trunc) {
3583  (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3584  RdxParts[Part] = Extnd;
3585  } else {
3586  ++UI;
3587  }
3588  }
3590  for (unsigned Part = 0; Part < UF; ++Part) {
3591  RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3592  VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3593  }
3594  }
3595 
3596  // Reduce all of the unrolled parts into a single vector.
3597  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3598  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3599  setDebugLocFromInst(Builder, ReducedPartRdx);
3600  for (unsigned Part = 1; Part < UF; ++Part) {
3601  Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3602  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3603  // Floating point operations had to be 'fast' to enable the reduction.
3604  ReducedPartRdx = addFastMathFlag(
3606  ReducedPartRdx, "bin.rdx"));
3607  else
3608  ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3609  RdxPart);
3610  }
3611 
3612  if (VF > 1) {
3613  bool NoNaN = Legal->hasFunNoNaNAttr();
3614  ReducedPartRdx =
3615  createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3616  // If the reduction can be performed in a smaller type, we need to extend
3617  // the reduction to the wider type before we branch to the original loop.
3618  if (Phi->getType() != RdxDesc.getRecurrenceType())
3619  ReducedPartRdx =
3620  RdxDesc.isSigned()
3621  ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3622  : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3623  }
3624 
3625  // Create a phi node that merges control-flow from the backedge-taken check
3626  // block and the middle block.
3627  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3628  LoopScalarPreHeader->getTerminator());
3629  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3630  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3631  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3632 
3633  // Now, we need to fix the users of the reduction variable
3634  // inside and outside of the scalar remainder loop.
3635  // We know that the loop is in LCSSA form. We need to update the
3636  // PHI nodes in the exit blocks.
3637  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3638  // All PHINodes need to have a single entry edge, or two if
3639  // we already fixed them.
3640  assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3641 
3642  // We found a reduction value exit-PHI. Update it with the
3643  // incoming bypass edge.
3644  if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3645  LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3646  } // end of the LCSSA phi scan.
3647 
3648  // Fix the scalar loop reduction variable with the incoming reduction sum
3649  // from the vector body and from the backedge value.
3650  int IncomingEdgeBlockIdx =
3651  Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3652  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3653  // Pick the other block.
3654  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3655  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3656  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3657 }
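// --- Illustrative sketch (added for exposition; not part of the original
// LLVM source). For an integer add reduction, the code above seeds unroll
// part 0 with the start value (the identity elsewhere), accumulates per-lane
// partial sums in the vector loop, merges the UF parts with bin.rdx
// operations, and finally reduces the lanes in the middle block. The helper
// below computes the same final value from a hypothetical flattened array of
// the UF * VF accumulated lanes.
static int exampleAddReductionResult(const int *Lanes, unsigned UF, unsigned VF,
                                     int StartValue) {
  int Rdx = StartValue; // reductions may start from any loop-invariant value
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Rdx += Lanes[Part * VF + Lane]; // merge parts, then lanes
  return Rdx;
}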
3658 
3659 void InnerLoopVectorizer::fixLCSSAPHIs() {
3660  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3661  if (LCSSAPhi.getNumIncomingValues() == 1) {
3662  auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3663  // Non-instruction incoming values will have only one value.
3664  unsigned LastLane = 0;
3665  if (isa<Instruction>(IncomingValue))
3666  LastLane = Cost->isUniformAfterVectorization(
3667  cast<Instruction>(IncomingValue), VF)
3668  ? 0
3669  : VF - 1;
3670  // Can be a loop invariant incoming value or the last scalar value to be
3671  // extracted from the vectorized loop.
3672  Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3673  Value *lastIncomingValue =
3674  getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3675  LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3676  }
3677  }
3678 }
3679 
3680 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3681  // The basic block and loop containing the predicated instruction.
3682  auto *PredBB = PredInst->getParent();
3683  auto *VectorLoop = LI->getLoopFor(PredBB);
3684 
3685  // Initialize a worklist with the operands of the predicated instruction.
3686  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3687 
3688  // Holds instructions that we need to analyze again. An instruction may be
3689  // reanalyzed if we don't yet know if we can sink it or not.
3690  SmallVector<Instruction *, 8> InstsToReanalyze;
3691 
3692  // Returns true if a given use occurs in the predicated block. Phi nodes use
3693  // their operands in their corresponding predecessor blocks.
3694  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3695  auto *I = cast<Instruction>(U.getUser());
3696  BasicBlock *BB = I->getParent();
3697  if (auto *Phi = dyn_cast<PHINode>(I))
3698  BB = Phi->getIncomingBlock(
3699  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3700  return BB == PredBB;
3701  };
3702 
3703  // Iteratively sink the scalarized operands of the predicated instruction
3704  // into the block we created for it. When an instruction is sunk, its
3705  // operands are then added to the worklist. The algorithm ends after one pass
3706  // through the worklist doesn't sink a single instruction.
3707  bool Changed;
3708  do {
3709  // Add the instructions that need to be reanalyzed to the worklist, and
3710  // reset the changed indicator.
3711  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3712  InstsToReanalyze.clear();
3713  Changed = false;
3714 
3715  while (!Worklist.empty()) {
3716  auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3717 
3718  // We can't sink an instruction if it is a phi node, is already in the
3719  // predicated block, is not in the loop, or may have side effects.
3720  if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3721  !VectorLoop->contains(I) || I->mayHaveSideEffects())
3722  continue;
3723 
3724  // It's legal to sink the instruction if all its uses occur in the
3725  // predicated block. Otherwise, there's nothing to do yet, and we may
3726  // need to reanalyze the instruction.
3727  if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3728  InstsToReanalyze.push_back(I);
3729  continue;
3730  }
3731 
3732  // Move the instruction to the beginning of the predicated block, and add
3733  // its operands to the worklist.
3734  I->moveBefore(&*PredBB->getFirstInsertionPt());
3735  Worklist.insert(I->op_begin(), I->op_end());
3736 
3737  // The sinking may have enabled other instructions to be sunk, so we will
3738  // need to iterate.
3739  Changed = true;
3740  }
3741  } while (Changed);
3742 }
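// --- Illustrative sketch (added for exposition; not part of the original
// LLVM source). sinkScalarOperands is a classic worklist fixed point: keep
// sweeping, re-queueing items whose fate is not yet known, until a full pass
// makes no change. The stand-alone model below uses the same do/while shape
// on a plain array, "sinking" (zeroing) an entry once its predecessor has
// been sunk; everything about it is hypothetical.
static int exampleWorklistFixedPoint(int *Work, int N) {
  bool Changed;
  int Sunk = 0;
  do {
    Changed = false;
    for (int I = 1; I < N; ++I)
      if (Work[I] != 0 && Work[I - 1] == 0) {
        Work[I] = 0; // this "sink" may enable further sinks in the next sweep
        ++Sunk;
        Changed = true;
      }
  } while (Changed);
  return Sunk;
}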
3743 
3744 void InnerLoopVectorizer::fixNonInductionPHIs() {
3745  for (PHINode *OrigPhi : OrigPHIsToFix) {
3746  PHINode *NewPhi =
3747  cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3748  unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3749 
3750  SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3751  predecessors(OrigPhi->getParent()));
3752  SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3753  predecessors(NewPhi->getParent()));
3754  assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3755  "Scalar and Vector BB should have the same number of predecessors");
3756 
3757  // The insertion point in Builder may be invalidated by the time we get
3758  // here. Force the Builder insertion point to something valid so that we do
3759  // not run into issues during insertion point restore in
3760  // getOrCreateVectorValue calls below.
3761  Builder.SetInsertPoint(NewPhi);
3762 
3763  // The predecessor order is preserved and we can rely on mapping between
3764  // scalar and vector block predecessors.
3765  for (unsigned i = 0; i < NumIncomingValues; ++i) {
3766  BasicBlock *NewPredBB = VectorBBPredecessors[i];
3767 
3768  // When looking up the new scalar/vector values to fix up, use incoming
3769  // values from original phi.
3770  Value *ScIncV =
3771  OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3772 
3773  // Scalar incoming value may need a broadcast
3774  Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3775  NewPhi->addIncoming(NewIncV, NewPredBB);
3776  }
3777  }
3778 }
3779 
3780 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3781  unsigned VF) {
3782  PHINode *P = cast<PHINode>(PN);
3783  if (EnableVPlanNativePath) {
3784  // Currently we enter here in the VPlan-native path for non-induction
3785  // PHIs where all control flow is uniform. We simply widen these PHIs.
3786  // Create a vector phi with no operands - the vector phi operands will be
3787  // set at the end of vector code generation.
3788  Type *VecTy =
3789  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3790  Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3791  VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3792  OrigPHIsToFix.push_back(P);
3793 
3794  return;
3795  }
3796 
3797  assert(PN->getParent() == OrigLoop->getHeader() &&
3798  "Non-header phis should have been handled elsewhere");
3799 
3800  // In order to support recurrences we need to be able to vectorize Phi nodes.
3801  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3802  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3803  // this value when we vectorize all of the instructions that use the PHI.
3804  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3805  for (unsigned Part = 0; Part < UF; ++Part) {
3806  // This is phase one of vectorizing PHIs.
3807  Type *VecTy =
3808  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3809  Value *EntryPart = PHINode::Create(
3810  VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3811  VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3812  }
3813  return;
3814  }
3815 
3817 
3818  // This PHINode must be an induction variable.
3819  // Make sure that we know about it.
3820  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3822  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3821 
3823  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3824 
3825  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3826  // which can be found from the original scalar operations.
3827  switch (II.getKind()) {
3828  case InductionDescriptor::IK_NoInduction:
3829  llvm_unreachable("Unknown induction");
3830  case InductionDescriptor::IK_IntInduction:
3831  case InductionDescriptor::IK_FpInduction:
3832  llvm_unreachable("Integer/fp induction is handled elsewhere.");
3833  case InductionDescriptor::IK_PtrInduction: {
3834  // Handle the pointer induction variable case.
3835  assert(P->getType()->isPointerTy() && "Unexpected type.");
3836  // This is the normalized GEP that starts counting at zero.
3837  Value *PtrInd = Induction;
3838  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3839  // Determine the number of scalars we need to generate for each unroll
3840  // iteration. If the instruction is uniform, we only need to generate the
3841  // first lane. Otherwise, we generate all VF values.
3842  unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3843  // These are the scalar results. Notice that we don't generate vector GEPs
3844  // because scalar GEPs result in better code.
3845  for (unsigned Part = 0; Part < UF; ++Part) {
3846  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3847  Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3848  Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3849  Value *SclrGep =
3850  emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3851  SclrGep->setName("next.gep");
3852  VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3853  }
3854  }
3855  return;
3856  }
3857  }
3858 }
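// --- Illustrative sketch (added for exposition; not part of the original
// LLVM source). For the pointer induction above, the scalar emitted for
// unroll part 'Part' and lane 'Lane' starts from the normalized index
// PtrInd + Part * VF + Lane; emitTransformedIndex then applies the
// induction's start value and step to turn that into the actual address.
// The helper name below is hypothetical.
static unsigned long long examplePtrIndNormalizedIndex(unsigned long long PtrInd,
                                                       unsigned Part,
                                                       unsigned Lane,
                                                       unsigned VF) {
  return PtrInd + (unsigned long long)(Lane + Part * VF);
}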
3859 
3860 /// A helper function for checking whether an integer division-related
3861 /// instruction may divide by zero (in which case it must be predicated if
3862 /// executed conditionally in the scalar code).
3863 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3864 /// Non-zero divisors that are not compile-time constants will not be
3865 /// converted into multiplication, so we will still end up scalarizing
3866 /// the division, but can do so w/o predication.
3867 static bool mayDivideByZero(Instruction &I) {
3868  assert((I.getOpcode() == Instruction::UDiv ||
3869  I.getOpcode() == Instruction::SDiv ||
3870  I.getOpcode() == Instruction::URem ||
3871  I.getOpcode() == Instruction::SRem) &&
3872  "Unexpected instruction");
3873  Value *Divisor = I.getOperand(1);
3874  auto *CInt = dyn_cast<ConstantInt>(Divisor);
3875  return !CInt || CInt->isZero();
3876 }
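// --- Illustrative sketch (added for exposition; not part of the original
// LLVM source). The reason conditional divisions must stay predicated: in
// the scalar loop the divide only executes under its guard, so speculating
// it for masked-off lanes could divide by zero, e.g.
//
//   for (int i = 0; i < n; ++i)
//     if (b[i] != 0)          // guard
//       c[i] = a[i] / b[i];   // only safe under the guard
//
// A hypothetical helper mirroring the check above for a known divisor:
static bool exampleMayDivideByZero(bool DivisorIsConstant, long long Divisor) {
  // Only a non-zero compile-time-constant divisor is provably safe.
  return !DivisorIsConstant || Divisor == 0;
}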
3877 
3878 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
3879  switch (I.getOpcode()) {
3880  case Instruction::Br:
3881  case Instruction::PHI:
3882  llvm_unreachable("This instruction is handled by a different recipe.");
3883  case Instruction::GetElementPtr: {
3884  // Construct a vector GEP by widening the operands of the scalar GEP as
3885  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3886  // results in a vector of pointers when at least one operand of the GEP
3887  // is vector-typed. Thus, to keep the representation compact, we only use
3888  // vector-typed operands for loop-varying values.
3889  auto *GEP = cast<GetElementPtrInst>(&I);
3890 
3891  if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
3892  // If we are vectorizing, but the GEP has only loop-invariant operands,
3893  // the GEP we build (by only using vector-typed operands for
3894  // loop-varying values) would be a scalar pointer. Thus, to ensure we
3895  // produce a vector of pointers, we need to either arbitrarily pick an
3896  // operand to broadcast, or broadcast a clone of the original GEP.
3897  // Here, we broadcast a clone of the original.
3898  //
3899  // TODO: If at some point we decide to scalarize instructions having
3900  // loop-invariant operands, this special case will no longer be
3901  // required. We would add the scalarization decision to
3902  // collectLoopScalars() and teach getVectorValue() to broadcast
3903  // the lane-zero scalar value.
3904  auto *Clone = Builder.Insert(GEP->clone());
3905  for (unsigned Part = 0; Part < UF; ++Part) {
3906  Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
3907  VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
3908  addMetadata(EntryPart, GEP);
3909  }
3910  } else {
3911  // If the GEP has at least one loop-varying operand, we are sure to
3912  // produce a vector of pointers. But if we are only unrolling, we want
3913  // to produce a scalar GEP for each unroll part. Thus, the GEP we
3914  // produce with the code below will be scalar (if VF == 1) or vector
3915  // (otherwise). Note that for the unroll-only case, we still maintain
3916  // values in the vector mapping with initVector, as we do for other
3917  // instructions.
3918  for (unsigned Part = 0; Part < UF; ++Part) {
3919  // The pointer operand of the new GEP. If it's loop-invariant, we
3920  // won't broadcast it.
3921  auto *Ptr =
3922  OrigLoop->isLoopInvariant(GEP->getPointerOperand())
3923  ? GEP->getPointerOperand()
3924  : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
3925 
3926  // Collect all the indices for the new GEP. If any index is
3927  // loop-invariant, we won't broadcast it.
3928  SmallVector<Value *, 4> Indices;
3929  for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
3930  if (OrigLoop->isLoopInvariant(U.get()))
3931  Indices.push_back(U.get());
3932  else
3933  Indices.push_back(getOrCreateVectorValue(U.get(), Part));
3934  }
3935 
3936  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
3937  // but it should be a vector, otherwise.
3938  auto *NewGEP = GEP->isInBounds()
3939  ? Builder.CreateInBoundsGEP(Ptr, Indices)
3940  : Builder.CreateGEP(Ptr, Indices);
3941  assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
3942  "NewGEP is not a pointer vector");
3943  VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
3944  addMetadata(NewGEP, GEP);
3945  }
3946  }
3947 
3948  break;
3949  }
3950  case Instruction::UDiv:
3951  case Instruction::SDiv:
3952  case Instruction::SRem:
3953  case Instruction::URem:
3954  case Instruction::Add:
3955  case Instruction::FAdd:
3956  case Instruction::Sub:
3957  case Instruction::FSub:
3958  case Instruction::Mul:
3959  case Instruction::FMul:
3960  case Instruction::FDiv:
3961  case Instruction::FRem:
3962  case Instruction::Shl:
3963  case Instruction::LShr:
3964  case Instruction::AShr:
3965  case Instruction::And:
3966  case Instruction::Or:
3967  case Instruction::Xor: {
3968  // Just widen binops.
3969  auto *BinOp = cast<BinaryOperator>(&I);
3970  setDebugLocFromInst(Builder, BinOp);
3971 
3972  for (unsigned Part = 0; Part < UF; ++Part) {
3973  Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
3974  Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
3975  Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
3976 
3977  if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
3978  VecOp->copyIRFlags(BinOp);
3979 
3980  // Use this vector value for all users of the original instruction.
3981  VectorLoopValueMap.setVectorValue(&I, Part, V);
3982  addMetadata(V, BinOp);
3983  }
3984 
3985  break;
3986  }
3987  case Instruction::Select: {
3988  // Widen selects.
3989  // If the selector is loop invariant we can create a select
3990  // instruction with a scalar condition. Otherwise, use vector-select.
3991  auto *SE = PSE.getSE();
3992  bool InvariantCond =
3993  SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
3995 
3996  // The condition can be loop invariant but still defined inside the
3997  // loop. This means that we can't just use the original 'cond' value.
3998  // We have to take the 'vectorized' value and pick the first lane.
3999  // Instcombine will make this a no-op.
4000 
4001  auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4002 
4003  for (unsigned Part = 0; Part < UF; ++Part) {
4004  Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4005  Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4006  Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4007  Value *Sel =
4008  Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4009  VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4010  addMetadata(Sel, &I);
4011  }
4012 
4013  break;
4014  }
4015 
4016  case Instruction::ICmp:
4017  case Instruction::FCmp: {
4018  // Widen compares. Generate vector compares.
4019  bool FCmp = (I.getOpcode() == Instruction::FCmp);
4020  auto *Cmp = dyn_cast<CmpInst>(&I);
4022  for (unsigned Part = 0; Part < UF; ++Part) {
4023  Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4024  Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4025  Value *C = nullptr;
4026  if (FCmp) {
4027  // Propagate fast math flags.
4028  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4029  Builder.setFastMathFlags(Cmp->getFastMathFlags());
4030  C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4031  } else {
4032  C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4033  }
4034  VectorLoopValueMap.setVectorValue(&I, Part, C);
4035  addMetadata(C, &I);
4036  }
4037 
4038  break;
4039  }
4040 
4041  case Instruction::ZExt:
4042  case Instruction::SExt:
4043  case Instruction::FPToUI:
4044  case Instruction::FPToSI:
4045  case Instruction::FPExt:
4046  case Instruction::PtrToInt:
4047  case Instruction::IntToPtr:
4048  case Instruction::SIToFP:
4049  case Instruction::UIToFP:
4050  case Instruction::Trunc:
4051  case Instruction::FPTrunc:
4052  case Instruction::BitCast: {
4053  auto *CI = dyn_cast<CastInst>(&I);
4055 
4056  /// Vectorize casts.
4057  Type *DestTy =
4058  (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4059 
4060  for (unsigned Part = 0; Part < UF; ++Part) {
4061  Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4062  Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4063  VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4064  addMetadata(Cast, &I);
4065  }
4066  break;
4067  }
4068 
4069  case Instruction::Call: {
4070  // Ignore dbg intrinsics.
4071  if (isa<DbgInfoIntrinsic>(I))
4072  break;
4074 
4075  Module *M = I.getParent()->getParent()->getParent();
4076  auto *CI = cast<CallInst>(&I);
4077 
4078  StringRef FnName = CI->getCalledFunction()->getName();
4079  Function *F = CI->getCalledFunction();
4080  Type *RetTy = ToVectorTy(CI->getType(), VF);
4081  SmallVector<Type *, 4> Tys;
4082  for (Value *ArgOperand : CI->arg_operands())
4083  Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4084 
4085  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4086 
4087  // The flag shows whether we use Intrinsic or a usual Call for vectorized
4088  // version of the instruction.
4089  // Is it beneficial to perform intrinsic call compared to lib call?
4090  bool NeedToScalarize;
4091  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
4092  bool UseVectorIntrinsic =
4093  ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
4094  assert((UseVectorIntrinsic || !NeedToScalarize) &&
4095  "Instruction should be scalarized elsewhere.");
4096 
4097  for (unsigned Part = 0; Part < UF; ++Part) {
4098  SmallVector<Value *, 4> Args;
4099  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4100  Value *Arg = CI->getArgOperand(i);
4101  // Some intrinsics have a scalar argument - don't replace it with a
4102  // vector.
4103  if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4104  Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4105  Args.push_back(Arg);
4106  }
4107 
4108  Function *VectorF;
4109  if (UseVectorIntrinsic) {
4110  // Use vector version of the intrinsic.
4111  Type *TysForDecl[] = {CI->getType()};
4112  if (VF > 1)
4113  TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4114  VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4115  } else {
4116  // Use vector version of the library call.
4117  StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4118  assert(!VFnName.empty() && "Vector function name is empty.");
4119  VectorF = M->getFunction(VFnName);
4120  if (!VectorF) {
4121  // Generate a declaration
4122  FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4123  VectorF =
4124  Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4125  VectorF->copyAttributesFrom(F);
4126  }
4127  }
4128  assert(VectorF && "Can't create vector function.");
4129 
4130  SmallVector<OperandBundleDef, 1> OpBundles;
4131  CI->getOperandBundlesAsDefs(OpBundles);
4132  CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4133 
4134  if (isa<FPMathOperator>(V))
4135  V->copyFastMathFlags(CI);
4136 
4137  VectorLoopValueMap.setVectorValue(&I, Part, V);
4138  addMetadata(V, &I);
4139  }
4140 
4141  break;
4142  }
4143 
4144  default:
4145  // This instruction is not vectorized by simple widening.
4146  LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4147  llvm_unreachable("Unhandled instruction!");
4148  } // end of switch.
4149 }
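// --- Illustrative sketch (added for exposition; not part of the original
// LLVM source). "Simple widening" of a binary operator turns one scalar
// operation into one operation on VF-wide vectors per unroll part, e.g.
// "add i32 %a, %b" becomes "add <4 x i32> %wide.a, %wide.b" for VF = 4.
// The lane-by-lane scalar semantics of that widened add (hypothetical
// helper):
static void exampleWidenedAdd(const int *A, const int *B, int *Out,
                              unsigned VF) {
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Out[Lane] = A[Lane] + B[Lane]; // one vector add covers all VF lanes
}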
4150 
4151 void InnerLoopVectorizer::updateAnalysis() {
4152  // Forget the original basic block.
4154 
4155  // DT is not kept up-to-date for outer loop vectorization
4156  if (EnableVPlanNativePath)
4157  return;
4158 
4159  // Update the dominator tree information.
4161  "Entry does not dominate exit.");
4162 
4169 }
4170 
4171 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4172  // We should not collect Scalars more than once per VF. Right now, this
4173  // function is called from collectUniformsAndScalars(), which already does
4174  // this check. Collecting Scalars for VF=1 does not make any sense.
4175  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4176  "This function should not be visited twice for the same VF");
4177 
4179 
4180  // These sets are used to seed the analysis with pointers used by memory
4181  // accesses that will remain scalar.
4183  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4184 
4185  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4186  // The pointer operands of loads and stores will be scalar as long as the
4187  // memory access is not a gather or scatter operation. The value operand of a
4188  // store will remain scalar if the store is scalarized.
4189  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4190  InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4191  assert(WideningDecision != CM_Unknown &&
4192  "Widening decision should be ready at this moment");
4193  if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4194  if (Ptr == Store->getValueOperand())
4195  return WideningDecision == CM_Scalarize;
4196  assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4197  "Ptr is neither a value or pointer operand");
4198  return WideningDecision != CM_GatherScatter;
4199  };
4200 
4201  // A helper that returns true if the given value is a bitcast or
4202  // getelementptr instruction contained in the loop.
4203  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4204  return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4205  isa<GetElementPtrInst>(V)) &&
4206  !TheLoop->isLoopInvariant(V);
4207  };
4208 
4209  // A helper that evaluates a memory access's use of a pointer. If the use
4210  // will be a scalar use, and the pointer is only used by memory accesses, we
4211  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4212  // PossibleNonScalarPtrs.
4213  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4214  // We only care about bitcast and getelementptr instructions contained in
4215  // the loop.
4216  if (!isLoopVaryingBitCastOrGEP(Ptr))
4217  return;
4218 
4219  // If the pointer has already been identified as scalar (e.g., if it was
4220  // also identified as uniform), there's nothing to do.
4221  auto *I = cast<Instruction>(Ptr);
4222  if (Worklist.count(I))
4223  return;
4224 
4225  // If the use of the pointer will be a scalar use, and all users of the
4226  // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4227  // place the pointer in PossibleNonScalarPtrs.
4228  if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4229  return isa<LoadInst>(U) || isa<StoreInst>(U);
4230  }))
4231  ScalarPtrs.insert(I);
4232  else
4233  PossibleNonScalarPtrs.insert(I);
4234  };
4235 
4236  // We seed the scalars analysis with three classes of instructions: (1)
4237  // instructions marked uniform-after-vectorization, (2) bitcast and
4238  // getelementptr instructions used by memory accesses requiring a scalar use,
4239  // and (3) pointer induction variables and their update instructions (we
4240  // currently only scalarize these).
4241  //
4242  // (1) Add to the worklist all instructions that have been identified as
4243  // uniform-after-vectorization.
4244  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4245 
4246  // (2) Add to the worklist all bitcast and getelementptr instructions used by
4247  // memory accesses requiring a scalar use. The pointer operands of loads and
4248  // stores will be scalar as long as the memory access is not a gather or
4249  // scatter operation. The value operand of a store will remain scalar if the
4250  // store is scalarized.
4251  for (auto *BB : TheLoop->blocks())
4252  for (auto &I : *BB) {
4253  if (auto *Load = dyn_cast<LoadInst>(&I)) {
4254  evaluatePtrUse(Load, Load->getPointerOperand());
4255  } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4256  evaluatePtrUse(Store, Store->getPointerOperand());
4257  evaluatePtrUse(Store, Store->getValueOperand());
4258  }
4259  }
4260  for (auto *I : ScalarPtrs)
4261  if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4262  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4263  Worklist.insert(I);
4264  }
4265 
4266  // (3) Add to the worklist all pointer induction variables and their update
4267  // instructions.
4268  //
4269  // TODO: Once we are able to vectorize pointer induction variables we should
4270  // no longer insert them into the worklist here.
4271  auto *Latch = TheLoop->getLoopLatch();
4272  for (auto &Induction : *Legal->getInductionVars()) {
4273  auto *Ind = Induction.first;
4274  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4275  if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4276  continue;
4277  Worklist.insert(Ind);
4278  Worklist.insert(IndUpdate);
4279  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4280  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4281  << "\n");
4282  }
4283 
4284  // Insert the forced scalars.
4285  // FIXME: Currently widenPHIInstruction() often creates a dead vector
4286  // induction variable when the PHI user is scalarized.
4287  auto ForcedScalar = ForcedScalars.find(VF);
4288  if (ForcedScalar != ForcedScalars.end())
4289  for (auto *I : ForcedScalar->second)
4290  Worklist.insert(I);
4291 
4292  // Expand the worklist by looking through any bitcasts and getelementptr
4293  // instructions we've already identified as scalar. This is similar to the
4294  // expansion step in collectLoopUniforms(); however, here we're only
4295  // expanding to include additional bitcasts and getelementptr instructions.
4296  unsigned Idx = 0;
4297  while (Idx != Worklist.size()) {
4298  Instruction *Dst = Worklist[Idx++];
4299  if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4300  continue;
4301  auto *Src = cast<Instruction>(Dst->getOperand(0));
4302  if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4303  auto *J = cast<Instruction>(U);
4304  return !TheLoop->contains(J) || Worklist.count(J) ||
4305  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4306  isScalarUse(J, Src));
4307  })) {
4308  Worklist.insert(Src);
4309  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4310  }
4311  }
4312 
4313  // An induction variable will remain scalar if all users of the induction
4314  // variable and induction variable update remain scalar.
4315  for (auto &Induction : *Legal->getInductionVars()) {
4316  auto *Ind = Induction.first;
4317  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4318 
4319  // We already considered pointer induction variables, so there's no reason
4320  // to look at their users again.
4321  //
4322  // TODO: Once we are able to vectorize pointer induction variables we
4323  // should no longer skip over them here.
4324  if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4325  continue;
4326 
4327  // Determine if all users of the induction variable are scalar after
4328  // vectorization.
4329  auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4330  auto *I = cast<Instruction>(U);
4331  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4332  });
4333  if (!ScalarInd)
4334  continue;
4335 
4336  // Determine if all users of the induction variable update instruction are
4337  // scalar after vectorization.
4338  auto ScalarIndUpdate =
4339  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4340  auto *I = cast<Instruction>(U);
4341  return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4342  });
4343  if (!ScalarIndUpdate)
4344  continue;
4345 
4346  // The induction variable and its update instruction will remain scalar.
4347  Worklist.insert(Ind);
4348  Worklist.insert(IndUpdate);
4349  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4350  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4351  << "\n");
4352  }
4353 
4354  Scalars[VF].insert(Worklist.begin(), Worklist.end());
4355 }
4356 
4357 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4358  if (!blockNeedsPredication(I->getParent()))
4359  return false;
4360  switch(I->getOpcode()) {
4361  default:
4362  break;
4363  case Instruction::Load:
4364  case Instruction::Store: {
4365  if (!Legal->isMaskRequired(I))
4366  return false;
4367  auto *Ptr = getLoadStorePointerOperand(I);
4368  auto *Ty = getMemInstValueType(I);
4369  // We have already decided how to vectorize this instruction, get that
4370  // result.
4371  if (VF > 1) {
4372  InstWidening WideningDecision = getWideningDecision(I, VF);
4373  assert(WideningDecision != CM_Unknown &&
4374  "Widening decision should be ready at this moment");
4375  return WideningDecision == CM_Scalarize;
4376  }
4377  return isa<LoadInst>(I) ?
4378  !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4379  : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4380  }
4381  case Instruction::UDiv:
4382  case Instruction::SDiv:
4383  case Instruction::SRem:
4384  case Instruction::URem:
4385  return mayDivideByZero(*I);
4386  }
4387  return false;
4388 }
4389 
4390 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4391  unsigned VF) {
4392  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4393  assert(getWideningDecision(I, VF) == CM_Unknown &&
4394  "Decision should not be set yet.");
4395  auto *Group = getInterleavedAccessGroup(I);
4396  assert(Group && "Must have a group.");
4397 
4398  // Check if masking is required.
4399  // A Group may need masking for one of two reasons: it resides in a block that
4400  // needs predication, or it was decided to use masking to deal with gaps.
4401  bool PredicatedAccessRequiresMasking =
4402  Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4403  bool AccessWithGapsRequiresMasking =
4404  Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
4405  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4406  return true;
4407 
4408  // If masked interleaving is required, we expect that the user/target had
4409  // enabled it, because otherwise it either wouldn't have been created or
4410  // it should have been invalidated by the CostModel.
4411  assert(useMaskedInterleavedAccesses(TTI) &&
4412  "Masked interleave-groups for predicated accesses are not enabled.");
4413 
4414  auto *Ty = getMemInstValueType(I);
4415  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4416  : TTI.isLegalMaskedStore(Ty);
4417 }
4418 
4419 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4420  unsigned VF) {
4421  // Get and ensure we have a valid memory instruction.
4422  LoadInst *LI = dyn_cast<LoadInst>(I);
4423  StoreInst *SI = dyn_cast<StoreInst>(I);
4424  assert((LI || SI) && "Invalid memory instruction");
4425 
4426  auto *Ptr = getLoadStorePointerOperand(I);
4427 
4428  // In order to be widened, the pointer should be consecutive, first of all.
4429  if (!Legal->isConsecutivePtr(Ptr))
4430  return false;
4431 
4432  // If the instruction is a store located in a predicated block, it will be
4433  // scalarized.
4434  if (isScalarWithPredication(I))
4435  return false;
4436 
4437  // If the instruction's allocated size doesn't equal its type size, it
4438  // requires padding and will be scalarized.
4439  auto &DL = I->getModule()->getDataLayout();
4440  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4441  if (hasIrregularType(ScalarTy, DL, VF))
4442  return false;
4443 
4444  return true;
4445 }
4446 
4447 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4448  // We should not collect Uniforms more than once per VF. Right now,
4449  // this function is called from collectUniformsAndScalars(), which
4450  // already does this check. Collecting Uniforms for VF=1 does not make any
4451  // sense.
4452 
4453  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4454  "This function should not be visited twice for the same VF");
4455 
4456  // Seed Uniforms[VF] with an empty set. Even if we find no uniform values, the
4457  // entry ensures we will not analyze this VF again: Uniforms.count(VF) will return 1.
4458  Uniforms[VF].clear();
4459 
4460  // We now know that the loop is vectorizable!
4461  // Collect instructions inside the loop that will remain uniform after
4462  // vectorization.
4463 
4464  // Global values, params and instructions outside of current loop are out of
4465  // scope.
4466  auto isOutOfScope = [&](Value *V) -> bool {
4467  Instruction *I = dyn_cast<Instruction>(V);
4468  return (!I || !TheLoop->contains(I));
4469  };
4470 
4471  SetVector<Instruction *> Worklist;
4472  BasicBlock *Latch = TheLoop->getLoopLatch();
4473 
4474  // Start with the conditional branch. If the branch condition is an
4475  // instruction contained in the loop that is only used by the branch, it is
4476  // uniform.
4477  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4478  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4479  Worklist.insert(Cmp);
4480  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4481  }
4482 
4483  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4484  // are pointers that are treated like consecutive pointers during
4485  // vectorization. The pointer operands of interleaved accesses are an
4486  // example.
4487  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4488 
4489  // Holds pointer operands of instructions that are possibly non-uniform.
4490  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4491 
4492  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4493  InstWidening WideningDecision = getWideningDecision(I, VF);
4494  assert(WideningDecision != CM_Unknown &&
4495  "Widening decision should be ready at this moment");
4496 
4497  return (WideningDecision == CM_Widen ||
4498  WideningDecision == CM_Widen_Reverse ||
4499  WideningDecision == CM_Interleave);
4500  };
4501  // Iterate over the instructions in the loop, and collect all
4502  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4503  // that a consecutive-like pointer operand will be scalarized, we collect it
4504  // in PossibleNonUniformPtrs instead. We use two sets here because a single
4505  // getelementptr instruction can be used by both vectorized and scalarized
4506  // memory instructions. For example, if a loop loads and stores from the same
4507  // location, but the store is conditional, the store will be scalarized, and
4508  // the getelementptr won't remain uniform.
4509  for (auto *BB : TheLoop->blocks())
4510  for (auto &I : *BB) {
4511  // If there's no pointer operand, there's nothing to do.
4512  auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4513  if (!Ptr)
4514  continue;
4515 
4516  // True if all users of Ptr are memory accesses that have Ptr as their
4517  // pointer operand.
4518  auto UsersAreMemAccesses =
4519  llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4520  return getLoadStorePointerOperand(U) == Ptr;
4521  });
4522 
4523  // Ensure the memory instruction will not be scalarized or used by
4524  // gather/scatter, making its pointer operand non-uniform. If the pointer
4525  // operand is used by any instruction other than a memory access, we
4526  // conservatively assume the pointer operand may be non-uniform.
4527  if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4528  PossibleNonUniformPtrs.insert(Ptr);
4529 
4530  // If the memory instruction will be vectorized and its pointer operand
4531  // is consecutive-like, or interleaving - the pointer operand should
4532  // remain uniform.
4533  else
4534  ConsecutiveLikePtrs.insert(Ptr);
4535  }
4536 
4537  // Add to the Worklist all consecutive and consecutive-like pointers that
4538  // aren't also identified as possibly non-uniform.
4539  for (auto *V : ConsecutiveLikePtrs)
4540  if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4541  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4542  Worklist.insert(V);
4543  }
4544 
4545  // Expand Worklist in topological order: whenever a new instruction
4546  // is added, its users should already be inside Worklist. This ensures that a
4547  // uniform instruction will only be used by uniform instructions.
4548  unsigned idx = 0;
4549  while (idx != Worklist.size()) {
4550  Instruction *I = Worklist[idx++];
4551 
4552  for (auto OV : I->operand_values()) {
4553  // isOutOfScope operands cannot be uniform instructions.
4554  if (isOutOfScope(OV))
4555  continue;
4556  // First-order recurrence phis should typically be considered
4557  // non-uniform.
4558  auto *OP = dyn_cast<PHINode>(OV);
4559  if (OP && Legal->isFirstOrderRecurrence(OP))
4560  continue;
4561  // If all the users of the operand are uniform, then add the
4562  // operand into the uniform worklist.
4563  auto *OI = cast<Instruction>(OV);
4564  if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4565  auto *J = cast<Instruction>(U);
4566  return Worklist.count(J) ||
4567  (OI == getLoadStorePointerOperand(J) &&
4568  isUniformDecision(J, VF));
4569  })) {
4570  Worklist.insert(OI);
4571  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4572  }
4573  }
4574  }
4575 
4576  // Returns true if Ptr is the pointer operand of a memory access instruction
4577  // I, and I is known to not require scalarization.
4578  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4579  return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4580  };
4581 
4582  // For an instruction to be added into Worklist above, all its users inside
4583  // the loop should also be in Worklist. However, this condition cannot be
4584  // true for phi nodes that form a cyclic dependence. We must process phi
4585  // nodes separately. An induction variable will remain uniform if all users
4586  // of the induction variable and induction variable update remain uniform.
4587  // The code below handles both pointer and non-pointer induction variables.
4588  for (auto &Induction : *Legal->getInductionVars()) {
4589  auto *Ind = Induction.first;
4590  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4591 
4592  // Determine if all users of the induction variable are uniform after
4593  // vectorization.
4594  auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4595  auto *I = cast<Instruction>(U);
4596  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4597  isVectorizedMemAccessUse(I, Ind);
4598  });
4599  if (!UniformInd)
4600  continue;
4601 
4602  // Determine if all users of the induction variable update instruction are
4603  // uniform after vectorization.
4604  auto UniformIndUpdate =
4605  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4606  auto *I = cast<Instruction>(U);
4607  return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4608  isVectorizedMemAccessUse(I, IndUpdate);
4609  });
4610  if (!UniformIndUpdate)
4611  continue;
4612 
4613  // The induction variable and its update instruction will remain uniform.
4614  Worklist.insert(Ind);
4615  Worklist.insert(IndUpdate);
4616  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4617  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4618  << "\n");
4619  }
4620 
4621  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4622 }
4623 
4624 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4625  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4626  // TODO: It may be useful to do so, since the checks are still likely to be
4627  // dynamically uniform if the target can skip them.
4628  LLVM_DEBUG(
4629  dbgs() << "LV: Not inserting runtime ptr check for divergent target");
4630 
4631  ORE->emit(
4632  createMissedAnalysis("CantVersionLoopWithDivergentTarget")
4633  << "runtime pointer checks needed. Not enabled for divergent target");
4634 
4635  return None;
4636  }
4637 
4638  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4639  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
4640  return computeFeasibleMaxVF(OptForSize, TC);
4641 
4642  if (Legal->getRuntimePointerChecking()->Need) {
4643  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4644  << "runtime pointer checks needed. Enable vectorization of this "
4645  "loop with '#pragma clang loop vectorize(enable)' when "
4646  "compiling with -Os/-Oz");
4647  LLVM_DEBUG(
4648  dbgs()
4649  << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
4650  return None;
4651  }
4652 
4653  if (!PSE.getUnionPredicate().getPredicates().empty()) {
4654  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4655  << "runtime SCEV checks needed. Enable vectorization of this "
4656  "loop with '#pragma clang loop vectorize(enable)' when "
4657  "compiling with -Os/-Oz");
4658  LLVM_DEBUG(
4659  dbgs()
4660  << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
4661  return None;
4662  }
4663 
4664  // FIXME: Avoid specializing for stride==1 instead of bailing out.
4665  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4666  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4667  << "runtime stride == 1 checks needed. Enable vectorization of "
4668  "this loop with '#pragma clang loop vectorize(enable)' when "
4669  "compiling with -Os/-Oz");
4670  LLVM_DEBUG(
4671  dbgs()
4672  << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
4673  return None;
4674  }
4675 
4676  // If we optimize the program for size, avoid creating the tail loop.
4677  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4678 
4679  if (TC == 1) {
4680  ORE->emit(createMissedAnalysis("SingleIterationLoop")
4681  << "loop trip count is one, irrelevant for vectorization");
4682  LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
4683  return None;
4684  }
4685 
4686  // Record that scalar epilogue is not allowed.
4687  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4688 
4689  IsScalarEpilogueAllowed = !OptForSize;
4690 
4691  // We don't create an epilogue when optimizing for size.
4692  // Invalidate interleave groups that require an epilogue if we can't mask
4693  // the interleave-group.
4694  if (!useMaskedInterleavedAccesses(TTI))
4695  InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4696 
4697  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
4698 
4699  if (TC > 0 && TC % MaxVF == 0) {
4700  LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4701  return MaxVF;
4702  }
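// For illustration (hypothetical numbers): with TC = 16 and MaxVF = 8,
// 16 % 8 == 0, so whole vector iterations cover every scalar iteration and no
// tail is needed; with TC = 20 the remainder of 4 forces the tail-folding or
// abort paths below.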
4703 
4704  // If we don't know the precise trip count, or if the trip count that we
4705  // found modulo the vectorization factor is not zero, try to fold the tail
4706  // by masking.
4707  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4708  if (Legal->canFoldTailByMasking()) {
4709  FoldTailByMasking = true;
4710  return MaxVF;
4711  }
4712 
4713  if (TC == 0) {
4714  ORE->emit(
4715  createMissedAnalysis("UnknownLoopCountComplexCFG")
4716  << "unable to calculate the loop count due to complex control flow");
4717  return None;
4718  }
4719 
4720  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
4721  << "cannot optimize for size and vectorize at the same time. "
4722  "Enable vectorization of this loop with '#pragma clang loop "
4723  "vectorize(enable)' when compiling with -Os/-Oz");
4724  return None;
4725 }
4726 
4727 unsigned
4728 LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
4729  unsigned ConstTripCount) {
4730  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4731  unsigned SmallestType, WidestType;
4732  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4733  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4734 
4735  // Get the maximum safe dependence distance in bits computed by LAA.
4736  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4737  // the memory access that is most restrictive (involved in the smallest
4738  // dependence distance).
4739  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4740 
4741  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4742 
4743  unsigned MaxVectorSize = WidestRegister / WidestType;
4744 
4745  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4746  << " / " << WidestType << " bits.\n");
4747  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4748  << WidestRegister << " bits.\n");
4749 
4750  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4751  " into one vector!");
4752  if (MaxVectorSize == 0) {
4753  LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4754  MaxVectorSize = 1;
4755  return MaxVectorSize;
4756  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4757  isPowerOf2_32(ConstTripCount)) {
4758  // We need to clamp the VF to be the ConstTripCount. There is no point in
4759  // choosing a higher viable VF as done in the loop below.
4760  LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4761  << ConstTripCount << "\n");
4762  MaxVectorSize = ConstTripCount;
4763  return MaxVectorSize;
4764  }
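// For illustration (hypothetical target values): a 256-bit widest register and
// a 32-bit widest type give MaxVectorSize = 256 / 32 = 8; if the loop also has
// a known trip count of 4 (a power of two below 8), the branch above clamps
// the returned MaxVF to 4.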
4765 
4766  unsigned MaxVF = MaxVectorSize;
4767  if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
4768  (MaximizeBandwidth && !OptForSize)) {
4769  // Collect all viable vectorization factors larger than the default MaxVF
4770  // (i.e. MaxVectorSize).
4771  SmallVector<unsigned, 8> VFs;
4772  unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4773  for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4774  VFs.push_back(VS);
4775 
4776  // For each VF calculate its register usage.
4777  auto RUs = calculateRegisterUsage(VFs);
4778 
4779  // Select the largest VF which doesn't require more registers than existing
4780  // ones.
4781  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4782  for (int i = RUs.size() - 1; i >= 0; --i) {
4783  if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4784  MaxVF = VFs[i];
4785  break;
4786  }
4787  }
4788  if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4789  if (MaxVF < MinVF) {
4790  LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4791  << ") with target's minimum: " << MinVF << '\n');
4792  MaxVF = MinVF;
4793  }
4794  }
4795  }
4796  return MaxVF;
4797 }
4798 
4799 VectorizationFactor
4800 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4801  float Cost = expectedCost(1).first;
4802  const float ScalarCost = Cost;
4803  unsigned Width = 1;
4804  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4805 
4806  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4807  if (ForceVectorization && MaxVF > 1) {
4808  // Ignore scalar width, because the user explicitly wants vectorization.
4809  // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4810  // evaluation.
4811  Cost = std::numeric_limits<float>::max();
4812  }
4813 
4814  for (unsigned i = 2; i <= MaxVF; i *= 2) {
4815  // Notice that the vector loop needs to be executed fewer times, so
4816  // we need to divide the cost of the vector loops by the width of
4817  // the vector elements.
4818  VectorizationCostTy C = expectedCost(i);
4819  float VectorCost = C.first / (float)i;
4820  LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4821  << " costs: " << (int)VectorCost << ".\n");
4822  if (!C.second && !ForceVectorization) {
4823  LLVM_DEBUG(
4824  dbgs() << "LV: Not considering vector loop of width " << i
4825  << " because it will not generate any vector instructions.\n");
4826  continue;
4827  }
4828  if (VectorCost < Cost) {
4829  Cost = VectorCost;
4830  Width = i;
4831  }
4832  }
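// For illustration (hypothetical costs): with a scalar loop cost of 8, a VF=2
// cost of 10 and a VF=4 cost of 36, the per-lane costs are 10/2 = 5 and
// 36/4 = 9, so the loop above selects Width = 2 with Cost = 5.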
4833 
4834  if (!EnableCondStoresVectorization && NumPredStores) {
4835  ORE->emit(createMissedAnalysis("ConditionalStore")
4836  << "store that is conditionally executed prevents vectorization");
4837  LLVM_DEBUG(
4838  dbgs() << "LV: No vectorization. There are conditional stores.\n");
4839  Width = 1;
4840  Cost = ScalarCost;
4841  }
4842 
4843  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4844  << "LV: Vectorization seems to be not beneficial, "
4845  << "but was forced by a user.\n");
4846  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
4847  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
4848  return Factor;
4849 }
4850 
4851 std::pair<unsigned, unsigned>
4852 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4853  unsigned MinWidth = -1U;
4854  unsigned MaxWidth = 8;
4855  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4856 
4857  // For each block.
4858  for (BasicBlock *BB : TheLoop->blocks()) {
4859  // For each instruction in the loop.
4860  for (Instruction &I : BB->instructionsWithoutDebug()) {
4861  Type *T = I.getType();
4862 
4863  // Skip ignored values.
4864  if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
4865  continue;
4866 
4867  // Only examine Loads, Stores and PHINodes.
4868  if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4869  continue;
4870 
4871  // Examine PHI nodes that are reduction variables. Update the type to
4872  // account for the recurrence type.
4873  if (auto *PN = dyn_cast<PHINode>(&I)) {
4874  if (!Legal->isReductionVariable(PN))
4875  continue;
4876  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
4877  T = RdxDesc.getRecurrenceType();
4878  }
4879 
4880  // Examine the stored values.
4881  if (auto *ST = dyn_cast<StoreInst>(&I))
4882  T = ST->getValueOperand()->getType();
4883 
4884  // Ignore loaded pointer types and stored pointer types that are not
4885  // vectorizable.
4886  //
4887  // FIXME: The check here attempts to predict whether a load or store will
4888  // be vectorized. We only know this for certain after a VF has
4889  // been selected. Here, we assume that if an access can be
4890  // vectorized, it will be. We should also look at extending this
4891  // optimization to non-pointer types.
4892  //
4893  if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
4894  !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
4895  continue;
4896 
4897  MinWidth = std::min(MinWidth,
4898  (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4899  MaxWidth = std::max(MaxWidth,
4900  (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4901  }
4902  }
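// For illustration: a loop whose only relevant accesses load i8 values and
// store i32 values would report {8, 32} here, assuming no reduction phi
// widens the range further.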
4903 
4904  return {MinWidth, MaxWidth};
4905 }
4906 
4907 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
4908  unsigned VF,
4909  unsigned LoopCost) {
4910  // -- The interleave heuristics --
4911  // We interleave the loop in order to expose ILP and reduce the loop overhead.
4912  // There are many micro-architectural considerations that we can't predict
4913  // at this level. For example, frontend pressure (on decode or fetch) due to
4914  // code size, or the number and capabilities of the execution ports.
4915  //
4916  // We use the following heuristics to select the interleave count:
4917  // 1. If the code has reductions, then we interleave to break the cross
4918  // iteration dependency.
4919  // 2. If the loop is really small, then we interleave to reduce the loop
4920  // overhead.
4921  // 3. We don't interleave if we think that we will spill registers to memory
4922  // due to the increased register pressure.
4923 
4924  // When we optimize for size, we don't interleave.
4925  if (OptForSize)
4926  return 1;
4927 
4928  // If a maximum safe dependence distance constrains vectorization, it has already been used to limit the width; do not interleave on top of it.
4929  if (Legal->getMaxSafeDepDistBytes() != -1U)
4930  return 1;
4931 
4932  // Do not interleave loops with a relatively small trip count.
4933  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4934  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
4935  return 1;
4936 
4937  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
4938  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4939  << " registers\n");
4940 
4941  if (VF == 1) {
4942  if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4943  TargetNumRegisters = ForceTargetNumScalarRegs;
4944  } else {
4945  if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4946  TargetNumRegisters = ForceTargetNumVectorRegs;
4947  }
4948 
4949  RegisterUsage R = calculateRegisterUsage({VF})[0];
4950  // We divide by these constants so assume that we have at least one
4951  // instruction that uses at least one register.
4952  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
4953 
4954  // We calculate the interleave count using the following formula.
4955  // Subtract the number of loop invariants from the number of available
4956  // registers. These registers are used by all of the interleaved instances.
4957  // Next, divide the remaining registers by the number of registers that is
4958  // required by the loop, in order to estimate how many parallel instances
4959  // fit without causing spills. All of this is rounded down if necessary to be
4960  // a power of two. We want power of two interleave count to simplify any
4961  // addressing operations or alignment considerations.
4962  // We also want power of two interleave counts to ensure that the induction
4963  // variable of the vector loop wraps to zero, when tail is folded by masking;
4964  // this currently happens when OptForSize, in which case IC is set to 1 above.
4965  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
4966  R.MaxLocalUsers);
4967 
4968  // Don't count the induction variable as interleaved.
4969  if (EnableIndVarRegisterHeur)
4970  IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
4971  std::max(1U, (R.MaxLocalUsers - 1)));
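// For illustration (hypothetical numbers): with TargetNumRegisters = 16,
// R.LoopInvariantRegs = 2 and R.MaxLocalUsers = 3, the first formula gives
// PowerOf2Floor(14 / 3) = PowerOf2Floor(4) = 4, and the induction-variable
// variant gives PowerOf2Floor(13 / 2) = PowerOf2Floor(6) = 4, so IC = 4
// before the clamping below.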
4972 
4973  // Clamp the interleave ranges to reasonable counts.
4974  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4975 
4976  // Check if the user has overridden the max.
4977  if (VF == 1) {
4978  if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4979  MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4980  } else {
4981  if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4982  MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4983  }
4984 
4985  // If we did not calculate the cost for VF (because the user selected the VF)
4986  // then we calculate the cost of VF here.
4987  if (LoopCost == 0)
4988  LoopCost = expectedCost(VF).first;
4989 
4990  // Clamp the calculated IC to be between the 1 and the max interleave count
4991  // that the target allows.
4992  if (IC > MaxInterleaveCount)
4993  IC = MaxInterleaveCount;
4994  else if (IC < 1)
4995  IC = 1;
4996 
4997  // Interleave if we vectorized this loop and there is a reduction that could
4998  // benefit from interleaving.
4999  if (VF > 1 && !Legal->getReductionVars()->empty()) {
5000  LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5001  return IC;
5002  }
5003 
5004  // Note that if we've already vectorized the loop we will have done the
5005  // runtime check and so interleaving won't require further checks.
5006  bool InterleavingRequiresRuntimePointerCheck =
5007  (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5008 
5009  // We want to interleave small loops in order to reduce the loop overhead and
5010  // potentially expose ILP opportunities.
5011  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5012  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5013  // We assume that the cost overhead is 1 and we use the cost model
5014  // to estimate the cost of the loop and interleave until the cost of the
5015  // loop overhead is about 5% of the cost of the loop.
5016  unsigned SmallIC =
5017  std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
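// For illustration (hypothetical values): if SmallLoopCost were 20 and
// LoopCost 5, SmallIC = min(IC, PowerOf2Floor(20 / 5)) = min(IC, 4).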
5018 
5019  // Interleave until store/load ports (estimated by max interleave count) are
5020  // saturated.
5021  unsigned NumStores = Legal->getNumStores();
5022  unsigned NumLoads = Legal->getNumLoads();
5023  unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5024  unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5025 
5026  // If we have a scalar reduction (vector reductions are already dealt with
5027  // by this point), we can increase the critical path length if the loop
5028  // we're interleaving is inside another loop. Limit, by default to 2, so the
5029  // critical path only gets increased by one reduction operation.
5030  if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5031  unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5032  SmallIC = std::min(SmallIC, F);
5033  StoresIC = std::min(StoresIC, F);
5034  LoadsIC = std::min(LoadsIC, F);
5035  }
5036 
5037  if (EnableLoadStoreRuntimeInterleave &&
5038  std::max(StoresIC, LoadsIC) > SmallIC) {
5039  LLVM_DEBUG(
5040  dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5041  return std::max(StoresIC, LoadsIC);
5042  }
5043 
5044  LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5045  return SmallIC;
5046  }
5047 
5048  // Interleave if this is a large loop (small loops are already dealt with by
5049  // this point) that could benefit from interleaving.
5050  bool HasReductions = !Legal->getReductionVars()->empty();
5051  if (TTI.enableAggressiveInterleaving(HasReductions)) {
5052  LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5053  return IC;
5054  }
5055 
5056  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5057  return 1;
5058 }
5059 
5060 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5061 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5062  // This function calculates the register usage by measuring the highest number
5063  // of values that are alive at a single location. Obviously, this is a very
5064  // rough estimation. We scan the loop in topological order and
5065  // assign a number to each instruction. We use RPO to ensure that defs are
5066  // met before their users. We assume that each instruction that has in-loop
5067  // users starts an interval. We record every time that an in-loop value is
5068  // used, so we have a list of the first and last occurrences of each
5069  // instruction. Next, we transpose this data structure into a multi map that
5070  // holds the list of intervals that *end* at a specific location. This multi
5071  // map allows us to perform a linear search. We scan the instructions linearly
5072  // and record each time that a new interval starts, by placing it in a set.
5073  // If we find this value in the multi-map then we remove it from the set.
5074  // The max register usage is the maximum size of the set.
5075  // We also search for instructions that are defined outside the loop, but are
5076  // used inside the loop. We need this number separately from the max-interval
5077  // usage number because when we unroll, loop-invariant values do not take
5078  // more registers.
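// Rough illustration (hypothetical body, not from the source): for
// a = load p; b = load q; c = a + b; store c, both loads are still live when
// the add is reached, so the maximum number of open intervals is 2, while
// instructions defined outside the loop but used inside are tallied
// separately as loop-invariant usage.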
5079  LoopBlocksDFS DFS(TheLoop);
5080  DFS.perform(LI);
5081 
5082  RegisterUsage RU;
5083 
5084  // Each 'key' in the map opens a new interval. The values
5085  // of the map are the index of the 'last seen' usage of the
5086  // instruction that is the key.
5087  using IntervalMap = DenseMap<Instruction *, unsigned>;
5088
5089  // Maps instruction to its index.
5090  SmallVector<Instruction *, 64> IdxToInstr;
5091  // Marks the end of each interval.
5092  IntervalMap EndPoint;
5093  // Saves the list of instruction indices that are used in the loop.
5094  SmallPtrSet<Instruction *, 8> Ends;
5095  // Saves the list of values that are used in the loop but are
5096  // defined outside the loop, such as arguments and constants.
5097  SmallPtrSet<Value *, 8> LoopInvariants;
5098 
5099  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5100  for (Instruction &I : BB->instructionsWithoutDebug()) {
5101  IdxToInstr.push_back(&I);
5102 
5103  // Save the end location of each USE.
5104  for (Value *U : I.operands()) {
5105  auto *Instr = dyn_cast<Instruction>(U);
5106 
5107  // Ignore non-instruction values such as arguments, constants, etc.
5108  if (!Instr)
5109  continue;
5110 
5111  // If this instruction is outside the loop then record it and continue.
5112  if (!TheLoop->contains(Instr)) {
5113  LoopInvariants.insert(Instr);
5114  continue;
5115  }
5116 
5117  // Overwrite previous end points.
5118  EndPoint[Instr] = IdxToInstr.size();
5119  Ends.insert(Instr);
5120  }
5121  }
5122  }
5123 
5124  // Saves the list of intervals that end with the index in 'key'.
5125  using InstrList = SmallVector<Instruction *, 2>;
5126  DenseMap<unsigned, InstrList> TransposeEnds;
5127 
5128  // Transpose the EndPoints to a list of values that end at each index.
5129  for (auto &Interval : EndPoint)
5130  TransposeEnds[Interval.second].push_back(Interval.first);
5131 
5132  SmallPtrSet<Instruction *, 8> OpenIntervals;
5133 
5134  // Get the size of the widest register.
5135  unsigned MaxSafeDepDist = -1U;
5136  if (Legal->getMaxSafeDepDistBytes() != -1U)
5137  MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5138  unsigned WidestRegister =
5139  std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5140  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5141 
5141
5142  SmallVector<RegisterUsage, 8> RUs(VFs.size());
5143  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
5144 
5145  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5146 
5147  // A lambda that gets the register usage for the given type and VF.
5148  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5149  if (Ty->isTokenTy())
5150  return 0U;
5151  unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5152  return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5153  };
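// For illustration (hypothetical values): with a 128-bit widest register, an
// i32 value at VF = 4 occupies max(1, 4 * 32 / 128) = 1 register, and at
// VF = 8 it occupies 8 * 32 / 128 = 2 registers.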
5154 
5155  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5156  Instruction *I = IdxToInstr[i];
5157 
5158  // Remove all of the instructions that end at this location.
5159  InstrList &List = TransposeEnds[i];
5160  for (Instruction *ToRemove : List)
5161  OpenIntervals.erase(ToRemove);
5162 
5163  // Ignore instructions that are never used within the loop.
5164  if (Ends.find(I) == Ends.end())
5165  continue;
5166 
5167  // Skip ignored values.
5168  if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5169  continue;
5170 
5171  // For each VF find the maximum usage of registers.
5172  for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5173  if (VFs[j] == 1) {
5174  MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
5175  continue;
5176  }
5177  collectUniformsAndScalars(VFs[j]);
5178  // Count the number of live intervals.
5179  unsigned RegUsage = 0;
5180  for (auto Inst : OpenIntervals) {
5181  // Skip ignored values for VF > 1.
5182  if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5183  isScalarAfterVectorization(Inst, VFs[j]))
5184  continue;
5185  RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5186  }
5187  MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5188  }
5189 
5190  LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5191  << OpenIntervals.size() << '\n');
5192 
5193  // Add the current instruction to the list of open intervals.
5194  OpenIntervals.insert(I);
5195  }
5196 
5197  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5198  unsigned Invariant = 0;
5199  if (VFs[i] == 1)
5200  Invariant = LoopInvariants.size();
5201  else {
5202  for (auto Inst : LoopInvariants)
5203  Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5204  }
5205 
5206  LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5207  LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5208  LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5209  << '\n');
5210 
5211  RU.LoopInvariantRegs = Invariant;
5212  RU.MaxLocalUsers = MaxUsages[i];
5213  RUs[i] = RU;
5214  }
5215 
5216  return RUs;
5217 }
5218 
5219 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5220  // TODO: Cost model for emulated masked load/store is completely
5221  // broken. This hack guides the cost model to use an artificially
5222  // high enough value to practically disable vectorization with such
5223  // operations, except where previously deployed legality hack allowed
5224  // using very low cost values. This is to avoid regressions coming simply
5225  // from moving "masked load/store" check from legality to cost model.
5226  // Masked Load/Gather emulation was previously never allowed.
5227  // Limited number of Masked Store/Scatter emulation was allowed.
5228  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5229  return isa<LoadInst>(I) ||
5230  (isa<StoreInst>(I) &&
5231  NumPredStores > NumberOfStoresToPredicate);
5232 }
5233 
5234 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5235  // If we aren't vectorizing the loop, or if we've already collected the
5236  // instructions to scalarize, there's nothing to do. Collection may already
5237  // have occurred if we have a user-selected VF and are now computing the
5238  // expected cost for interleaving.
5239  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5240  return;
5241 
5242  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5243  // not profitable to scalarize any instructions, the presence of VF in the
5244  // map will indicate that we've analyzed it already.
5245  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5246 
5247  // Find all the instructions that are scalar with predication in the loop and
5248  // determine if it would be better to not if-convert the blocks they are in.
5249  // If so, we also record the instructions to scalarize.
5250  for (BasicBlock *BB : TheLoop->blocks()) {
5251  if (!blockNeedsPredication(BB))
5252  continue;
5253  for (Instruction &I : *BB)
5254  if (isScalarWithPredication(&I)) {
5255  ScalarCostsTy ScalarCosts;
5256  // Do not apply discount logic if hacked cost is needed
5257  // for emulated masked memrefs.
5258  if (!useEmulatedMaskMemRefHack(&I) &&
5259  computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5260  ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5261  // Remember that BB will remain after vectorization.
5262  PredicatedBBsAfterVectorization.insert(BB);
5263  }
5264  }
5265 }
5266 
5267 int LoopVectorizationCostModel::computePredInstDiscount(
5268  Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5269  unsigned VF) {
5270  assert(!isUniformAfterVectorization(PredInst, VF) &&
5271  "Instruction marked uniform-after-vectorization will be predicated");
5272 
5273  // Initialize the discount to zero, meaning that the scalar version and the
5274  // vector version cost the same.
5275  int Discount = 0;
5276 
5277  // Holds instructions to analyze. The instructions we visit are mapped in
5278  // ScalarCosts. Those instructions are the ones that would be scalarized if
5279  // we find that the scalar version costs less.
5280  SmallVector<Instruction *, 8> Worklist;
5281
5282  // Returns true if the given instruction can be scalarized.
5283  auto canBeScalarized = [&](Instruction *I) -> bool {
5284  // We only attempt to scalarize instructions forming a single-use chain
5285  // from the original predicated block that would otherwise be vectorized.
5286  // Although not strictly necessary, we give up on instructions we know will
5287  // already be scalar to avoid traversing chains that are unlikely to be
5288  // beneficial.
5289  if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5290  isScalarAfterVectorization(I, VF))
5291  return false;
5292 
5293  // If the instruction is scalar with predication, it will be analyzed
5294  // separately. We ignore it within the context of PredInst.
5295  if (isScalarWithPredication(I))
5296  return false;
5297 
5298  // If any of the instruction's operands are uniform after vectorization,
5299  // the instruction cannot be scalarized. This prevents, for example, a
5300  // masked load from being scalarized.
5301  //
5302  // We assume we will only emit a value for lane zero of an instruction
5303  // marked uniform after vectorization, rather than VF identical values.
5304  // Thus, if we scalarize an instruction that uses a uniform, we would
5305  // create uses of values corresponding to the lanes we aren't emitting code
5306  // for. This behavior can be changed by allowing getScalarValue to clone
5307  // the lane zero values for uniforms rather than asserting.
5308  for (Use &U : I->operands())
5309  if (auto *J = dyn_cast<Instruction>(U.get()))
5310  if (isUniformAfterVectorization(J, VF))
5311  return false;
5312 
5313  // Otherwise, we can scalarize the instruction.
5314  return true;
5315  };
5316 
5317  // Returns true if an operand that cannot be scalarized must be extracted
5318  // from a vector. We will account for this scalarization overhead below. Note
5319  // that the non-void predicated instructions are placed in their own blocks,
5320  // and their return values are inserted into vectors. Thus, an extract would
5321  // still be required.
5322  auto needsExtract = [&](Instruction *I) -> bool {
5323  return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
5324  };
5325 
5326  // Compute the expected cost discount from scalarizing the entire expression
5327  // feeding the predicated instruction. We currently only consider expressions
5328  // that are single-use instruction chains.
5329  Worklist.push_back(PredInst);
5330  while (!Worklist.empty()) {
5331  Instruction *I = Worklist.pop_back_val();
5332 
5333  // If we've already analyzed the instruction, there's nothing to do.
5334  if (ScalarCosts.find(I) != ScalarCosts.end())
5335  continue;
5336 
5337  // Compute the cost of the vector instruction. Note that this cost already
5338  // includes the scalarization overhead of the predicated instruction.
5339  unsigned VectorCost = getInstructionCost(I, VF).first;
5340 
5341  // Compute the cost of the scalarized instruction. This cost is the cost of
5342  // the instruction as if it wasn't if-converted and instead remained in the
5343  // predicated block. We will scale this cost by block probability after
5344  // computing the scalarization overhead.
5345  unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5346 
5347  // Compute the scalarization overhead of needed insertelement instructions
5348  // and phi nodes.
5349  if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5350  ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5351  true, false);
5352  ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5353  }
5354 
5355  // Compute the scalarization overhead of needed extractelement
5356  // instructions. For each of the instruction's operands, if the operand can
5357  // be scalarized, add it to the worklist; otherwise, account for the
5358  // overhead.
5359  for (Use &U : I->operands())
5360  if (auto *J = dyn_cast<Instruction>(U.get())) {
5361  assert(VectorType::isValidElementType(J->getType()) &&
5362  "Instruction has non-scalar type");
5363  if (canBeScalarized(J))
5364  Worklist.push_back(J);
5365  else if (needsExtract(J))
5366  ScalarCost += TTI.getScalarizationOverhead(
5367  ToVectorTy(J->getType(),VF), false, true);
5368  }
5369 
5370  // Scale the total scalar cost by block probability.
5371  ScalarCost /= getReciprocalPredBlockProb();
5372 
5373  // Compute the discount. A non-negative discount means the vector version
5374  // of the instruction costs more, and scalarizing would be beneficial.
5375  Discount += VectorCost - ScalarCost;
5376  ScalarCosts[I] = ScalarCost;
5377  }
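// For illustration (hypothetical costs, ignoring insert/extract overhead): at
// VF = 4, if the vector form of an instruction costs 8 and its scalar form
// costs 2 per lane, ScalarCost starts at 4 * 2 = 8 and becomes 8 / 2 = 4 after
// scaling by the reciprocal block probability (2 if predicated blocks are
// assumed to execute half the time), so this instruction contributes a
// discount of 8 - 4 = 4 in favor of scalarization.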
5378 
5379  return Discount;
5380 }
5381 
5382 LoopVectorizationCostModel::VectorizationCostTy
5383 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5384  VectorizationCostTy Cost;
5385 
5386  // For each block.
5387  for (BasicBlock *BB : TheLoop->blocks()) {
5388  VectorizationCostTy BlockCost;
5389 
5390  // For each instruction in the old loop.
5391  for (Instruction &I : BB->instructionsWithoutDebug()) {
5392  // Skip ignored values.
5393  if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5394  (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5395  continue;
5396 
5397  VectorizationCostTy C = getInstructionCost(&I, VF);
5398 
5399  // Check if we should override the cost.
5400  if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5401  C.first = ForceTargetInstructionCost;
5402 
5403  BlockCost.first += C.first;
5404  BlockCost.second |= C.second;
5405  LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5406  << " for VF " << VF << " For instruction: " << I
5407  << '\n');
5408  }
5409 
5410  // If we are vectorizing a predicated block, it will have been
5411  // if-converted. This means that the block's instructions (aside from
5412  // stores and instructions that may divide by zero) will now be
5413  // unconditionally executed. For the scalar case, we may not always execute
5414  // the predicated block. Thus, scale the block's cost by the probability of
5415  // executing it.
5416  if (VF == 1 && blockNeedsPredication(BB))
5417  BlockCost.first /= getReciprocalPredBlockProb();
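// For illustration: a predicated block whose scalar instructions sum to 10
// would contribute 10 / 2 = 5 here, assuming a reciprocal block probability
// of 2 (i.e. the block is expected to execute every other iteration).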
5418 
5419  Cost.first += BlockCost.first;
5420  Cost.second |= BlockCost.second;
5421  }
5422 
5423  return Cost;
5424 }
5425 
5426 /// Gets Address Access SCEV after verifying that the access pattern
5427 /// is loop invariant except the induction variable dependence.
5428 ///
5429 /// This SCEV can be sent to the Target in order to estimate the address
5430 /// calculation cost.
5431 static const SCEV *getAddressAccessSCEV(
5432  Value *Ptr,
5433  LoopVectorizationLegality *Legal,
5434  PredicatedScalarEvolution &PSE,
5435  const Loop *TheLoop) {
5436 
5437  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5438  if (!Gep)
5439  return nullptr;
5440 
5441  // We are looking for a gep with all loop invariant indices except for one
5442  // which should be an induction variable.
5443  auto SE = PSE.getSE();
5444  unsigned NumOperands = Gep->getNumOperands();
5445  for (unsigned i = 1; i < NumOperands; ++i) {
5446  Value *Opd = Gep->getOperand(i);
5447  if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5448  !Legal->isInductionVariable(Opd))
5449  return nullptr;
5450  }
5451 
5452  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5453  return PSE.getSCEV(Ptr);
5454 }
5455 
5456 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5457  return Legal->hasStride(I->getOperand(0)) ||
5458  Legal->hasStride(I->getOperand(1));
5459 }
5460 
5461 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5462  unsigned VF) {
5463  assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5464  Type *ValTy = getMemInstValueType(I);
5465  auto SE = PSE.getSE();
5466 
5467  unsigned Alignment = getLoadStoreAlignment(I);
5468  unsigned AS = getLoadStoreAddressSpace(I);
5469  Value *Ptr = getLoadStorePointerOperand(I);
5470  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5471 
5472  // Figure out whether the access is strided and get the stride value
5473  // if it's known in compile time
5474  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5475 
5476  // Get the cost of the scalar memory instruction and address computation.
5477  unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5478 
5479  // Don't pass *I here, since it is scalar but will actually be part of a
5480  // vectorized loop where the user of it is a vectorized instruction.
5481  Cost += VF *
5482  TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5483  AS);
5484 
5485  // Get the overhead of the extractelement and insertelement instructions
5486  // we might create due to scalarization.
5487  Cost += getScalarizationOverhead(I, VF, TTI);
5488 
5489  // If we have a predicated store, it may not be executed for each vector
5490  // lane. Scale the cost by the probability of executing the predicated
5491  // block.
5492  if (isPredicatedInst(I)) {
5493  Cost /= getReciprocalPredBlockProb();
5494 
5495  if (useEmulatedMaskMemRefHack(I))
5496  // Artificially setting to a high enough value to practically disable
5497  // vectorization with such operations.
5498  Cost = 3000000;
5499  }
5500 
5501  return Cost;
5502 }
5503 
5504 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5505  unsigned VF) {
5506  Type *ValTy = getMemInstValueType(I);
5507  Type *VectorTy = ToVectorTy(ValTy, VF);
5508  unsigned Alignment = getLoadStoreAlignment(I);
5509  Value *Ptr = getLoadStorePointerOperand(I);
5510  unsigned AS = getLoadStoreAddressSpace(I);
5511  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5512 
5513  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5514  "Stride should be 1 or -1 for consecutive memory access");
5515  unsigned Cost = 0;
5516  if (Legal->isMaskRequired(I))
5517  Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5518  else
5519  Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5520 
5521  bool Reverse = ConsecutiveStride < 0;
5522  if (Reverse)
5523  Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5524  return Cost;
5525 }
5526 
5527 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5528  unsigned VF) {
5529  Type *ValTy = getMemInstValueType(I);
5530  Type *VectorTy = ToVectorTy(ValTy, VF);
5531  unsigned Alignment = getLoadStoreAlignment(I);
5532  unsigned AS = getLoadStoreAddressSpace(I);
5533  if (isa<LoadInst>(I)) {
5534  return TTI.getAddressComputationCost(ValTy) +
5535  TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5536  TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5537  }
5538  StoreInst *SI = cast<StoreInst>(I);
5539 
5540  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5541  return TTI.getAddressComputationCost(ValTy) +
5542  TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5543  (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5544  Instruction::ExtractElement,
5545  VectorTy, VF - 1));
5546 }
5547 
5548 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5549  unsigned VF) {
5550  Type *ValTy = getMemInstValueType(I);
5551  Type *VectorTy = ToVectorTy(ValTy, VF);
5552  unsigned Alignment = getLoadStoreAlignment(I);
5553  Value *Ptr = getLoadStorePointerOperand(I);
5554
5555  return TTI.getAddressComputationCost(VectorTy) +
5556  TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5557  Legal->isMaskRequired(I), Alignment);
5558 }
5559 
5560 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5561  unsigned VF) {
5562  Type *ValTy = getMemInstValueType(I);
5563  Type *VectorTy = ToVectorTy(ValTy, VF);
5564  unsigned AS = getLoadStoreAddressSpace(I);
5565 
5566  auto Group = getInterleavedAccessGroup(I);
5567  assert(Group && "Fail to get an interleaved access group.");
5568 
5569  unsigned InterleaveFactor = Group->getFactor();
5570  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5571 
5572  // Holds the indices of existing members in an interleaved load group.
5573  // An interleaved store group doesn't need this as it doesn't allow gaps.
5574  SmallVector<unsigned, 4> Indices;
5575  if (isa<LoadInst>(I)) {
5576  for (unsigned i = 0; i < InterleaveFactor; i++)
5577  if (Group->getMember(i))
5578  Indices.push_back(i);
5579  }
5580 
5581  // Calculate the cost of the whole interleaved group.
5582  bool UseMaskForGaps =
5583  Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
5584  unsigned Cost = TTI.getInterleavedMemoryOpCost(
5585  I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5586  Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5587 
5588  if (Group->isReverse()) {
5589  // TODO: Add support for reversed masked interleaved access.
5590  assert(!Legal->isMaskRequired(I) &&
5591  "Reverse masked interleaved access not supported.");
5592  Cost += Group->getNumMembers() *
5593  TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5594  }
5595  return Cost;
5596 }
5597 
5598 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5599  unsigned VF) {
5600  // Calculate scalar cost only. Vectorization cost should be ready at this
5601  // moment.
5602  if (VF == 1) {
5603  Type *ValTy = getMemInstValueType(I);
5604  unsigned Alignment = getLoadStoreAlignment(I);
5605  unsigned AS = getLoadStoreAddressSpace(I);
5606 
5607  return TTI.getAddressComputationCost(ValTy) +
5608  TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5609  }
5610  return getWideningCost(I, VF);
5611 }
5612 
5613 LoopVectorizationCostModel::VectorizationCostTy
5614 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5615  // If we know that this instruction will remain uniform, check the cost of
5616  // the scalar version.
5617  if (isUniformAfterVectorization(I, VF))
5618  VF = 1;
5619 
5620  if (VF > 1 && isProfitableToScalarize(I, VF))
5621  return VectorizationCostTy(InstsToScalarize[VF][I], false);
5622 
5623  // Forced scalars do not have any scalarization overhead.
5624  auto ForcedScalar = ForcedScalars.find(VF);
5625  if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5626  auto InstSet = ForcedScalar->second;
5627  if (InstSet.find(I) != InstSet.end())
5628  return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5629  }
5630 
5631  Type *VectorTy;
5632  unsigned C = getInstructionCost(I, VF, VectorTy);
5633 
5634  bool TypeNotScalarized =
5635  VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5636  return VectorizationCostTy(C, TypeNotScalarized);
5637 }
5638 
5639 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5640  if (VF == 1)
5641  return;
5642  NumPredStores = 0;
5643  for (BasicBlock *BB : TheLoop->blocks()) {
5644  // For each instruction in the old loop.
5645  for (Instruction &I : *BB) {
5646  Value *Ptr = getLoadStorePointerOperand(&I);
5647  if (!Ptr)
5648  continue;
5649 
5650  // TODO: We should generate better code and update the cost model for
5651  // predicated uniform stores. Today they are treated as any other
5652  // predicated store (see added test cases in
5653  // invariant-store-vectorization.ll).
5654  if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5655  NumPredStores++;
5656 
5657  if (Legal->isUniform(Ptr) &&
5658  // Conditional loads and stores should be scalarized and predicated.
5659  // isScalarWithPredication cannot be used here since masked
5660  // gather/scatters are not considered scalar with predication.
5661  !Legal->blockNeedsPredication(I.getParent())) {
5662  // TODO: Avoid replicating loads and stores instead of
5663  // relying on instcombine to remove them.
5664  // Load: Scalar load + broadcast
5665  // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5666  unsigned Cost = getUniformMemOpCost(&I, VF);
5667  setWideningDecision(&I, VF, CM_Scalarize, Cost);
5668  continue;
5669  }
5670 
5671  // We assume that widening is the best solution when possible.
5672  if (memoryInstructionCanBeWidened(&I, VF)) {
5673  unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5674  int ConsecutiveStride =
5675  Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5676  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5677  "Expected consecutive stride.");
5678  InstWidening Decision =
5679  ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5680  setWideningDecision(&I, VF, Decision, Cost);
5681  continue;
5682  }
5683 
5684  // Choose between Interleaving, Gather/Scatter or Scalarization.
5685  unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5686  unsigned NumAccesses = 1;
5687  if (isAccessInterleaved(&I)) {
5688  auto Group = getInterleavedAccessGroup(&I);
5689  assert(Group && "Fail to get an interleaved access group.");
5690 
5691  // Make one decision for the whole group.
5692  if (getWideningDecision(&I, VF) != CM_Unknown)
5693  continue;
5694 
5695  NumAccesses = Group->getNumMembers();
5696  if (interleavedAccessCanBeWidened(&I, VF))
5697  InterleaveCost = getInterleaveGroupCost(&I, VF);
5698  }
5699 
5700  unsigned GatherScatterCost =
5701  isLegalGatherOrScatter(&I)
5702  ? getGatherScatterCost(&I, VF) * NumAccesses
5703  : std::numeric_limits<unsigned>::max();
5704
5705  unsigned ScalarizationCost =
5706  getMemInstScalarizationCost(&I, VF) * NumAccesses;
5707 
5708  // Choose better solution for the current VF,
5709  // write down this decision and use it during vectorization.
5710  unsigned Cost;
5711  InstWidening Decision;
5712  if (InterleaveCost <= GatherScatterCost &&
5713  InterleaveCost < ScalarizationCost) {
5714  Decision = CM_Interleave;
5715  Cost = InterleaveCost;
5716  } else if (GatherScatterCost < ScalarizationCost) {
5717  Decision = CM_GatherScatter;
5718  Cost = GatherScatterCost;
5719  } else {
5720  Decision = CM_Scalarize;
5721  Cost = ScalarizationCost;
5722  }
5723  // If the instruction belongs to an interleave group, the whole group
5724  // receives the same decision. The whole group receives the cost, but
5725  // the cost will actually be assigned to one instruction.
5726  if (auto Group = getInterleavedAccessGroup(&I))
5727  setWideningDecision(Group, VF, Decision, Cost);
5728  else
5729  setWideningDecision(&I, VF, Decision, Cost);
5730  }
5731  }
5732 
5733  // Make sure that any load of address and any other address computation
5734  // remains scalar unless there is gather/scatter support. This avoids
5735  // inevitable extracts into address registers, and also has the benefit of
5736  // activating LSR more, since that pass can't optimize vectorized
5737  // addresses.
5738  if (TTI.prefersVectorizedAddressing())
5739  return;
5740 
5741  // Start with all scalar pointer uses.
5742  SmallPtrSet<Instruction *, 8> AddrDefs;
5743  for (BasicBlock *BB : TheLoop->blocks())
5744  for (Instruction &I : *BB) {
5745  Instruction *PtrDef =
5746  dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5747  if (PtrDef && TheLoop->contains(PtrDef) &&
5748  getWideningDecision(&I, VF) != CM_GatherScatter)
5749  AddrDefs.insert(PtrDef);
5750  }
5751 
5752  // Add all instructions used to generate the addresses.
5753  SmallVector<Instruction *, 4> Worklist;
5754  for (auto *I : AddrDefs)
5755  Worklist.push_back(I);
5756  while (!Worklist.empty()) {
5757  Instruction *I = Worklist.pop_back_val();
5758  for (auto &Op : I->operands())
5759  if (auto *InstOp = dyn_cast<Instruction>(Op))
5760  if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5761  AddrDefs.insert(InstOp).second)
5762  Worklist.push_back(InstOp);
5763  }
5764 
5765  for (auto *I : AddrDefs) {
5766  if (isa<LoadInst>(I)) {
5767  // Setting the desired widening decision should ideally be handled by
5768  // cost functions, but since this involves the task of finding out
5769  // if the loaded register is involved in an address computation, it is
5770  // instead changed here when we know this is the case.
5771  InstWidening Decision = getWideningDecision(I, VF);
5772  if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5773  // Scalarize a widened load of address.
5774  setWideningDecision(I, VF, CM_Scalarize,
5775  (VF * getMemoryInstructionCost(I, 1)));
5776  else if (auto Group = getInterleavedAccessGroup(I)) {
5777  // Scalarize an interleave group of address loads.
5778  for (unsigned I = 0; I < Group->getFactor(); ++I) {
5779  if (Instruction *Member = Group->getMember(I))
5780  setWideningDecision(Member, VF, CM_Scalarize,
5781  (VF * getMemoryInstructionCost(Member, 1)));
5782  }
5783  }
5784  } else
5785  // Make sure I gets scalarized and receives a cost estimate without
5786  // scalarization overhead.
5787  ForcedScalars[VF].insert(I);
5788  }
5789 }
5790 
5791 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
5792  unsigned VF,
5793  Type *&VectorTy) {
5794  Type *RetTy = I->getType();
5795  if (canTruncateToMinimalBitwidth(I, VF))
5796  RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5797  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
5798  auto SE = PSE.getSE();
5799 
5800  // TODO: We need to estimate the cost of intrinsic calls.
5801  switch (I->getOpcode()) {
5802  case Instruction::GetElementPtr:
5803  // We mark this instruction as zero-cost because the cost of GEPs in
5804  // vectorized code depends on whether the corresponding memory instruction
5805  // is scalarized or not. Therefore, we handle GEPs with the memory
5806  // instruction cost.
5807  return 0;
5808  case Instruction::Br: {
5809  // In cases of scalarized and predicated instructions, there will be VF
5810  // predicated blocks in the vectorized loop. Each branch around these
5811  // blocks also requires an extract of its vector compare i1 element.
5812  bool ScalarPredicatedBB = false;
5813  BranchInst *BI = cast<BranchInst>(I);
5814  if (VF > 1 && BI->isConditional() &&
5815  (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
5816  PredicatedBBsAfterVectorization.end() ||
5817  PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
5818  PredicatedBBsAfterVectorization.end()))
5819  ScalarPredicatedBB = true;
5820 
5821  if (ScalarPredicatedBB) {
5822  // Return cost for branches around scalarized and predicated blocks.
5823  Type *Vec_i1Ty =
5824  VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
5825  return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
5826  (TTI.getCFInstrCost(Instruction::Br) * VF));
5827  } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
5828  // The back-edge branch will remain, as will all scalar branches.
5829  return TTI.getCFInstrCost(Instruction::Br);
5830  else
5831  // This branch will be eliminated by if-conversion.
5832  return 0;
5833  // Note: We currently assume zero cost for an unconditional branch inside
5834  // a predicated block since it will become a fall-through, although we
5835  // may decide in the future to call TTI for all branches.
5836  }
5837  case Instruction::PHI: {
5838  auto *Phi = cast<PHINode>(I);
5839 
5840  // First-order recurrences are replaced by vector shuffles inside the loop.
5841  // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
5842  if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
5843  return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
5844  VectorTy, VF - 1, VectorType::get(RetTy, 1));
5845 
5846  // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
5847  // converted into select instructions. We require N - 1 selects per phi
5848  // node, where N is the number of incoming values.
5849  if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
5850  return (Phi->getNumIncomingValues() - 1) *
5851  TTI.getCmpSelInstrCost(
5852  Instruction::Select, ToVectorTy(Phi->getType(), VF),
5853  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
5854 
5855  return TTI.getCFInstrCost(Instruction::PHI);
5856  }
5857  case Instruction::UDiv:
5858  case Instruction::SDiv:
5859  case Instruction::URem:
5860  case Instruction::SRem:
5861  // If we have a predicated instruction, it may not be executed for each
5862  // vector lane. Get the scalarization cost and scale this amount by the
5863  // probability of executing the predicated block. If the instruction is not
5864  // predicated, we fall through to the next case.
5865  if (VF > 1 && isScalarWithPredication(I)) {
5866  unsigned Cost = 0;
5867 
5868  // These instructions have a non-void type, so account for the phi nodes
5869  // that we will create. This cost is likely to be zero. The phi node
5870  // cost, if any, should be scaled by the block probability because it
5871  // models a copy at the end of each predicated block.
5872  Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
5873 
5874  // The cost of the non-predicated instruction.
5875  Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
5876 
5877  // The cost of insertelement and extractelement instructions needed for
5878  // scalarization.
5879  Cost += getScalarizationOverhead(I, VF, TTI);
5880 
5881  // Scale the cost by the probability of executing the predicated blocks.
5882  // This assumes the predicated block for each vector lane is equally
5883  // likely.
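  // getReciprocalPredBlockProb() models how often the predicated block is
  // expected to execute (assumed to be once every 2 iterations), so the
  // accumulated cost above is divided by that factor.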
5884  return Cost / getReciprocalPredBlockProb();
5885  }
5886  LLVM_FALLTHROUGH;
5887  case Instruction::Add:
5888  case Instruction::FAdd:
5889  case Instruction::Sub:
5890  case Instruction::FSub:
5891  case Instruction::Mul:
5892  case Instruction::FMul:
5893  case Instruction::FDiv:
5894  case Instruction::FRem:
5895  case Instruction::Shl:
5896  case Instruction::LShr:
5897  case Instruction::AShr:
5898  case Instruction::And:
5899  case Instruction::Or:
5900  case Instruction::Xor: {
5901  // Since we will replace the stride by 1 the multiplication should go away.
5902  if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
5903  return 0;
5904  // Certain instructions can be cheaper to vectorize if they have a constant
5905  // second vector operand. One example of this is shifts on x86.
5906  Value *Op2 = I->getOperand(1);
5907  TargetTransformInfo::OperandValueProperties Op2VP;
5908  TargetTransformInfo::OperandValueKind Op2VK =
5909  TTI.getOperandInfo(Op2, Op2VP);
5910  if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
5911  Op2VK = TargetTransformInfo::OK_UniformValue;
5912 
5913  SmallVector<const Value *, 4> Operands(I->operand_values());
5914  unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
5915  return N * TTI.getArithmeticInstrCost(
5916  I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
5917  Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
5918  }
5919  case Instruction::Select: {
5920  SelectInst *SI = cast<SelectInst>(I);
5921  const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
5922  bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
5923  Type *CondTy = SI->getCondition()->getType();
5924  if (!ScalarCond)
5925  CondTy = VectorType::get(CondTy, VF);
5926 
5927  return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
5928  }
5929  case Instruction::ICmp:
5930  case Instruction::FCmp: {
5931  Type *ValTy = I->getOperand(0)->getType();
5932  Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
5933  if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
5934  ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
5935  VectorTy = ToVectorTy(ValTy, VF);
5936  return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
5937  }
5938  case Instruction::Store:
5939  case Instruction::Load: {
5940  unsigned Width = VF;
5941  if (Width > 1) {
5942  InstWidening Decision = getWideningDecision(I, Width);
5943  assert(Decision != CM_Unknown &&
5944  "CM decision should be taken at this point");
5945  if (Decision == CM_Scalarize)
5946  Width = 1;
5947  }
5948  VectorTy = ToVectorTy(getMemInstValueType(I), Width);
5949  return getMemoryInstructionCost(I, VF);
5950  }
5951  case Instruction::ZExt:
5952  case Instruction::SExt:
5953  case Instruction::FPToUI:
5954  case Instruction::FPToSI:
5955  case Instruction::FPExt:
5956  case Instruction::PtrToInt:
5957  case Instruction::IntToPtr:
5958  case Instruction::SIToFP:
5959  case Instruction::UIToFP:
5960  case Instruction::Trunc:
5961  case Instruction::FPTrunc:
5962  case Instruction::BitCast: {
5963  // We optimize the truncation of induction variables having constant
5964  // integer steps. The cost of these truncations is the same as the scalar
5965  // operation.
5966  if (isOptimizableIVTruncate(I, VF)) {
5967  auto *Trunc = cast<TruncInst>(I);
5968  return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
5969  Trunc->getSrcTy(), Trunc);
5970  }
5971 
5972  Type *SrcScalarTy = I->getOperand(0)->getType();
5973  Type *SrcVecTy =
5974  VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
5975  if (canTruncateToMinimalBitwidth(I, VF)) {
5976  // This cast is going to be shrunk. This may remove the cast or it might
5977  // turn it into a slightly different cast. For example, if MinBW == 16,
5978  // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
5979  //
5980  // Calculate the modified src and dest types.
5981  Type *MinVecTy = VectorTy;
5982  if (I->getOpcode() == Instruction::Trunc) {
5983  SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
5984  VectorTy =
5985  largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
5986  } else if (I->getOpcode() == Instruction::ZExt ||
5987  I->getOpcode() == Instruction::SExt) {
5988  SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
5989  VectorTy =
5990  smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
5991  }
5992  }
5993 
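  // A cast that remains scalar after vectorization is emitted once per lane,
  // hence N is VF below; a widened cast is emitted once, hence N is 1.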
5994  unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
5995  return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
5996  }
5997  case Instruction::Call: {
5998  bool NeedToScalarize;
5999  CallInst *CI = cast<CallInst>(I);
6000  unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
6001  if (getVectorIntrinsicIDForCall(CI, TLI))
6002  return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
6003  return CallCost;
6004  }
6005  default:
6006  // The cost of executing VF copies of the scalar instruction. This opcode
6007  // is unknown. Assume that it is the same as 'mul'.
6008  return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6009  getScalarizationOverhead(I, VF, TTI);
6010  } // end of switch.
6011 }
6012 
6013 char LoopVectorize::ID = 0;
6014 
6015 static const char lv_name[] = "Loop Vectorization";
6016 
6017 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6030 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6031 
6032 namespace llvm {
6033 
6034 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6035  bool VectorizeOnlyWhenForced) {
6036  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6037 }
6038 
6039 } // end namespace llvm
6040 
6041 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6042  // Check if the pointer operand of a load or store instruction is
6043  // consecutive.
6044  if (auto *Ptr = getLoadStorePointerOperand(Inst))
6045  return Legal->isConsecutivePtr(Ptr);
6046  return false;
6047 }
6048 
6049 void LoopVectorizationCostModel::collectValuesToIgnore() {
6050  // Ignore ephemeral values.
6051  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6052 
6053  // Ignore type-promoting instructions we identified during reduction
6054  // detection.
6055  for (auto &Reduction : *Legal->getReductionVars()) {
6056  RecurrenceDescriptor &RedDes = Reduction.second;
6057  SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6058  VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6059  }
6060  // Ignore type-casting instructions we identified during induction
6061  // detection.
6062  for (auto &Induction : *Legal->getInductionVars()) {
6063  InductionDescriptor &IndDes = Induction.second;
6064  const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6065  VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6066  }
6067 }
6068 
6069 VectorizationFactor
6070 LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
6071  unsigned UserVF) {
6072  // Width 1 means no vectorization, cost 0 means uncomputed cost.
6073  const VectorizationFactor NoVectorization = {1U, 0U};
6074 
6075  // Outer loop handling: outer loops may require CFG and instruction level
6076  // transformations before even evaluating whether vectorization is profitable.
6077  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6078  // the vectorization pipeline.
6079  if (!OrigLoop->empty()) {
6080  // TODO: If UserVF is not provided, we set UserVF to 4 for stress testing.
6081  // This won't be necessary when UserVF is not required in the VPlan-native
6082  // path.
6083  if (VPlanBuildStressTest && !UserVF)
6084  UserVF = 4;
6085 
6086  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6087  assert(UserVF && "Expected UserVF for outer loop vectorization.");
6088  assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6089  LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6090  buildVPlans(UserVF, UserVF);
6091 
6092  // For VPlan build stress testing, we bail out after VPlan construction.
6093  if (VPlanBuildStressTest)
6094  return NoVectorization;
6095 
6096  return {UserVF, 0};
6097  }
6098 
6099  LLVM_DEBUG(
6100  dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6101  "VPlan-native path.\n");
6102  return NoVectorization;
6103 }
6104 
6106 LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
6107  assert(OrigLoop->empty() && "Inner loop expected.");
6108  // Width 1 means no vectorization, cost 0 means uncomputed cost.
6109  const VectorizationFactor NoVectorization = {1U, 0U};
6110  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
6111  if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
6112  return NoVectorization;
6113 
6114  // Invalidate interleave groups if all blocks of loop will be predicated.
6115  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6116  !useMaskedInterleavedAccesses(*TTI)) {
6117  LLVM_DEBUG(
6118  dbgs()
6119  << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6120  "which requires masked-interleaved support.\n");
6121  CM.InterleaveInfo.reset();
6122  }
6123 
6124  if (UserVF) {
6125  LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6126  assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6127  // Collect the instructions (and their associated costs) that will be more
6128  // profitable to scalarize.
6129  CM.selectUserVectorizationFactor(UserVF);
6130  buildVPlansWithVPRecipes(UserVF, UserVF);
6131  LLVM_DEBUG(printPlans(dbgs()));
6132  return {UserVF, 0};
6133  }
6134 
6135  unsigned MaxVF = MaybeMaxVF.getValue();
6136  assert(MaxVF != 0 && "MaxVF is zero.");
6137 
6138  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6139  // Collect Uniform and Scalar instructions after vectorization with VF.
6140  CM.collectUniformsAndScalars(VF);
6141 
6142  // Collect the instructions (and their associated costs) that will be more
6143  // profitable to scalarize.
6144  if (VF > 1)
6145  CM.collectInstsToScalarize(VF);
6146  }
6147 
6148  buildVPlansWithVPRecipes(1, MaxVF);
6149  LLVM_DEBUG(printPlans(dbgs()));
6150  if (MaxVF == 1)
6151  return NoVectorization;
6152 
6153  // Select the optimal vectorization factor.
6154  return CM.selectVectorizationFactor(MaxVF);
6155 }
6156 
6157 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6158  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6159  << '\n');
6160  BestVF = VF;
6161  BestUF = UF;
6162 
6163  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6164  return !Plan->hasVF(VF);
6165  });
6166  assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6167 }
6168 
6169 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6170  DominatorTree *DT) {
6171  // Perform the actual loop transformation.
6172 
6173  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6174  VPCallbackILV CallbackILV(ILV);
6175 
6176  VPTransformState State{BestVF, BestUF, LI,
6177  DT, ILV.Builder, ILV.VectorLoopValueMap,
6178  &ILV, CallbackILV};
6179  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6180  State.TripCount = ILV.getOrCreateTripCount(nullptr);
6181 
6182  //===------------------------------------------------===//
6183  //
6184  // Notice: any optimization or new instruction that goes
6185  // into the code below should also be implemented in
6186  // the cost-model.
6187  //
6188  //===------------------------------------------------===//
6189 
6190  // 2. Copy and widen instructions from the old loop into the new loop.
6191  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6192  VPlans.front()->execute(&State);
6193 
6194  // 3. Fix the vectorized code: take care of header phi's, live-outs,
6195  // predication, updating analyses.
6196  ILV.fixVectorizedLoop();
6197 }
6198 
6199 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6200  SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6201  BasicBlock *Latch = OrigLoop->getLoopLatch();
6202 
6203  // We create new control-flow for the vectorized loop, so the original
6204  // condition will be dead after vectorization if it's only used by the
6205  // branch.
6206  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6207  if (Cmp && Cmp->hasOneUse())
6208  DeadInstructions.insert(Cmp);
6209 
6210  // We create new "steps" for induction variable updates to which the original
6211  // induction variables map. An original update instruction will be dead if
6212  // all its users except the induction variable are dead.
6213  for (auto &Induction : *Legal->getInductionVars()) {
6214  PHINode *Ind = Induction.first;
6215  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6216  if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6217  return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6218  DeadInstructions.end();
6219  }))
6220  DeadInstructions.insert(IndUpdate);
6221 
6222  // We also record as "Dead" the type-casting instructions we had identified
6223  // during induction analysis. We don't need any handling for them in the
6224  // vectorized loop because we have proven that, under a proper runtime
6225  // test guarding the vectorized loop, the value of the phi, and the casted
6226  // value of the phi, are the same. The last instruction in this casting chain
6227  // will get its scalar/vector/widened def from the scalar/vector/widened def
6228  // of the respective phi node. Any other casts in the induction def-use chain
6229  // have no other uses outside the phi update chain, and will be ignored.
6230  InductionDescriptor &IndDes = Induction.second;
6231  const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6232  DeadInstructions.insert(Casts.begin(), Casts.end());
6233  }
6234 }
6235 
6236 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6237 
6238 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6239 
6240 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6241  Instruction::BinaryOps BinOp) {
6242  // When unrolling and the VF is 1, we only need to add a simple scalar.
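  // That is, produce Val + StartIdx * Step, using BinOp for the
  // floating-point case.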
6243  Type *Ty = Val->getType();
6244  assert(!Ty->isVectorTy() && "Val must be a scalar");
6245 
6246  if (Ty->isFloatingPointTy()) {
6247  Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6248 
6249  // Floating point operations had to be 'fast' to enable the unrolling.
6250  Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6251  return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6252  }
6253  Constant *C = ConstantInt::get(Ty, StartIdx);
6254  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6255 }
6256 
6257 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6258  SmallVector<Metadata *, 4> MDs;
6259  // Reserve first location for self reference to the LoopID metadata node.
6260  MDs.push_back(nullptr);
6261  bool IsUnrollMetadata = false;
6262  MDNode *LoopID = L->getLoopID();
6263  if (LoopID) {
6264  // First find existing loop unrolling disable metadata.
6265  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6266  auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6267  if (MD) {
6268  const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6269  IsUnrollMetadata =
6270  S && S->getString().startswith("llvm.loop.unroll.disable");
6271  }
6272  MDs.push_back(LoopID->getOperand(i));
6273  }
6274  }
6275 
6276  if (!IsUnrollMetadata) {
6277  // Add runtime unroll disable metadata.
6278  LLVMContext &Context = L->getHeader()->getContext();
6279  SmallVector<Metadata *, 1> DisableOperands;
6280  DisableOperands.push_back(
6281  MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6282  MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6283  MDs.push_back(DisableNode);
6284  MDNode *NewLoopID = MDNode::get(Context, MDs);
6285  // Set operand 0 to refer to the loop id itself.
6286  NewLoopID->replaceOperandWith(0, NewLoopID);
6287  L->setLoopID(NewLoopID);
6288  }
6289 }
6290 
6291 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6292  const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6293  assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6294  bool PredicateAtRangeStart = Predicate(Range.Start);
6295 
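  // Clamp Range.End down to the first VF whose decision differs from the one
  // at Range.Start, so that all VFs remaining in the range share the same
  // decision, which is the value returned.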
6296  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6297  if (Predicate(TmpVF) != PredicateAtRangeStart) {
6298  Range.End = TmpVF;
6299  break;
6300  }
6301 
6302  return PredicateAtRangeStart;
6303 }
6304 
6305 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6306 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6307 /// of VF's starting at a given VF and extending it as much as possible. Each
6308 /// vectorization decision can potentially shorten this sub-range during
6309 /// buildVPlan().
6310 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6311  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6312  VFRange SubRange = {VF, MaxVF + 1};
6313  VPlans.push_back(buildVPlan(SubRange));
6314  VF = SubRange.End;
6315  }
6316 }
6317 
6318 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6319  VPlanPtr &Plan) {
6320  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6321 
6322  // Look for cached value.
6323  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6324  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6325  if (ECEntryIt != EdgeMaskCache.end())
6326  return ECEntryIt->second;
6327 
6328  VPValue *SrcMask = createBlockInMask(Src, Plan);
6329 
6330  // The terminator has to be a branch inst!
6331  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6332  assert(BI && "Unexpected terminator found")
6333 
6334  if (!BI->isConditional())
6335  return EdgeMaskCache[Edge] = SrcMask;
6336 
6337  VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6338  assert(EdgeMask && "No Edge Mask found for condition");
6339 
6340  if (BI->getSuccessor(0) != Dst)
6341  EdgeMask = Builder.createNot(EdgeMask);
6342 
6343  if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6344  EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6345 
6346  return EdgeMaskCache[Edge] = EdgeMask;
6347 }
6348 
6349 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6350  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6351 
6352  // Look for cached value.
6353  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6354  if (BCEntryIt != BlockMaskCache.end())
6355  return BCEntryIt->second;
6356 
6357  // All-one mask is modelled as no-mask following the convention for masked
6358  // load/store/gather/scatter. Initialize BlockMask to no-mask.
6359  VPValue *BlockMask = nullptr;
6360 
6361  if (OrigLoop->getHeader() == BB) {
6362  if (!CM.blockNeedsPredication(BB))
6363  return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6364 
6365  // Introduce the early-exit compare IV <= BTC to form header block mask.
6366  // This is used instead of IV < TC because TC may wrap, unlike BTC.
6367  VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6368  VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6369  BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6370  return BlockMaskCache[BB] = BlockMask;
6371  }
6372 
6373  // This is the block mask. We OR all incoming edges.
6374  for (auto *Predecessor : predecessors(BB)) {
6375  VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6376  if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6377  return BlockMaskCache[BB] = EdgeMask;
6378 
6379  if (!BlockMask) { // BlockMask has its initialized nullptr value.
6380  BlockMask = EdgeMask;
6381  continue;
6382  }
6383 
6384  BlockMask = Builder.createOr(BlockMask, EdgeMask);
6385  }
6386 
6387  return BlockMaskCache[BB] = BlockMask;
6388 }
6389 
6390 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6391  VFRange &Range,
6392  VPlanPtr &Plan) {
6393  const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6394  if (!IG)
6395  return nullptr;
6396 
6397  // Now check if IG is relevant for VF's in the given range.
6398  auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6399  return [=](unsigned VF) -> bool {
6400  return (VF >= 2 && // Query is illegal for VF == 1
6401  CM.getWideningDecision(I, VF) ==
6402  LoopVectorizationCostModel::CM_Interleave);
6403  };
6404  };
6405  if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6406  return nullptr;
6407 
6408  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
6409  // range. If it's the primary member of the IG, construct a VPInterleaveRecipe.
6410  // Otherwise, it's an adjunct member of the IG; do not construct any Recipe.
6411  assert(I == IG->getInsertPos() &&
6412  "Generating a recipe for an adjunct member of an interleave group");
6413 
6414  VPValue *Mask = nullptr;
6415  if (Legal->isMaskRequired(I))
6416  Mask = createBlockInMask(I->getParent(), Plan);
6417 
6418  return new VPInterleaveRecipe(IG, Mask);
6419 }
6420 
6421 VPWidenMemoryInstructionRecipe *
6422 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6423  VPlanPtr &Plan) {
6424  if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6425  return nullptr;
6426 
6427  auto willWiden = [&](unsigned VF) -> bool {
6428  if (VF == 1)
6429  return false;
6430  if (CM.isScalarAfterVectorization(I, VF) ||
6431  CM.isProfitableToScalarize(I, VF))
6432  return false;
6433  LoopVectorizationCostModel::InstWidening Decision =
6434  CM.getWideningDecision(I, VF);
6435  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6436  "CM decision should be taken at this point.");
6437  assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
6438  "Interleave memory opportunity should be caught earlier.");
6439  return Decision != LoopVectorizationCostModel::CM_Scalarize;
6440  };
6441 
6442  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6443  return nullptr;
6444 
6445  VPValue *Mask = nullptr;
6446  if (Legal->isMaskRequired(I))
6447  Mask = createBlockInMask(I->getParent(), Plan);
6448 
6449  return new VPWidenMemoryInstructionRecipe(*I, Mask);
6450 }
6451 
6452 VPWidenIntOrFpInductionRecipe *
6453 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6454  if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6455  // Check if this is an integer or fp induction. If so, build the recipe that
6456  // produces its scalar and vector values.
6457  InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6458  if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6459  II.getKind() == InductionDescriptor::IK_FpInduction)
6460  return new VPWidenIntOrFpInductionRecipe(Phi);
6461 
6462  return nullptr;
6463  }
6464 
6465  // Optimize the special case where the source is a constant integer
6466  // induction variable. Notice that we can only optimize the 'trunc' case
6467  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6468  // (c) other casts depend on pointer size.
6469 
6470  // Determine whether \p K is a truncation based on an induction variable that
6471  // can be optimized.
6472  auto isOptimizableIVTruncate =
6473  [&](Instruction *K) -> std::function<bool(unsigned)> {
6474  return
6475  [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6476  };
6477 
6478  if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6479  isOptimizableIVTruncate(I), Range))
6480  return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6481  cast<TruncInst>(I));
6482  return nullptr;
6483 }
6484 
6485 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6486  PHINode *Phi = dyn_cast<PHINode>(I);
6487  if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6488  return nullptr;
6489 
6490  // We know that all PHIs in non-header blocks are converted into selects, so
6491  // we don't have to worry about the insertion order and we can just use the
6492  // builder. At this point we generate the predication tree. There may be
6493  // duplications since this is a simple recursive scan, but future
6494  // optimizations will clean it up.
6495 
6496  SmallVector<VPValue *, 2> Masks;
6497  unsigned NumIncoming = Phi->getNumIncomingValues();
6498  for (unsigned In = 0; In < NumIncoming; In++) {
6499  VPValue *EdgeMask =
6500  createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6501  assert((EdgeMask || NumIncoming == 1) &&
6502  "Multiple predecessors with one having a full mask");
6503  if (EdgeMask)
6504  Masks.push_back(EdgeMask);
6505  }
6506  return new VPBlendRecipe(Phi, Masks);
6507 }
6508 
6510  VFRange &Range) {
6511 
6513  [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6514 
6515  if (IsPredicated)
6516  return false;
6517 
6518  auto IsVectorizableOpcode = [](unsigned Opcode) {
6519  switch (Opcode) {
6520  case Instruction::Add:
6521  case Instruction::And:
6522  case Instruction::AShr:
6523  case Instruction::BitCast:
6524  case Instruction::Br:
6525  case Instruction::Call:
6526  case Instruction::FAdd:
6527  case Instruction::FCmp:
6528  case Instruction::FDiv:
6529  case Instruction::FMul:
6530  case Instruction::FPExt:
6531  case Instruction::FPToSI:
6532  case Instruction::FPToUI:
6533  case Instruction::FPTrunc:
6534  case Instruction::FRem:
6535  case Instruction::FSub:
6536  case Instruction::GetElementPtr:
6537  case Instruction::ICmp:
6538  case Instruction::IntToPtr:
6539  case Instruction::Load:
6540  case Instruction::LShr:
6541  case Instruction::Mul:
6542  case Instruction::Or:
6543  case Instruction::PHI:
6544  case Instruction::PtrToInt:
6545  case Instruction::SDiv:
6546  case Instruction::Select:
6547  case Instruction::SExt:
6548  case Instruction::Shl:
6549  case Instruction::SIToFP:
6550  case Instruction::SRem:
6551  case Instruction::Store:
6552  case Instruction::Sub:
6553  case Instruction::Trunc:
6554  case Instruction::UDiv:
6555  case Instruction::UIToFP:
6556  case Instruction::URem:
6557  case Instruction::Xor:
6558  case Instruction::ZExt:
6559  return true;
6560  }
6561  return false;
6562  };
6563 
6564  if (!IsVectorizableOpcode(I->getOpcode()))
6565  return false;
6566 
6567  if (CallInst *CI = dyn_cast<CallInst>(I)) {
6568  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6569  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6570  ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6571  return false;
6572  }
6573 
6574  auto willWiden = [&](unsigned VF) -> bool {
6575  if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6576  CM.isProfitableToScalarize(I, VF)))
6577  return false;
6578  if (CallInst *CI = dyn_cast<CallInst>(I)) {
6579  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6580  // The following case may be scalarized depending on the VF.
6581  // The flag shows whether we use an Intrinsic or a usual Call for the
6582  // vectorized version of the instruction.
6583  // Is it beneficial to perform an intrinsic call compared to a lib call?
6584  bool NeedToScalarize;
6585  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
6586  bool UseVectorIntrinsic =
6587  ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
6588  return UseVectorIntrinsic || !NeedToScalarize;
6589  }
6590  if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6591  assert(CM.getWideningDecision(I, VF) ==
6592  LoopVectorizationCostModel::CM_Scalarize &&
6593  "Memory widening decisions should have been taken care by now");
6594  return false;
6595  }
6596  return true;
6597  };
6598 
6599  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6600  return false;
6601 
6602  // Success: widen this instruction. We optimize the common case where
6603  // consecutive instructions can be represented by a single recipe.
6604  if (!VPBB->empty()) {
6605  VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
6606  if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
6607  return true;
6608  }
6609 
6610  VPBB->appendRecipe(new VPWidenRecipe(I));
6611  return true;
6612 }
6613 
6614 VPBasicBlock *VPRecipeBuilder::handleReplication(
6615  Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6616  DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6617  VPlanPtr &Plan) {
6618  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6619  [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6620  Range);
6621 
6622  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6623  [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6624 
6625  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6626 
6627  // Find if I uses a predicated instruction. If so, it will use its scalar
6628  // value. Avoid hoisting the insert-element which packs the scalar value into
6629  // a vector value, as that happens iff all users use the vector value.
6630  for (auto &Op : I->operands())
6631  if (auto *PredInst = dyn_cast<Instruction>(Op))
6632  if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6633  PredInst2Recipe[PredInst]->setAlsoPack(false);
6634 
6635  // Finalize the recipe for Instr, first if it is not predicated.
6636  if (!IsPredicated) {
6637  LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6638  VPBB->appendRecipe(Recipe);
6639  return VPBB;
6640  }
6641  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6642  assert(VPBB->getSuccessors().empty() &&
6643  "VPBB has successors when handling predicated replication.");
6644  // Record predicated instructions for above packing optimizations.
6645  PredInst2Recipe[I] = Recipe;
6646  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6647  VPBlockUtils::insertBlockAfter(Region, VPBB);
6648  auto *RegSucc = new VPBasicBlock();
6649  VPBlockUtils::insertBlockAfter(RegSucc, Region);
6650  return RegSucc;
6651 }
6652 
6653 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6654  VPRecipeBase *PredRecipe,
6655  VPlanPtr &Plan) {
6656  // Instructions marked for predication are replicated and placed under an
6657  // if-then construct to prevent side-effects.
6658 
6659  // Generate recipes to compute the block mask for this region.
6660  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6661 
6662  // Build the triangular if-then region.
6663  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6664  assert(Instr->getParent() && "Predicated instruction not in any basic block");
6665  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6666  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6667  auto *PHIRecipe =
6668  Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6669  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6670  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6671  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6672 
6673  // Note: first set Entry as region entry and then connect successors starting
6674  // from it in order, to propagate the "parent" of each VPBasicBlock.
6675  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6676  VPBlockUtils::connectBlocks(Pred, Exit);
6677 
6678  return Region;
6679 }
6680 
6681 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6682  VPlanPtr &Plan, VPBasicBlock *VPBB) {
6683  VPRecipeBase *Recipe = nullptr;
6684  // Check if Instr should belong to an interleave memory recipe, or already
6685  // does. In the latter case Instr is irrelevant.
6686  if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6687  VPBB->appendRecipe(Recipe);
6688  return true;
6689  }
6690 
6691  // Check if Instr is a memory operation that should be widened.
6692  if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6693  VPBB->appendRecipe(Recipe);
6694  return true;
6695  }
6696 
6697  // Check if Instr should form some PHI recipe.
6698  if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6699  VPBB->appendRecipe(Recipe);
6700  return true;
6701  }
6702  if ((Recipe = tryToBlend(Instr, Plan))) {
6703  VPBB->appendRecipe(Recipe);
6704  return true;
6705  }
6706  if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6707  VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6708  return true;
6709  }
6710 
6711  // Check if Instr is to be widened by a general VPWidenRecipe, after
6712  // having first checked for specific widening recipes that deal with
6713  // Interleave Groups, Inductions and Phi nodes.
6714  if (tryToWiden(Instr, VPBB, Range))
6715  return true;
6716 
6717  return false;
6718 }
6719 
6720 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6721  unsigned MaxVF) {
6722  assert(OrigLoop->empty() && "Inner loop expected.");
6723 
6724  // Collect conditions feeding internal conditional branches; they need to be
6725  // represented in VPlan for it to model masking.
6726  SmallPtrSet<Value *, 1> NeedDef;
6727 
6728  auto *Latch = OrigLoop->getLoopLatch();
6729  for (BasicBlock *BB : OrigLoop->blocks()) {
6730  if (BB == Latch)
6731  continue;
6732  BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6733  if (Branch && Branch->isConditional())
6734  NeedDef.insert(Branch->getCondition());
6735  }
6736 
6737  // If the tail is to be folded by masking, the primary induction variable
6738  // needs to be represented in VPlan for it to model early-exit masking.
6739  if (CM.foldTailByMasking())
6740  NeedDef.insert(Legal->getPrimaryInduction());
6741 
6742  // Collect instructions from the original loop that will become trivially dead
6743  // in the vectorized loop. We don't need to vectorize these instructions. For
6744  // example, original induction update instructions can become dead because we
6745  // separately emit induction "steps" when generating code for the new loop.
6746  // Similarly, we create a new latch condition when setting up the structure
6747  // of the new loop, so the old one can become dead.
6748  SmallPtrSet<Instruction *, 4> DeadInstructions;
6749  collectTriviallyDeadInstructions(DeadInstructions);
6750 
6751  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6752  VFRange SubRange = {VF, MaxVF + 1};
6753  VPlans.push_back(
6754  buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
6755  VF = SubRange.End;
6756  }
6757 }
6758 
6759 LoopVectorizationPlanner::VPlanPtr
6760 LoopVectorizationPlanner::buildVPlanWithVPRecipes(
6761  VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
6762  SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6763  // Hold a mapping from predicated instructions to their recipes, in order to
6764  // fix their AlsoPack behavior if a user is determined to replicate and use a
6765  // scalar instead of a vector value.
6766  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
6767 
6768  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
6769  DenseMap<Instruction *, Instruction *> SinkAfterInverse;
6770 
6771  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
6772  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
6773  auto Plan = llvm::make_unique<VPlan>(VPBB);
6774 
6775  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder);
6776  // Represent values that will have defs inside VPlan.
6777  for (Value *V : NeedDef)
6778  Plan->addVPValue(V);
6779 
6780  // Scan the body of the loop in a topological order to visit each basic block
6781  // after having visited its predecessor basic blocks.
6782  LoopBlocksDFS DFS(OrigLoop);
6783  DFS.perform(LI);
6784 
6785  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6786  // Relevant instructions from basic block BB will be grouped into VPRecipe
6787  // ingredients and fill a new VPBasicBlock.
6788  unsigned VPBBsForBB = 0;
6789  auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
6790  VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
6791  VPBB = FirstVPBBForBB;
6792  Builder.setInsertPoint(VPBB);
6793 
6794  std::vector<Instruction *> Ingredients;
6795 
6796  // Organize the ingredients to vectorize from current basic block in the
6797  // right order.
6798  for (Instruction &I : BB->instructionsWithoutDebug()) {
6799  Instruction *Instr = &I;
6800 
6801  // First filter out irrelevant instructions, to ensure no recipes are
6802  // built for them.
6803  if (isa<BranchInst>(Instr) ||
6804  DeadInstructions.find(Instr) != DeadInstructions.end())
6805  continue;
6806 
6807  // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
6808  // member of the IG, do not construct any Recipe for it.
6809  const InterleaveGroup<Instruction> *IG =
6810  CM.getInterleavedAccessGroup(Instr);
6811  if (IG && Instr != IG->getInsertPos() &&
6812  Range.Start >= 2 && // Query is illegal for VF == 1
6813  CM.getWideningDecision(Instr, Range.Start) ==
6814  LoopVectorizationCostModel::CM_Interleave) {
6815  auto SinkCandidate = SinkAfterInverse.find(Instr);
6816  if (SinkCandidate != SinkAfterInverse.end())
6817  Ingredients.push_back(SinkCandidate->second);
6818  continue;
6819  }
6820 
6821  // Move instructions to handle first-order recurrences, step 1: avoid
6822  // handling this instruction until after we've handled the instruction it
6823  // should follow.
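  // The skipped instruction is added back to Ingredients via SinkAfterInverse
  // (step 2 below) once the instruction it must follow has been reached.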
6824  auto SAIt = SinkAfter.find(Instr);
6825  if (SAIt != SinkAfter.end()) {
6826  LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
6827  << *SAIt->second
6828  << " to vectorize a 1st order recurrence.\n");
6829  SinkAfterInverse[SAIt->second] = Instr;
6830  continue;
6831  }
6832 
6833  Ingredients.push_back(Instr);
6834 
6835  // Move instructions to handle first-order recurrences, step 2: push the
6836  // instruction to be sunk at its insertion point.
6837  auto SAInvIt = SinkAfterInverse.find(Instr);
6838  if (SAInvIt != SinkAfterInverse.end())
6839  Ingredients.push_back(SAInvIt->second);
6840  }
6841 
6842  // Introduce each ingredient into VPlan.
6843  for (Instruction *Instr : Ingredients) {
6844  if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
6845  continue;
6846 
6847  // Otherwise, if all widening options failed, Instruction is to be
6848  // replicated. This may create a successor for VPBB.
6849  VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
6850  Instr, Range, VPBB, PredInst2Recipe, Plan);
6851  if (NextVPBB != VPBB) {
6852  VPBB = NextVPBB;
6853  VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
6854  : "");
6855  }
6856  }
6857  }
6858 
6859  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
6860  // may also be empty, such as the last one (VPBB), reflecting original
6861  // basic blocks with no recipes.
6862  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
6863  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
6864  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
6865  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
6866  delete PreEntry;
6867 
6868  std::string PlanName;
6869  raw_string_ostream RSO(PlanName);
6870  unsigned VF = Range.Start;
6871  Plan->addVF(VF);
6872  RSO << "Initial VPlan for VF={" << VF;
6873  for (VF *= 2; VF < Range.End; VF *= 2) {
6874  Plan->addVF(VF);
6875  RSO << "," << VF;
6876  }
6877  RSO << "},UF>=1";
6878  RSO.flush();
6879  Plan->setName(PlanName);
6880 
6881  return Plan;
6882 }
6883 
6884 LoopVectorizationPlanner::VPlanPtr
6885 LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
6886  // Outer loop handling: outer loops may require CFG and instruction level
6887  // transformations before even evaluating whether vectorization is profitable.
6888  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6889  // the vectorization pipeline.
6890  assert(!OrigLoop->empty());
6891  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6892 
6893  // Create new empty VPlan
6894  auto Plan = llvm::make_unique<VPlan>();
6895 
6896  // Build hierarchical CFG
6897  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
6898  HCFGBuilder.buildHierarchicalCFG();
6899 
6900  SmallPtrSet<Instruction *, 1> DeadInstructions;
6901  VPlanHCFGTransforms::VPInstructionsToVPRecipes(
6902  Plan, Legal->getInductionVars(), DeadInstructions);
6903 
6904  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
6905  Plan->addVF(VF);
6906 
6907  return Plan;
6908 }
6909 
6910 Value* LoopVectorizationPlanner::VPCallbackILV::
6911 getOrCreateVectorValues(Value *V, unsigned Part) {
6912  return ILV.getOrCreateVectorValue(V, Part);
6913 }
6914 
6915 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
6916  O << " +\n"
6917  << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
6918  IG->getInsertPos()->printAsOperand(O, false);
6919  if (User) {
6920  O << ", ";
6921  User->getOperand(0)->printAsOperand(O);
6922  }
6923  O << "\\l\"";
6924  for (unsigned i = 0; i < IG->getFactor(); ++i)
6925  if (Instruction *I = IG->getMember(i))
6926  O << " +\n"
6927  << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
6928 }
6929 
6930 void VPWidenRecipe::execute(VPTransformState &State) {
6931  for (auto &Instr : make_range(Begin, End))
6932  State.ILV->widenInstruction(Instr);
6933 }
6934 
6935 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
6936  assert(!State.Instance && "Int or FP induction being replicated.");
6937  State.ILV->widenIntOrFpInduction(IV, Trunc);
6938 }
6939 
6940 void VPWidenPHIRecipe::execute(VPTransformState &State) {
6941  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
6942 }
6943 
6944 void VPBlendRecipe::execute(VPTransformState &State) {
6945  State.ILV->setDebugLocFromInst(State.Builder, Phi);
6946  // We know that all PHIs in non-header blocks are converted into
6947  // selects, so we don't have to worry about the insertion order and we
6948  // can just use the builder.
6949  // At this point we generate the predication tree. There may be
6950  // duplications since this is a simple recursive scan, but future
6951  // optimizations will clean it up.
6952 
6953  unsigned NumIncoming = Phi->getNumIncomingValues();
6954 
6955  assert((User || NumIncoming == 1) &&
6956  "Multiple predecessors with predecessors having a full mask");
6957  // Generate a sequence of selects of the form:
6958  // SELECT(Mask3, In3,
6959  // SELECT(Mask2, In2,
6960  // ( ...)))
6961  InnerLoopVectorizer::VectorParts Entry(State.UF);
6962  for (unsigned In = 0; In < NumIncoming; ++In) {
6963  for (unsigned Part = 0; Part < State.UF; ++Part) {
6964  // We might have single edge PHIs (blocks) - use an identity
6965  // 'select' for the first PHI operand.
6966  Value *In0 =
6967  State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
6968  if (In == 0)
6969  Entry[Part] = In0; // Initialize with the first incoming value.
6970  else {
6971  // Select between the current value and the previous incoming edge
6972  // based on the incoming mask.
6973  Value *Cond = State.get(User->getOperand(In), Part);
6974  Entry[Part] =
6975  State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
6976  }
6977  }
6978  }
6979  for (unsigned Part = 0; Part < State.UF; ++Part)
6980  State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
6981 }
6982 
6983 void VPInterleaveRecipe::execute(VPTransformState &State) {
6984  assert(!State.Instance && "Interleave group being replicated.");
6985  if (!User)
6986  return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
6987 
6988  // Last (and currently only) operand is a mask.
6989  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
6990  VPValue *Mask = User->getOperand(0);
6991  for (unsigned Part = 0; Part < State.UF; ++Part)
6992  MaskValues[Part] = State.get(Mask, Part);
6993  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
6994 }
6995 
6996 void VPReplicateRecipe::execute(VPTransformState &State) {
6997  if (State.Instance) { // Generate a single instance.
6998  State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
6999  // Insert scalar instance packing it into a vector.
7000  if (AlsoPack && State.VF > 1) {
7001  // If we're constructing lane 0, initialize to start from undef.
7002  if (State.Instance->Lane == 0) {
7003  Value *Undef =
7004  UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7005  State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7006  }
7007  State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7008  }
7009  return;
7010  }
7011 
7012  // Generate scalar instances for all VF lanes of all UF parts, unless the
7013  // instruction is uniform, in which case generate only the first lane for each
7014  // of the UF parts.
7015  unsigned EndLane = IsUniform ? 1 : State.VF;
7016  for (unsigned Part = 0; Part < State.UF; ++Part)
7017  for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7018  State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7019 }
7020 
7021 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7022  assert(State.Instance && "Branch on Mask works only on single instance.");
7023 
7024  unsigned Part = State.Instance->Part;
7025  unsigned Lane = State.Instance->Lane;
7026 
7027  Value *ConditionBit = nullptr;
7028  if (!User) // Block in mask is all-one.
7029  ConditionBit = State.Builder.getTrue();
7030  else {
7031  VPValue *BlockInMask = User->getOperand(0);
7032  ConditionBit = State.get(BlockInMask, Part);
7033  if (ConditionBit->getType()->isVectorTy())
7034  ConditionBit = State.Builder.CreateExtractElement(
7035  ConditionBit, State.Builder.getInt32(Lane));
7036  }
7037 
7038  // Replace the temporary unreachable terminator with a new conditional branch,
7039  // whose two destinations will be set later when they are created.
7040  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7041  assert(isa<UnreachableInst>(CurrentTerminator) &&
7042  "Expected to replace unreachable terminator with conditional branch.");
7043  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7044  CondBr->setSuccessor(0, nullptr);
7045  ReplaceInstWithInst(CurrentTerminator, CondBr);
7046 }
7047 
7048 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7049  assert(State.Instance && "Predicated instruction PHI works per instance.");
7050  Instruction *ScalarPredInst = cast<Instruction>(
7051  State.ValueMap.getScalarValue(PredInst, *State.Instance));
7052  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7053  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7054  assert(PredicatingBB && "Predicated block has no single predecessor.");
7055 
7056  // By current pack/unpack logic we need to generate only a single phi node: if
7057  // a vector value for the predicated instruction exists at this point it means
7058  // the instruction has vector users only, and a phi for the vector value is
7059  // needed. In this case the recipe of the predicated instruction is marked to
7060  // also do that packing, thereby "hoisting" the insert-element sequence.
7061  // Otherwise, a phi node for the scalar value is needed.
7062  unsigned Part = State.Instance->Part;
7063  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7064  Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7065  InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7066  PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7067  VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7068  VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7069  State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7070  } else {
7071  Type *PredInstType = PredInst->getType();
7072  PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7073  Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7074  Phi->addIncoming(ScalarPredInst, PredicatedBB);
7075  State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7076  }
7077 }
7078 
7079 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7080  if (!User)
7081  return State.ILV->vectorizeMemoryInstruction(&Instr);
7082 
7083  // Last (and currently only) operand is a mask.
7084  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7085  VPValue *Mask = User->getOperand(0);
7086  for (unsigned Part = 0; Part < State.UF; ++Part)
7087  MaskValues[Part] = State.get(Mask, Part);
7088  State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7089 }
7090 
7091 // Process the loop in the VPlan-native vectorization path. This path builds
7092 // VPlan upfront in the vectorization pipeline, which allows applying
7093 // VPlan-to-VPlan transformations from the very beginning without modifying the
7094 // input LLVM IR.
7095 static bool processLoopInVPlanNativePath(
7096  Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7097  LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7098  TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7099  OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) {
7100 
7101  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7102  Function *F = L->getHeader()->getParent();
7103  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7104  LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7105  &Hints, IAI);
7106  // Use the planner for outer loop vectorization.
7107  // TODO: CM is not used at this point inside the planner. Turn CM into an
7108  // optional argument if we don't need it in the future.
7109  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
7110 
7111  // Get user vectorization factor.
7112  unsigned UserVF = Hints.getWidth();
7113 
7114  // Check the function attributes to find out if this function should be
7115  // optimized for size.
7116  bool OptForSize =
7117  Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
7118 
7119  // Plan how to best vectorize, return the best VF and its cost.
7120  VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
7121 
7122  // If we are stress testing VPlan builds, do not attempt to generate vector
7123  // code.
7124  if (VPlanBuildStressTest)
7125  return false;
7126 
7127  LVP.setBestPlan(VF.Width, 1);
7128 
7129  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, UserVF, 1, LVL,
7130  &CM);
7131  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7132  << L->getHeader()->getParent()->getName() << "\"\n");
7133  LVP.executePlan(LB, DT);
7134 
7135  // Mark the loop as already vectorized to avoid vectorizing again.
7136  Hints.setAlreadyVectorized();
7137 
7138  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7139  return true;
7140 }
7141 
7142 bool LoopVectorizePass::processLoop(Loop *L) {
7143  assert((EnableVPlanNativePath || L->empty()) &&
7144  "VPlan-native path is not enabled. Only process inner loops.");
7145 
7146 #ifndef NDEBUG
7147  const std::string DebugLocStr = getDebugLocString(L);
7148 #endif /* NDEBUG */
7149 
7150  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7151  << L->getHeader()->getParent()->getName() << "\" from "
7152  << DebugLocStr << "\n");
7153 
7154  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7155 
7156  LLVM_DEBUG(
7157  dbgs() << "LV: Loop hints:"
7158  << " force="
7159  << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7160  ? "disabled"
7161  : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7162  ? "enabled"
7163  : "?"))
7164  << " width=" << Hints.getWidth()
7165  << " unroll=" << Hints.getInterleave() << "\n");
7166 
7167  // Function containing loop
7168  Function *F = L->getHeader()->getParent();
7169 
7170  // Looking at the diagnostic output is the only way to determine if a loop
7171  // was vectorized (other than looking at the IR or machine code), so it
7172  // is important to generate an optimization remark for each loop. Most of
7173  // these messages are generated as OptimizationRemarkAnalysis. Remarks
7174  // generated as OptimizationRemark and OptimizationRemarkMissed are
7175  // less verbose, reporting vectorized loops and unvectorized loops that may
7176  // benefit from vectorization, respectively.
7177 
7178  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7179  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7180  return false;
7181  }
7182 
7183  PredicatedScalarEvolution PSE(*SE, *L);
7184 
7185  // Check if it is legal to vectorize the loop.
7186  LoopVectorizationRequirements Requirements(*ORE);
7187  LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
7188  &Requirements, &Hints, DB, AC);
7189  if (!LVL.canVectorize(EnableVPlanNativePath)) {
7190  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7191  Hints.emitRemarkWithHints();
7192  return false;
7193  }
7194 
7195  // Check the function attributes to find out if this function should be
7196  // optimized for size.
7197  bool OptForSize =
7198  Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
7199 
7200  // Entrance to the VPlan-native vectorization path. Outer loops are processed
7201  // here. They may require CFG and instruction level transformations before
7202  // even evaluating whether vectorization is profitable. Since we cannot modify
7203  // the incoming IR, we need to build VPlan upfront in the vectorization
7204  // pipeline.
7205  if (!L->empty())
7206  return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7207  ORE, Hints);
7208 
7209  assert(L->empty() && "Inner loop expected.");
7210  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7211  // count by optimizing for size, to minimize overheads.
7212  // Prefer constant trip counts over profile data, over upper bound estimate.
7213  unsigned ExpectedTC = 0;
7214  bool HasExpectedTC = false;
7215  if (const SCEVConstant *ConstExits =
7216  dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
7217  const APInt &ExitsCount = ConstExits->getAPInt();
7218  // We are interested in small values for ExpectedTC. Skip over those that
7219  // can't fit an unsigned.
7220  if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
7221  ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
7222  HasExpectedTC = true;
7223  }
7224  }
7225  // ExpectedTC may be large because it's bound by a variable. Check
7226  // profiling information to validate we should vectorize.
7227  if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
7228  auto EstimatedTC = getLoopEstimatedTripCount(L);
7229  if (EstimatedTC) {
7230  ExpectedTC = *EstimatedTC;
7231  HasExpectedTC = true;
7232  }
7233  }
7234  if (!HasExpectedTC) {
7235  ExpectedTC = SE->getSmallConstantMaxTripCount(L);
7236  HasExpectedTC = (ExpectedTC > 0);
7237  }
7238 
7239  if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
7240  LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7241  << "This loop is worth vectorizing only if no scalar "
7242  << "iteration overheads are incurred.");
7243  if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7244  LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7245  else {
7246  LLVM_DEBUG(dbgs() << "\n");
7247  // Loops with a very small trip count are considered for vectorization
7248  // under OptForSize, thereby making sure the cost of their loop body is
7249  // dominant, free of runtime guards and scalar iteration overheads.
7250  OptForSize = true;
7251  }
7252  }
7253 
7254  // Check the function attributes to see if implicit floats are allowed.
7255  // FIXME: This check doesn't seem possibly correct -- what if the loop is
7256  // an integer loop and the vector instructions selected are purely integer
7257  // vector instructions?
7258  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7259  LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
7260  " attribute is used.\n");
7261  ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
7262  "NoImplicitFloat", L)
7263  << "loop not vectorized due to NoImplicitFloat attribute");
7264  Hints.emitRemarkWithHints();
7265  return false;
7266  }
7267 
7268  // Check if the target supports potentially unsafe FP vectorization.
7269  // FIXME: Add a check for the type of safety issue (denormal, signaling)
7270  // for the target we're vectorizing for, to make sure none of the
7271  // additional fp-math flags can help.
7272  if (Hints.isPotentiallyUnsafe() &&
7273  TTI->isFPVectorizationPotentiallyUnsafe()) {
7274  LLVM_DEBUG(
7275  dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
7276  ORE->emit(
7277  createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
7278  << "loop not vectorized due to unsafe FP support.");
7279  Hints.emitRemarkWithHints();
7280  return false;
7281  }
7282 
7283  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7284  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7285 
7286  // If an override option has been passed in for interleaved accesses, use it.
7287  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7288  UseInterleaved = EnableInterleavedMemAccesses;
7289 
7290  // Analyze interleaved memory accesses.
7291  if (UseInterleaved) {
7292  IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7293  }
7294 
7295  // Use the cost model.
7296  LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
7297  &Hints, IAI);
7298  CM.collectValuesToIgnore();
7299 
7300  // Use the planner for vectorization.
7301  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
7302 
7303  // Get user vectorization factor.
7304  unsigned UserVF = Hints.getWidth();
7305 
7306  // Plan how to best vectorize, return the best VF and its cost.
7307  VectorizationFactor VF = LVP.plan(OptForSize, UserVF);
7308 
7309  // Select the interleave count.
7310  unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
7311 
7312  // Get user interleave count.
7313  unsigned UserIC = Hints.getInterleave();
7314 
7315  // Identify the diagnostic messages that should be produced.
7316  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7317  bool VectorizeLoop = true, InterleaveLoop = true;
7318  if (Requirements.doesNotMeet(F, L, Hints)) {
7319  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7320  "requirements.\n");
7321  Hints.emitRemarkWithHints();
7322  return false;
7323  }
7324 
7325  if (VF.Width == 1) {
7326  LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7327  VecDiagMsg = std::make_pair(
7328  "VectorizationNotBeneficial",
7329  "the cost-model indicates that vectorization is not beneficial");
7330  VectorizeLoop = false;
7331  }
7332 
7333  if (IC == 1 && UserIC <= 1) {
7334  // Tell the user interleaving is not beneficial.
7335  LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7336  IntDiagMsg = std::make_pair(
7337  "InterleavingNotBeneficial",
7338  "the cost-model indicates that interleaving is not beneficial");
7339  InterleaveLoop = false;
7340  if (UserIC == 1) {
7341  IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7342  IntDiagMsg.second +=
7343  " and is explicitly disabled or interleave count is set to 1";
7344  }
7345  } else if (IC > 1 && UserIC == 1) {
7346  // Tell the user interleaving is beneficial, but it is explicitly disabled.
7347  LLVM_DEBUG(
7348  dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7349  IntDiagMsg = std::make_pair(
7350  "InterleavingBeneficialButDisabled",
7351  "the cost-model indicates that interleaving is beneficial "
7352  "but is explicitly disabled or interleave count is set to 1");
7353  InterleaveLoop = false;
7354  }
7355 
7356  // Override IC if user provided an interleave count.
7357  IC = UserIC > 0 ? UserIC : IC;
7358 
7359  // Emit diagnostic messages, if any.
7360  const char *VAPassName = Hints.vectorizeAnalysisPassName();
7361  if (!VectorizeLoop && !InterleaveLoop) {
7362  // Do not vectorize or interleave the loop.
7363  ORE->emit([&]() {
7364  return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7365  L->getStartLoc(), L->getHeader())
7366  << VecDiagMsg.second;
7367  });
7368  ORE->emit([&]() {
7369  return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7370  L->getStartLoc(), L->getHeader())
7371  << IntDiagMsg.second;
7372  });
7373  return false;
7374  } else if (!VectorizeLoop && InterleaveLoop) {
7375  LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7376  ORE->emit([&]() {
7377  return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7378  L->getStartLoc(), L->getHeader())
7379  << VecDiagMsg.second;
7380  });
7381  } else if (VectorizeLoop && !InterleaveLoop) {
7382  LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7383  << ") in " << DebugLocStr << '\n');
7384  ORE->emit([&]() {
7385  return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7386  L->getStartLoc(), L->getHeader())
7387  << IntDiagMsg.second;
7388  });
7389  } else if (VectorizeLoop && InterleaveLoop) {
7390  LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7391  << ") in " << DebugLocStr << '\n');
7392  LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7393  }
7394 
7395  LVP.setBestPlan(VF.Width, IC);
7396 
7397  using namespace ore;
7398  bool DisableRuntimeUnroll = false;
7399  MDNode *OrigLoopID = L->getLoopID();
7400 
7401  if (!VectorizeLoop) {
7402  assert(IC > 1 && "interleave count should not be 1 or 0");
7403  // If we decided that it is not legal to vectorize the loop, then
7404  // interleave it.
7405  InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7406  &CM);
7407  LVP.executePlan(Unroller, DT);
7408 
7409  ORE->emit([&]() {
7410  return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7411  L->getHeader())
7412  << "interleaved loop (interleaved count: "
7413  << NV("InterleaveCount", IC) << ")";
7414  });
7415  } else {
7416  // If we decided that it is *legal* to vectorize the loop, then do it.
7417  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7418  &LVL, &CM);
7419  LVP.executePlan(LB, DT);
7420  ++LoopsVectorized;
7421 
7422  // Add metadata to disable runtime unrolling a scalar loop when there are
7423  // no runtime checks about strides and memory. A scalar loop that is
7424  // rarely used is not worth unrolling.
7425  if (!LB.areSafetyChecksAdded())
7426  DisableRuntimeUnroll = true;
7427 
7428  // Report the vectorization decision.
7429  ORE->emit([&]() {
7430  return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7431  L->getHeader())
7432  << "vectorized loop (vectorization width: "
7433  << NV("VectorizationFactor", VF.Width)
7434  << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7435  });
7436  }
7437 
7438  Optional<MDNode *> RemainderLoopID =
7439  makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7440  LLVMLoopVectorizeFollowupEpilogue});
7441  if (RemainderLoopID.hasValue()) {
7442  L->setLoopID(RemainderLoopID.getValue());
7443  } else {
7444  if (DisableRuntimeUnroll)
7445  AddRuntimeUnrollDisableMetaData(L);
7446 
7447  // Mark the loop as already vectorized to avoid vectorizing again.
7448  Hints.setAlreadyVectorized();
7449  }
7450 
7451  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7452  return true;
7453 }
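To connect the UserVF/UserIC handling above to user-visible controls: both values come from LoopVectorizeHints, which Clang populates from loop pragmas, and the remarks emitted through ORE can typically be surfaced with -Rpass=loop-vectorize (plus -Rpass-missed/-Rpass-analysis with the same filter). Below is a hedged example of hypothetical source (not from this file) for which processLoop would see UserVF = 8 and UserIC = 2, so the user interleave count overrides the cost model's choice via "IC = UserIC > 0 ? UserIC : IC":

// Hypothetical example loop; identifiers are illustrative only.
// vectorize_width(8)  -> Hints.getWidth() == 8     (UserVF)
// interleave_count(2) -> Hints.getInterleave() == 2 (UserIC)
void axpy(float *a, const float *b, const float *c, int n) {
#pragma clang loop vectorize_width(8) interleave_count(2)
  for (int i = 0; i < n; ++i)
    a[i] = b[i] + c[i];
}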
7454 
7455 bool LoopVectorizePass::runImpl(
7456  Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7457  DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7458  DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7459  std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7460  OptimizationRemarkEmitter &ORE_) {
7461  SE = &SE_;
7462  LI = &LI_;
7463  TTI = &TTI_;
7464  DT = &DT_;
7465  BFI = &BFI_;
7466  TLI = TLI_;
7467  AA = &AA_;
7468  AC = &AC_;
7469  GetLAA = &GetLAA_;
7470  DB = &DB_;
7471  ORE = &ORE_;
7472 
7473  // Don't attempt if
7474  // 1. the target claims to have no vector registers, and
7475  // 2. interleaving won't help ILP.
7476  //
7477  // The second condition is necessary because, even if the target has no
7478  // vector registers, loop vectorization may still enable scalar
7479  // interleaving.
7480  if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7481  return false;
7482 
7483  bool Changed = false;
7484 
7485  // The vectorizer requires loops to be in simplified form.
7486  // Since simplification may add new inner loops, it has to run before the
7487  // legality and profitability checks. This means running the loop vectorizer
7488  // will simplify all loops, regardless of whether anything ends up being
7489  // vectorized.
7490  for (auto &L : *LI)
7491  Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
7492 
7493  // Build up a worklist of inner-loops to vectorize. This is necessary as
7494  // the act of vectorizing or partially unrolling a loop creates new loops
7495  // and can invalidate iterators across the loops.
7496  SmallVector<Loop *, 8> Worklist;
7497 
7498  for (Loop *L : *LI)
7499  collectSupportedLoops(*L, LI, ORE, Worklist);
7500 
7501  LoopsAnalyzed += Worklist.size();
7502 
7503  // Now walk the identified inner loops.
7504  while (!Worklist.empty()) {
7505  Loop *L = Worklist.pop_back_val();
7506 
7507  // For the inner loops we actually process, form LCSSA to simplify the
7508  // transform.
7509  Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7510 
7511  Changed |= processLoop(L);
7512  }
7513 
7514  // Process each loop nest in the function.
7515  return Changed;
7516 }
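For callers on the legacy pass manager, the FunctionPass wrapper that eventually reaches runImpl is created with createLoopVectorizePass(), whose two defaulted booleans correspond to the InterleaveOnlyWhenForced and VectorizeOnlyWhenForced flags used above. A minimal, hedged driver sketch follows (assuming the usual legacy pass-manager headers; required analyses such as TargetTransformInfo are scheduled automatically by the pass manager):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Vectorize.h"

// Sketch only: run the loop vectorizer over every function in M.
void runLoopVectorize(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createLoopVectorizePass(/*InterleaveOnlyWhenForced=*/false,
                                       /*VectorizeOnlyWhenForced=*/false));
  PM.run(M);
}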
7517 
7518 PreservedAnalyses LoopVectorizePass::run(Function &F,
7519  FunctionAnalysisManager &AM) {
7520  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7521  auto &LI = AM.getResult<LoopAnalysis>(F);
7522  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7523  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7524  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7525  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7526  auto &AA = AM.getResult<AAManager>(F);
7527  auto &AC = AM.getResult<AssumptionAnalysis>(F);
7528  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7529  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7530 
7531  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7532  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7533  [&](Loop &L) -> const LoopAccessInfo & {
7534  LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr};
7535  return LAM.getResult<LoopAccessAnalysis>(L, AR);
7536  };
7537  bool Changed =
7538  runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE);
7539  if (!Changed)
7540  return PreservedAnalyses::all();
7541  PreservedAnalyses PA;
7542 
7543  // We currently do not preserve loopinfo/dominator analyses with outer loop
7544  // vectorization. Until this is addressed, mark these analyses as preserved
7545  // only for non-VPlan-native path.
7546  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7547  if (!EnableVPlanNativePath) {
7548  PA.preserve<LoopAnalysis>();
7549  PA.preserve<DominatorTreeAnalysis>();
7550  }
7551  PA.preserve<BasicAA>();
7552  PA.preserve<GlobalsAA>();
7553  return PA;
7554 }
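Under the new pass manager, LoopVectorizePass::run above expects the standard function analyses (ScalarEvolution, LoopInfo, BlockFrequencyInfo, DemandedBits, the AA manager, and so on) plus the loop-analysis proxy that serves LoopAccessAnalysis. A hedged sketch of wiring that up with PassBuilder follows; the setup details may need adjusting for a given LLVM build, and this is not code from this file:

#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

// Sketch only: register the default analyses and run loop-vectorize on M.
void vectorizeWithNewPM(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::LoopVectorizePass());
  llvm::ModulePassManager MPM;
  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);
}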
Legacy wrapper pass to provide the GlobalsAAResult object.
Pass interface - Implemented by all &#39;passes&#39;.
Definition: Pass.h:81
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value *> &EphValues)
Collect a loop&#39;s ephemeral values (those used only by an assume or similar intrinsics in the loop)...
Definition: CodeMetrics.cpp:72
struct llvm::VPTransformState::CFGState CFG
Value * CreateInBoundsGEP(Value *Ptr, ArrayRef< Value *> IdxList, const Twine &Name="")
Definition: IRBuilder.h:1477
Type * getVectorElementType() const
Definition: Type.h:371
uint64_t CallInst * C
Value * getValueOperand()
Definition: Instructions.h:410
unsigned getSmallConstantTripCount(const Loop *L)
Returns the maximum trip count of the loop if it is a single-exit loop and we can compute a small max...
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:68
Value * getOrCreateScalarValue(Value *V, const VPIteration &Instance)
Return a value in the new loop corresponding to V from the original loop at unroll and vector indices...
Main class to build the VPlan H-CFG for an incoming IR.
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:259
void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction, which must be an operator which supports these flags.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
Type * getWidestInductionType()
Returns the widest induction type.
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional &#39;br Cond, TrueDest, FalseDest&#39; instruction.
Definition: IRBuilder.h:854
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1949
void ReplaceInstWithInst(BasicBlock::InstListType &BIL, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:636
bool appendInstruction(Instruction *Instr)
Augment the recipe to include Instr, if it lies at its End.
Definition: VPlan.h:711
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, TargetLibraryInfo &TLI, DominatorTree &DT)
This is the entry point for all transforms.
DemandedBits * DB
Demanded bits analysis.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
BinaryOperator * getInductionBinOp() const
static IntegerType * getInt1Ty(LLVMContext &C)
Definition: Type.cpp:173
Diagnostic information for missed-optimization remarks.
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:225
InstWidening
Decision that was taken during cost calculation for memory instruction.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1563
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1298
bool isLegalMaskedScatter(Type *DataType)
Returns true if the target machine supports masked scatter operation for the given DataType...
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock *> Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
LLVMContext & Context
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:250
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF=1)
Returns true if I is a memory instruction with consecutive memory access that can be widened...
DiagnosticInfoOptimizationBase::Argument NV
void setFast(bool B=true)
Definition: Operator.h:231
bool isMaskRequired(const Instruction *I)
Returns true if vector representation of the instruction I requires mask.
void execute(VPTransformState &State) override
Generate the wide load/store.
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:1669
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
VPInterleaveRecipe * tryToInterleaveMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan)
Check if belongs to an Interleave Group within the given VF Range,.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:770
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
This class represents lattice values for constants.
Definition: AllocatorList.h:24
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr)
Get the interleaved access group that Instr belongs to.
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:78
This is the interface for a simple mod/ref and alias analysis over globals.
Pass * createLoopVectorizePass(bool InterleaveOnlyWhenForced=false, bool VectorizeOnlyWhenForced=false)
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
Optional< VPIteration > Instance
Hold the indices to generate specific scalar instructions.
Definition: VPlan.h:248
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:65
#define LLVM_FALLTHROUGH
Definition: Compiler.h:86
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, unsigned Align, const char *Name)
Provided to resolve &#39;CreateAlignedLoad(Ptr, Align, "...")&#39; correctly, instead of converting the strin...
Definition: IRBuilder.h:1393
void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance, bool IfPredicateInstr)
A helper function to scalarize a single Instruction in the innermost loop.
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:859
std::unique_ptr< LoopVersioning > LVer
LoopVersioning.
void emitSCEVChecks(Loop *L, BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we&#39;ve had to make are correct.
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:454
Interval Class - An Interval is a set of nodes defined such that every node in the interval has all o...
Definition: Interval.h:37
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost...
ConstantInt * getConstIntStepValue() const
Min/max implemented in terms of select(cmp()).
Definition: IVDescriptors.h:73
This class represents zero extension of integer types.
Value * getScalarValue(Value *Key, const VPIteration &Instance)
Retrieve the existing scalar value that corresponds to Key and Instance.
Definition: VPlan.h:173
Instruction::BinaryOps getInductionOpcode() const
Returns binary opcode of the induction operator.
LoopInfo * LI
Loop Info.
Value * createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL)
Returns a bitcasted value to the requested vector type.
static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, const TargetTransformInfo &TTI)
Estimate the overhead of scalarizing an instruction.
This provides a very simple, boring adaptor for a begin and end iterator into a range type...
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:289
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:1050
value_op_iterator value_op_begin()
Definition: User.h:256
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value *> VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal, MD_access_group].
static void AddRuntimeUnrollDisableMetaData(Loop *L)
The main scalar evolution driver.
bool isUniform(Value *V)
Returns true if the value V is uniform within the loop.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class...
static std::string getDebugLocString(const Loop *L)
IRBuilder & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:313
This is a helper struct for maintaining vectorization state.
Definition: VPlan.h:109
This class represents a function call, abstracting a target machine&#39;s calling convention.
const ValueToValueMap & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
This file contains the declarations for metadata subclasses.
void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF)
Vectorize a single PHINode in a block.
unsigned getNumberOfRegisters(bool Vector) const
DominatorTree * DT
Dominator Tree.
void collectTriviallyDeadInstructions(SmallPtrSetImpl< Instruction *> &DeadInstructions)
Collect the instructions from the original loop that would be trivially dead in the vectorized loop i...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:174
An immutable pass that tracks lazily created AssumptionCache objects.
VPWidenMemoryInstructionRecipe * tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan)
Check if is a memory instruction to be widened for Range.Start and potentially masked.
static Type * largestIntegerVectorType(Type *T1, Type *T2)
void fixLCSSAPHIs()
The Loop exit block may have single value PHI nodes with some incoming value.
unsigned less or equal
Definition: InstrTypes.h:672
unsigned less than
Definition: InstrTypes.h:671
Helper class to create VPRecipies from IR instructions.
A cache of @llvm.assume calls within a function.
Analysis pass providing the TargetTransformInfo.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Instruction *EntryVal)
Create a vector induction phi node based on an existing scalar one.
Externally visible function.
Definition: GlobalValue.h:49
Loop * TheLoop
The loop that we evaluate.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
bool tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan, VPBasicBlock *VPBB)
Check if a recipe can be create for I withing the given VF Range.
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:705
void initializeLoopVectorizePass(PassRegistry &)
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop *> &V)
A Recipe for widening load/store operations.
Definition: VPlan.h:938
Value * TripCount
Trip count of the original loop.
void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID, const Instruction *EntryVal, Value *VectorLoopValue, unsigned Part, unsigned Lane=UINT_MAX)
If there is a cast involved in the induction variable ID, which should be ignored in the vectorized l...
This class implements a map that also provides access to all stored values in a deterministic order...
Definition: MapVector.h:38
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
int getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type *> Tys) const
value_op_iterator value_op_end()
Definition: User.h:259
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1186
BasicBlock * getSuccessor(unsigned i) const
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1663
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
InductionKind getKind() const
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:34
Metadata node.
Definition: Metadata.h:864
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:231
F(f)
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1069
An instruction for reading from memory.
Definition: Instructions.h:168
Hexagon Common GEP
Value * getCondition() const
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:138
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:230
bool hasAnyScalarValue(Value *Key) const
Definition: VPlan.h:147
#define op(i)
virtual Value * getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps Opcode=Instruction::BinaryOpsEnd)
This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) to each vector element of Val...
This defines the Use class.
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:431
Value * getStartValue() const
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
const TargetLibraryInfo * TLI
Target Library Info.
static const char *const LLVMLoopVectorizeFollowupAll
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:93
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:33
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don&#39;t restrict interleaved unrolling to small loops.
op_iterator op_begin()
Definition: User.h:230
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:64
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:551
VPValue * createBlockInMask(BasicBlock *BB, VPlanPtr &Plan)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
Value * get(VPValue *Def, unsigned Part)
Get the generated Value for a given VPValue and a given Part.
Definition: VPlan.h:264
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:130
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM)
static bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op...
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:269
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, unsigned Align, bool isVolatile=false)
Definition: IRBuilder.h:1430
bool isIdenticalTo(const Instruction *I) const
Return true if the specified instruction is exactly identical to the current one. ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
bool formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:360
bool hasAnyVectorValue(Value *Key) const
Definition: VPlan.h:132
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:221
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
unsigned getMaxInterleaveFactor(unsigned VF) const
unsigned VF
The chosen Vectorization and Unroll Factors of the loop being vectorized.
Definition: VPlan.h:242
virtual ~InnerLoopVectorizer()=default
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition: VPlan.h:912
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
bool isScalarWithPredication(Instruction *I, unsigned VF=1)
Returns true if I is an instruction that will be scalarized with predication.
AnalysisUsage & addRequired()
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:134
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:51
void print(raw_ostream &O, const Twine &Indent) const override
Print the recipe.
bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range)
Check if I can be widened within the given VF Range.
bool hasBranchDivergence() const
Return true if branch divergence exists.
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication, or in order to mask away gaps.
This file defines the LoopVectorizationLegality class.
This class represents the LLVM &#39;select&#39; instruction.
const DataLayout & getDataLayout() const
Get the data layout for the module&#39;s target platform.
Definition: Module.cpp:371
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:690
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal)
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:353
bool isFloatingPointTy() const
Return true if this is one of the six floating-point types.
Definition: Type.h:162
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I=nullptr) const
bool isReverse() const
Definition: VectorUtils.h:268
SmallVector< PHINode *, 8 > OrigPHIsToFix
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:1444
A Use represents the edge between a Value definition and its users.
Definition: Use.h:56
iterator find(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:383
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:652
bool hasVectorValue(Value *Key, unsigned Part) const
Definition: VPlan.h:137
void setIsInBounds(bool b=true)
Set or clear the inbounds flag on this GEP instruction.
Diagnostic information for optimization analysis remarks.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
const TargetLibraryInfo * TLI
Target Library Info.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:197
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:743
Value * emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, const InductionDescriptor &ID) const
Compute the transformed value of Index at offset StartValue using step StepValue. ...
VectorizerValueMap & ValueMap
Hold a reference to the Value state information used when generating the Values of the output IR...
Definition: VPlan.h:317
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:107
This file contains the simple types necessary to represent the attributes associated with functions a...
A recipe for handling all phi nodes except for integer and FP inductions.
Definition: VPlan.h:748
Legacy analysis pass which computes BlockFrequencyInfo.
void setName(const Twine &newName)
Definition: VPlan.h:399
void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, unsigned Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF...
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1014
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:285
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:945
RPOIterator endRPO() const
Definition: LoopIterator.h:141
bool isOptimizableIVTruncate(Instruction *I, unsigned VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable...
uint64_t getNumElements() const
Definition: DerivedTypes.h:359
BasicBlock * LoopExitBlock
The ExitBlock of the scalar loop.
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize.
Value * getOrCreateVectorTripCount(Loop *NewLoop)
Returns (and creates if needed) the trip count of the widened loop.
CallInst * CreateMaskedGather(Value *Ptrs, unsigned Align, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:523
static const unsigned TinyTripCountInterleaveThreshold
We don&#39;t interleave loops with a known constant trip count below this number.
virtual Value * reverseVector(Value *Vec)
Generate a shuffle sequence that will reverse the vector Vec.
This file implements a class to represent arbitrary precision integral constant values and operations...
BlockT * getHeader() const
Definition: LoopInfo.h:100
bool isLegalMaskedScatter(Type *DataType) const
Return true if the target supports masked gather/scatter AVX-512 fully supports gather and scatter fo...
VPBlendRecipe * tryToBlend(Instruction *I, VPlanPtr &Plan)
Handle non-loop phi nodes.
int isConsecutivePtr(Value *Ptr)
Check if this pointer is consecutive when vectorizing.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
Instruction * clone() const
Create a copy of &#39;this&#39; instruction that is identical in all ways except the following: ...
Fast - This calling convention attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:43
InstTy * getMember(unsigned Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:310
Class to represent function types.
Definition: DerivedTypes.h:103
static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI)
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1732
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it&#39;s free to truncate a value of type Ty1 to type Ty2.
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:1684
VPRegionBlock * createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe, VPlanPtr &Plan)
Create a replicating region for instruction I that requires predication.
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:388
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:142
unsigned End
Definition: VPlan.h:73
bool isInductionVariable(const Value *V)
Returns True if V can be considered as an induction variable in this loop.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static bool isEqual(const Function &Caller, const Function &Callee)
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
Definition: LoopInfoImpl.h:251
LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
This file provides a LoopVectorizationPlanner class.
void fixNonInductionPHIs(void)
Fix the non-induction PHIs in the OrigPHIsToFix vector.
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
Definition: LoopInfo.cpp:239
This file provides utility VPlan to VPlan transformations.
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
const T & getValue() const LLVM_LVALUE_FUNCTION
Definition: Optional.h:161
void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc=nullptr)
Widen an integer or floating-point induction variable IV.
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Definition: Type.cpp:621
void execute(VPTransformState &State) override
Generate the phi/select nodes.
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< unsigned > VFs)
void addTopLevelLoop(LoopT *New)
This adds the specified loop to the collection of top-level loops.
Definition: LoopInfo.h:741
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:133
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:83
static Type * smallestIntegerVectorType(Type *T1, Type *T2)
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:27
This header provides classes for managing per-loop analyses.
static CmpInst * Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2, const Twine &Name="", Instruction *InsertBefore=nullptr)
Construct a compare instruction, given the opcode, the predicate and the two operands.
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.h:2196
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
ReductionList * getReductionVars()
Returns the reduction variables found in the loop.
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef< Value *> Args, FastMathFlags FMF, unsigned VF=1) const
PHINode * getPrimaryInduction()
Returns the primary induction variable.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:126
iterator_range< User::op_iterator > arg_operands()
Definition: InstrTypes.h:1127
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlan.h:234
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1031
A struct that represents some properties of the register usage of a loop.
Value * getOrCreateTripCount(Loop *NewLoop)
Returns (and creates if needed) the original loop trip count.
Value * getLoadStorePointerOperand(Value *V)
A helper function that returns the pointer operand of a load or store instruction.
An instruction for storing to memory.
Definition: Instructions.h:321
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:151
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:209
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction...
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:429
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index=-1) const
bool blockNeedsPredication(BasicBlock *BB)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1659
bool isLegalGatherOrScatter(Value *V)
Returns true if the target machine can represent V as a masked gather or scatter operation.
Reverse the order of the vector.
Debug location.
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:291
Optional< const DILocation * > cloneWithDuplicationFactor(unsigned DF) const
Returns a new DILocation with duplication factor DF * current duplication factor encoded in the discr...
void perform(LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:817
Value * getOrCreateVectorValue(Value *V, unsigned Part)
getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a vector or scalar value on-...
iterator find(const KeyT &Key)
Definition: MapVector.h:148
void truncateToMinimalBitwidths()
Shrinks vector element sizes to the smallest bitwidth they can be legally represented as...
VectorType * getType() const
Overload to return most specific vector type.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:145
int getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
BasicBlock * createVectorizedLoopSkeleton()
Create a new empty loop.
void execute(VPTransformState &State) override
Produce widened copies of all Ingredients.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type *> Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1020
void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
This class represents a truncation of integer types.
void resetScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar)
Reset the scalar value associated with Key for Part and Lane.
Definition: VPlan.h:217
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, Instruction *InsertBefore, Value *FlagsOp)
bool isPredicatedInst(Instruction *I)
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Definition: IRBuilder.h:127
Value * getOperand(unsigned i) const
Definition: User.h:170
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:211
bool runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, std::function< const LoopAccessInfo &(Loop &)> &GetLAA_, OptimizationRemarkEmitter &ORE)
Pointer induction var. Step = C / sizeof(elem).
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs as needed to retain SSA form.
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return &#39;this&#39;.
Definition: Type.h:304
void widenInstruction(Instruction &I)
Widen a single instruction within the innermost loop.
Value * CreateFCmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1957
ExtractSubvector Index indicates start offset.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
std::pair< Instruction *, Instruction * > addRuntimeChecks(Instruction *Loc) const
Add code that checks at runtime if the accessed arrays overlap.
bool isVoidTy() const
Return true if this is &#39;void&#39;.
Definition: Type.h:141
bool InterleaveOnlyWhenForced
If false, consider all loops for interleaving.
Definition: LoopVectorize.h:83
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe...
Definition: VPlan.h:1030
an instruction for type-safe pointer arithmetic to access elements of arrays and structs ...
Definition: Instructions.h:854
void setBestPlan(unsigned VF, unsigned UF)
Finalize the best decision and dispose of all other VPlans.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata *> MDs)
Definition: Metadata.h:1166
static bool runOnFunction(Function &F, bool PostInlining)
#define P(N)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:423
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *CountRoundDown, Value *EndValue, BasicBlock *MiddleBlock)
Set up the values of the IVs correctly when exiting the vector loop.
This instruction inserts a single (scalar) element into a VectorType value.
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:200
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment) const
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:136
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
Integer induction variable. Step = C.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1247
Wrapper pass for TargetTransformInfo.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:154
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:217
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:308
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:234
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:429
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF)
A helper function that returns true if the given type is irregular.
unsigned getRegisterBitWidth(bool Vector) const
Value * concatenateVectors(IRBuilder<> &Builder, ArrayRef< Value *> Vecs)
Concatenate a list of vectors.
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
bool isProfitableToScalarize(Instruction *I, unsigned VF) const
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:69
void fixVectorizedLoop()
Fix the vectorized code, taking care of header phi&#39;s, live-outs, and more.
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:287
Conditional or Unconditional Branch instruction.
Min/max implemented in terms of select(cmp()).
Definition: IVDescriptors.h:76
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1483
Value handle that tracks a Value across RAUW.
Definition: ValueHandle.h:337
BasicBlock * LoopScalarBody
The scalar loop body.
unsigned getNumberOfParts(Type *Tp) const
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node&#39;s...
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1185
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:149
const char * getOpcodeName() const
Definition: Instruction.h:128
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static Instruction * getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or it&#39;s operands.
This is an important base class in LLVM.
Definition: Constant.h:42
bool hasScalarValue(Value *Key, const VPIteration &Instance) const
Definition: VPlan.h:152
This analysis provides dependence information for the memory accesses of a loop.
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:188
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Definition: SmallVector.h:129
void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr)
Set the debug location in the builder using the debug location in the instruction.
const SCEV * getStep() const
Constant * createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool getDecisionAndClampRange(const std::function< bool(unsigned)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF&#39;s.
Value * createMinMaxOp(IRBuilder<> &Builder, RecurrenceDescriptor::MinMaxRecurrenceKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
Definition: LoopUtils.cpp:681
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.h:2021
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:224
static Value * addFastMathFlag(Value *V)
A helper function that adds a &#39;fast&#39; flag to floating-point operations.
const unsigned Start
Definition: VPlan.h:70
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
A manager for alias analyses.
unsigned selectInterleaveCount(bool OptForSize, unsigned VF, unsigned LoopCost)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:371
static const char *const LLVMLoopVectorizeFollowupEpilogue
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
Diagnostic information for applied optimization remarks.
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:76
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
unsigned getAlignment() const
Definition: VectorUtils.h:270
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy=nullptr, const Instruction *I=nullptr) const
Represent the analysis usage information of a pass.
PHINode * Induction
The new Induction variable which was added to the new block.
op_iterator op_end()
Definition: User.h:232
OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I=nullptr)
Create an analysis remark that explains why vectorization failed.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1193
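A small illustrative use of the range-based any_of wrapper, assuming a SmallVector of instructions; touchesMemory is a hypothetical helper, not from this file.
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Instruction.h"

  // Returns true if any instruction in Worklist may read or write memory.
  static bool touchesMemory(const llvm::SmallVectorImpl<llvm::Instruction *> &Worklist) {
    return llvm::any_of(Worklist, [](const llvm::Instruction *I) {
      return I->mayReadOrWriteMemory();
    });
  }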
bool optForSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:598
bool isReductionVariable(PHINode *PN)
Returns True if PN is a reduction variable in this loop.
Analysis pass providing a never-invalidated alias analysis result.
Value * expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I)
Insert code to directly compute the specified SCEV expression into the program.
void fixCrossIterationPHIs()
Handle all cross-iteration phis in the header.
void fixFirstOrderRecurrence(PHINode *Phi)
Fix a first-order recurrence.
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:285
unsigned getIndex(const InstTy *Instr) const
Get the index for the given member.
Definition: VectorUtils.h:321
static FunctionType * get(Type *Result, ArrayRef< Type *> Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
Definition: Type.cpp:297
VectorizationFactor selectVectorizationFactor(unsigned MaxVF)
op_range operands()
Definition: User.h:238
bool isLegalMaskedStore(Type *DataType) const
Return true if the target supports masked load/store. AVX2 and AVX-512 targets allow masks for consecu...
Constant * createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1839
unsigned UF
The vectorization unroll factor to use.
Optional< unsigned > getLoopEstimatedTripCount(Loop *L)
Get a loop&#39;s estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:616
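A minimal sketch of querying the estimated trip count listed above; the threshold of 16 and the helper name are illustrative assumptions, not values taken from this file.
  #include "llvm/ADT/Optional.h"
  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/Transforms/Utils/LoopUtils.h"

  // Uses branch-weight metadata if present; the Optional is empty otherwise.
  static bool probablyRunsAtLeast16Times(llvm::Loop *L) {
    if (llvm::Optional<unsigned> TC = llvm::getLoopEstimatedTripCount(L))
      return *TC >= 16;
    return false; // No profile metadata: be conservative.
  }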
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
static CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast instruction.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2041
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch...
unsigned LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
const Value * getCondition() const
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:76
AssumptionCache * AC
Assumption cache.
bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1415
AssumptionCache * AC
Assumption Cache.
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs, and aliases.
Definition: Value.cpp:529
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:160
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:365
static void connectBlocks(VPBlockBase *From, VPBlockBase *To)
Connect VPBlockBases From and To bi-directionally.
Definition: VPlan.h:1478
static unsigned getIncomingValueNumForOperand(unsigned i)
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
size_t size() const
Definition: SmallVector.h:53
static wasm::ValType getType(const TargetRegisterClass *RC)
RecurrenceKind getRecurrenceKind()
const std::string & getModuleIdentifier() const
Get the module identifier which is, essentially, the name of the module.
Definition: Module.h:210
void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar)
Set a scalar value associated with Key and Instance.
Definition: VPlan.h:191
bool requiresScalarEpilogue() const
Returns true if an interleaved group requires a scalar iteration to handle accesses with gaps...
static unsigned getRecurrenceBinOp(RecurrenceKind Kind)
Returns the opcode of binary operation corresponding to the RecurrenceKind.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:965
void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
Definition: AsmWriter.cpp:4225
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1048
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1655
void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:58
static const char *const LLVMLoopVectorizeFollowupVectorized
BasicBlock * LoopVectorBody
The vector loop body.
unsigned first
void fixReduction(PHINode *Phi)
Fix a reduction cross-iteration phi.
OperandValueProperties
Additional properties of an operand&#39;s values.
void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT)
Generate the IR code for the body of the vectorized loop according to the best selected VPlan...
Planner drives the vectorization process after having passed Legality checks.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:63
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:1969
Value * CreateGEP(Value *Ptr, ArrayRef< Value *> IdxList, const Twine &Name="")
Definition: IRBuilder.h:1458
size_type size() const
Definition: SmallPtrSet.h:93
A function analysis which provides an AssumptionCache.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
static Constant * getSplat(unsigned NumElts, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1119
unsigned Lane
in [0..VF)
Definition: VPlan.h:89
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:240
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:110
bool isLegalMaskedGather(Type *DataType)
Returns true if the target machine supports masked gather operation for the given DataType...
const VPRecipeBase & back() const
Definition: VPlan.h:1005
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:298
Analysis pass which computes BlockFrequencyInfo.
Iterator for intrusive lists based on ilist_node.
unsigned getNumOperands() const
Definition: User.h:192
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
static bool mayDivideByZero(Instruction &I)
A helper function for checking whether an integer division-related instruction may divide by zero (in...
LoopVectorizationLegality * Legal
Vectorization legality.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:418
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPValue *Condition, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition: VPlan.h:1461
bool VectorizeOnlyWhenForced
If false, consider all loops for vectorization.
Definition: LoopVectorize.h:87
#define LV_NAME
This is the shared class of boolean and integer constants.
Definition: Constants.h:84
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file. ...
const SCEVUnionPredicate & getUnionPredicate() const
Type * getType() const
Return the LLVM type of this SCEV expression.
A struct for saving information about induction variables.
BlockVerifier::State From
A range of powers-of-2 vectorization factors with fixed start and adjustable end. ...
Definition: VPlan.h:68
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false...
Definition: SmallPtrSet.h:378
unsigned MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
bool canFoldTailByMasking()
Return true if we can vectorize this loop while folding its tail by masking.
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:334
iterator end()
Definition: BasicBlock.h:271
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
bool isFunctionVectorizable(StringRef F, unsigned VF) const
This holds vectorization requirements that must be verified late in the process.
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::init(false), cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
bool isAccessInterleaved(Instruction *Instr)
Check if Instr belongs to any interleaved access group.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:249
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
Module.h This file contains the declarations for the Module class.
Value * CreateInsertElement(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2054
Provides information about what library functions are available for the current target.
bool isLegalMaskedGather(Type *DataType) const
void buildVPlans(unsigned MinVF, unsigned MaxVF)
Build VPlans for power-of-2 VF&#39;s between MinVF and MaxVF inclusive, according to the information gath...
AddressSpace
Definition: NVPTXBaseInfo.h:22
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan)
A helper function that computes the predicate of the edge between SRC and DST.
Predicate
Predicate - These are "(BI << 5) | BO" for various predicates.
Definition: PPCPredicates.h:27
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
size_type count(const KeyT &Key) const
Definition: MapVector.h:143
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:730
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition: Type.h:258
bool isFirstOrderRecurrence(const PHINode *Phi)
Returns True if Phi is a first-order recurrence in this loop.
LoopT * AllocateLoop(ArgsTy &&... Args)
Definition: LoopInfo.h:654
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:48
void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID)
Compute scalar induction steps.
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, unsigned Align, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:492
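Sketch of emitting a masked store through the builder method above; Val, Ptr, and Mask are assumed to already exist (e.g. a <4 x i32> value, a matching pointer, and a <4 x i1> mask), and the 4-byte alignment is an assumption for the example.
  #include "llvm/IR/IRBuilder.h"

  // Store only the lanes of Val whose mask bit is set.
  static void emitMaskedStore(llvm::IRBuilder<> &Builder, llvm::Value *Val,
                              llvm::Value *Ptr, llvm::Value *Mask) {
    Builder.CreateMaskedStore(Val, Ptr, /*Align=*/4, Mask);
  }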
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:381
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:307
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:180
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2068
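A hypothetical helper (not from the vectorizer) combining the constant and shuffle builders listed here to reverse the lanes of a 4-element vector; the fixed width of 4 is an assumption for the example.
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/IRBuilder.h"

  // Produces: shufflevector <4 x T> V, <4 x T> undef, <4 x i32> <3, 2, 1, 0>
  static llvm::Value *reverseVector4(llvm::IRBuilder<> &Builder, llvm::Value *V) {
    llvm::SmallVector<llvm::Constant *, 4> Idxs;
    for (unsigned I = 0; I < 4; ++I)
      Idxs.push_back(Builder.getInt32(3 - I));
    return Builder.CreateShuffleVector(V, llvm::UndefValue::get(V->getType()),
                                       llvm::ConstantVector::get(Idxs), "reverse");
  }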
Drive the analysis of memory accesses in the loop.
static Constant * getSignedIntOrFpConstant(Type *Ty, int64_t C)
A helper function that returns an integer or floating-point constant with value C.
VectorizationFactor plan(bool OptForSize, unsigned UserVF)
Plan how to best vectorize, return the best VF and its cost.
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:622
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.cpp:636
static BranchInst * Create(BasicBlock *IfTrue, Instruction *InsertBefore=nullptr)
bool isConditional() const
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
pred_range predecessors(BasicBlock *BB)
Definition: CFG.h:125
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:423
static Constant * get(Type *Ty, double V)
This returns a ConstantFP, or a vector containing a splat of a ConstantFP, for the specified value in...
Definition: Constants.cpp:685
CallInst * CreateMaskedLoad(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:471
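Sketch of the corresponding masked load; Ptr, Mask, and PassThru are assumed to be available (PassThru supplies the disabled lanes), and the alignment and result name are illustrative.
  #include "llvm/IR/IRBuilder.h"

  // Load only the enabled lanes from Ptr; disabled lanes come from PassThru.
  static llvm::Value *emitMaskedLoad(llvm::IRBuilder<> &Builder, llvm::Value *Ptr,
                                     llvm::Value *Mask, llvm::Value *PassThru) {
    return Builder.CreateMaskedLoad(Ptr, /*Align=*/4, Mask, PassThru,
                                    "wide.masked.load");
  }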
bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the intrinsic has a scalar operand.
Definition: VectorUtils.cpp:89
unsigned getNumIncomingValues() const
Return the number of incoming edges.
unsigned getFactor() const
Definition: VectorUtils.h:269
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1093
StringRef getVectorizedFunction(StringRef F, unsigned VF) const
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2&#39;s erase_if which is equivalent t...
Definition: STLExtras.h:1330
void setOperand(unsigned i, Value *Val)
Definition: User.h:175
Value * createTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI, RecurrenceDescriptor &Desc, Value *Src, bool NoNaN=false)
Create a generic target reduction using a recurrence descriptor Desc. The target is queried to determi...
Definition: LoopUtils.cpp:877
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:133
Optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:246
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:176
unsigned getVectorNumElements() const
Definition: DerivedTypes.h:462
static Constant * getRecurrenceIdentity(RecurrenceKind K, Type *Tp)
Returns identity corresponding to the RecurrenceKind.
Store the result of a depth first search within basic blocks contained by a single loop...
Definition: LoopIterator.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:216
DenseMap< PHINode *, Value * > IVEndValues
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:801
Class to represent vector types.
Definition: DerivedTypes.h:393
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:56
Class for arbitrary precision integers.
Definition: APInt.h:70
bool isInductionPhi(const Value *V)
Returns True if V is a Phi node of an induction variable in this loop.
void collectUniformsAndScalars(unsigned VF)
Collect Uniform and Scalar values for the given VF.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:601
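Tiny illustrative use of hash_combine; hashing a (width, count) pair as a cache key is an assumed use case, not one taken from this file.
  #include "llvm/ADT/Hashing.h"
  #include <utility>

  // Combine two unsigned values into a single hash_code.
  static llvm::hash_code hashPair(const std::pair<unsigned, unsigned> &P) {
    return llvm::hash_combine(P.first, P.second);
  }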
void addMetadata(Instruction *To, Instruction *From)
Add metadata from one instruction to another.
const LoopAccessInfo * getLAI() const
void resetVectorValue(Value *Key, unsigned Part, Value *Vector)
Reset the vector value associated with Key for the given Part.
Definition: VPlan.h:208
LoopInfo * LI
Loop Info analysis.
iterator_range< user_iterator > users()
Definition: Value.h:400
static void DFS(BasicBlock *Root, SetVector< BasicBlock *> &Set)
static const char lv_name[]
This class uses information about analyzed scalars to rewrite expressions in canonical form...
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints)
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:479
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
void vectorizeMemoryInstruction(Instruction *Instr, VectorParts *BlockInMask=nullptr)
Vectorize Load and Store instructions, optionally masking the vector operations if BlockInMask is non...
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:111
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:327
const TargetTransformInfo & TTI
Vector target information.
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:292
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
LoopVectorizationCostModel * Cost
The profitablity analysis.
Analysis pass that exposes the ScalarEvolution for a function.
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, unsigned VF, InstWidening W, unsigned Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
Constant * createStrideMask(IRBuilder<> &Builder, unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
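Illustrative call to the stride-mask helper above, using the signature shown here; the chosen Start/Stride/VF values and the helper name are assumptions for the example.
  #include "llvm/Analysis/VectorUtils.h"
  #include "llvm/IR/IRBuilder.h"

  // For Start = 0, Stride = 2, VF = 4 this yields the mask <0, 2, 4, 6>,
  // i.e. every second lane of a concatenated pair of vectors.
  static llvm::Constant *evenLanesMask(llvm::IRBuilder<> &Builder) {
    return llvm::createStrideMask(Builder, /*Start=*/0, /*Stride=*/2, /*VF=*/4);
  }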
uint64_t getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:568
PHINode * OldInduction
The induction variable of the old basic block.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1810
bool blockNeedsPredication(BasicBlock *BB)
bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF=1)
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:436
LoopT * getParentLoop() const
Definition: LoopInfo.h:101
static unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks...
Analysis pass providing a never-invalidated alias analysis result.
bool empty() const
Definition: VPlan.h:1002
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
bool hasValue() const
Definition: Optional.h:165
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:215
bool isLegalMaskedLoad(Type *DataType) const
This analysis provides dependence information for the memory accesses of a loop.
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Definition: SmallVector.h:133
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:311
static Type * getMemInstValueType(Value *I)
A helper function that returns the type of loaded or stored value.
void registerAssumption(CallInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
void packScalarIntoVectorValue(Value *V, const VPIteration &Instance)
Construct the vector value of a scalarized value V one lane at a time.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
static unsigned getVectorCallCost(CallInst *CI, unsigned VF, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, bool &NeedToScalarize)
iterator begin() const
Definition: SmallPtrSet.h:397
This class represents an analyzed expression in the program.
Value * getVectorValue(Value *Key, unsigned Part)
Retrieve the existing vector value that corresponds to Key and Part.
Definition: VPlan.h:166
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
Definition: LoopInfo.h:331
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:832
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:56
bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const
Floating point induction variable.
iterator begin()
Definition: DenseMap.h:100
void execute(VPTransformState &State) override
Generate the phi/select nodes.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:465
VPBasicBlock * handleReplication(Instruction *I, VFRange &Range, VPBasicBlock *VPBB, DenseMap< Instruction *, VPReplicateRecipe *> &PredInst2Recipe, VPlanPtr &Plan)
Build a VPReplicateRecipe for I and enclose it within a Region if it is predicated.
InductionList * getInductionVars()
Returns the induction variables found in the loop.
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:214
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation.
Definition: InstrTypes.h:1181
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
const NodeList & List
Definition: RDFGraph.cpp:210
bool isUniformAfterVectorization(Instruction *I, unsigned VF) const
Returns true if I is known to be uniform after vectorization.
void collectInstsToScalarize(unsigned VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
static Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
#define I(x, y, z)
Definition: MD5.cpp:58
#define N
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
user_iterator_impl< User > user_iterator
Definition: Value.h:369
bool isLegalMaskedStore(Type *DataType, Value *Ptr)
Returns true if the target machine supports masked store operation for the given DataType and kind of...
void execute(VPTransformState &State) override
Generate the vectorized and scalarized versions of the phi node as needed by their users...
PHINode * createInductionVariable(Loop *L, Value *Start, Value *End, Value *Step, Instruction *DL)
Create a new induction variable inside L.
iterator end()
Definition: DenseMap.h:109
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:193
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:4809
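Small sketch of the verifier entry point listed above; writing diagnostics to errs() and the helper name isBroken are assumptions for the example.
  #include "llvm/IR/Function.h"
  #include "llvm/IR/Verifier.h"
  #include "llvm/Support/raw_ostream.h"

  // verifyFunction returns true when errors are found.
  static bool isBroken(const llvm::Function &F) {
    return llvm::verifyFunction(F, &llvm::errs());
  }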
Optional< unsigned > computeMaxVF(bool OptForSize)
iterator_range< value_op_iterator > operand_values()
Definition: User.h:262
Loop * OrigLoop
The original loop.
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
unsigned getWideningCost(Instruction *I, unsigned VF)
Return the vectorization cost for the given instruction I and vector width VF.
int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index=0, Type *SubTp=nullptr) const
void preserve()
Mark an analysis as preserved.
Definition: PassManager.h:175
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:325
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value *> Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1974
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:408
int getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info=OK_AnyValue, OperandValueKind Opd2Info=OK_AnyValue, OperandValueProperties Opd1PropInfo=OP_None, OperandValueProperties Opd2PropInfo=OP_None, ArrayRef< const Value *> Args=ArrayRef< const Value *>()) const
This is an approximation of reciprocal throughput of a math/logic op.
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector and ...
Definition: VPlan.h:724
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
InstWidening getWideningDecision(Instruction *I, unsigned VF)
Return the cost model decision for the given instruction I and vector width VF.
iterator end() const
Definition: SmallPtrSet.h:402
LLVM_NODISCARD bool empty() const
Definition: DenseMap.h:123
void setVectorValue(Value *Key, unsigned Part, Value *Vector)
Set a vector value associated with Key and Part.
Definition: VPlan.h:180
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:211
void updateAnalysis()
Insert the new loop to the loop hierarchy and pass manager and update the analysis passes...
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:173
bool shouldMaximizeVectorBandwidth(bool OptSize) const
bool empty() const
Definition: LoopInfo.h:146
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:794
Analysis pass providing the TargetLibraryInfo.
unsigned getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
AliasAnalysis * AA
Alias Analysis.
LoopVectorizationLegality * Legal
The legality analysis.
const SmallVectorImpl< const SCEVPredicate * > & getPredicates() const
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1769
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
Constant * createInterleaveMask(IRBuilder<> &Builder, unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
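Illustrative call to the interleave-mask helper above; the VF/NumVecs values and the helper name are assumptions for the example.
  #include "llvm/Analysis/VectorUtils.h"
  #include "llvm/IR/IRBuilder.h"

  // For VF = 4 and NumVecs = 2 this yields <0, 4, 1, 5, 2, 6, 3, 7>, which
  // interleaves the lanes of two 4-element vectors when fed to a shufflevector.
  static llvm::Constant *interleave2x4Mask(llvm::IRBuilder<> &Builder) {
    return llvm::createInterleaveMask(Builder, /*VF=*/4, /*NumVecs=*/2);
  }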
iterator end()
Definition: MapVector.h:72
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
SmallPtrSet< Instruction *, 8 > & getCastInsts()
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned VF
The vectorization SIMD factor to use.
Definition: JSON.cpp:598
CallInst * CreateMaskedScatter(Value *Val, Value *Ptrs, unsigned Align, Value *Mask=nullptr)
Create a call to Masked Scatter intrinsic.
Definition: IRBuilder.cpp:554
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:483
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
VPWidenRecipe is a recipe for producing a copy of vector type for each Instruction in its ingredients...
Definition: VPlan.h:688
unsigned getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:115
IRBuilder Builder
The builder that we use.
uint64_t PowerOf2Floor(uint64_t A)
Returns the power of two which is less than or equal to the given value.
Definition: MathExtras.h:652
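A tiny sketch of PowerOf2Floor; clamping a candidate width to a power of two is an assumed use, and the helper name is hypothetical.
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  // e.g. 7 -> 4, 8 -> 8.
  static unsigned clampToPowerOfTwo(unsigned Width) {
    assert(Width != 0 && "expected a non-zero width");
    return static_cast<unsigned>(llvm::PowerOf2Floor(Width));
  }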
InstTy * getInsertPos() const
Definition: VectorUtils.h:330
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:566
LLVM Value Representation.
Definition: Value.h:73
virtual Value * getBroadcastInstrs(Value *V)
Create a broadcast instruction.
A recipe for generating conditional branches on the bits of a mask.
Definition: VPlan.h:877
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Definition: DataLayout.h:419
bool isLegalMaskedLoad(Type *DataType, Value *Ptr)
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:606
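Illustrative sketch combining VectorType::get with the splat-producing ConstantInt::get overload listed earlier on this page; the element type, width, and helper name are assumptions for the example.
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"

  // Build the <4 x i32> type and the constant <4 x i32> <1, 1, 1, 1>.
  static llvm::Constant *splatOfOnes(llvm::LLVMContext &Ctx) {
    llvm::Type *I32 = llvm::IntegerType::get(Ctx, 32);
    llvm::VectorType *V4I32 = llvm::VectorType::get(I32, 4);
    return llvm::ConstantInt::get(V4I32, 1); // splats because V4I32 is a vector type
  }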
The LoopVectorize Pass.
Definition: LoopVectorize.h:80
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:137
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
OptimizationRemarkEmitter legacy analysis pass.
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:81
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:220
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:122
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition: Instruction.cpp:87
Broadcast element 0 to all other elements.
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:46
int getCFInstrCost(unsigned Opcode) const
uint64_t getTypeAllocSizeInBits(Type *Ty) const
Returns the offset in bits between successive objects of the specified type, including alignment padd...
Definition: DataLayout.h:446
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
unsigned getOperandsScalarizationOverhead(ArrayRef< const Value *> Args, unsigned VF) const
Type * getElementType() const
Definition: DerivedTypes.h:360
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:970
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
static Type * ToVectorTy(Type *Scalar, unsigned VF)
A helper function for converting Scalar types to vector types.
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, LoopVectorizationLegality::InductionList *Inductions, SmallPtrSetImpl< Instruction *> &DeadInstructions)
Replaces the VPInstructions in Plan with corresponding widen recipes.
const TargetTransformInfo * TTI
Target Transform Info.
bool hasOneUse() const
Return true if there is exactly one user of this value.
Definition: Value.h:413
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:160
OperandValueKind
Additional information about an operand&#39;s possible values.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
This is the interface for LLVM's primary stateless and local alias analysis.
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:770
A single uniqued string.
Definition: Metadata.h:604
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
Definition: VPlan.h:84
A container for analyses that lazily runs them and caches their results.
unsigned getMinimumVF(unsigned ElemWidth) const
VectorizerValueMap VectorLoopValueMap
Maps values from the original loop to their corresponding values in the vectorized loop...
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:260
This pass exposes codegen information to IR-level passes.
void selectUserVectorizationFactor(unsigned UserVF)
Setup cost-based decisions for user vectorization factor.
bool shouldScalarizeInstruction(Instruction *I) const
Returns true if an instruction I should be scalarized instead of vectorized for the chosen vectorizat...
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object...
void setIncomingValue(unsigned i, Value *V)
void perform(LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:181
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1075
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, Instruction *InsertBefore, Value *FlagsOp)
#define LLVM_DEBUG(X)
Definition: Debug.h:123
static cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:156
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I=nullptr) const
unsigned Part
in [0..UF)
Definition: VPlan.h:86
VPWidenIntOrFpInductionRecipe * tryToOptimizeInduction(Instruction *I, VFRange &Range)
Check if an induction recipe should be constructed for the given instruction within the given VF Range.
void vectorizeInterleaveGroup(Instruction *Instr, VectorParts *BlockInMask=nullptr)
Try to vectorize the interleaved access group that Instr belongs to, optionally masking the vector op...
void setCostBasedWideningDecision(unsigned VF)
Memory access instruction may be vectorized in more than one way.
static OperandValueKind getOperandInfo(Value *V, OperandValueProperties &OpProps)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
The optimization diagnostic interface.
bool isScalarAfterVectorization(Instruction *I, unsigned VF) const
Returns true if I is known to be scalar after vectorization.
bool use_empty() const
Definition: Value.h:323
#define OP(n)
Definition: regex2.h:73
static Constant * get(ArrayRef< Constant *> V)
Definition: Constants.cpp:1079
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
DenseMap< Instruction *, Instruction * > & getSinkAfter()
Return the set of instructions to sink to handle first-order recurrences.
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
#define T1
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition: VPlan.h:1489
RecurrenceKind
This enum represents the kinds of recurrences that we support.
Definition: IVDescriptors.h:66
const BasicBlock * getParent() const
Definition: Instruction.h:67
This class represents a constant integer value.
bool needsScalarInduction(Instruction *IV) const
Returns true if we should generate a scalar version of IV.
bool requiresScalarEpilogue() const
Returns true if this Group requires a scalar iteration to handle gaps.
Definition: VectorUtils.h:342
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM)
Legacy wrapper pass to provide the BasicAAResult object.
bool Need
This flag indicates if we need to add the runtime check.
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:1038
Type * getRecurrenceType()
Returns the type of the recurrence.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1245