141 #include <functional> 150 using namespace llvm;
152 #define LV_NAME "loop-vectorize" 153 #define DEBUG_TYPE LV_NAME 158 "llvm.loop.vectorize.followup_all";
160 "llvm.loop.vectorize.followup_vectorized";
162 "llvm.loop.vectorize.followup_epilogue";
165 STATISTIC(LoopsVectorized,
"Number of loops vectorized");
166 STATISTIC(LoopsAnalyzed,
"Number of loops analyzed for vectorization");
172 cl::desc(
"Loops with a constant trip count that is smaller than this " 173 "value are vectorized only if no scalar iteration overheads " 178 cl::desc(
"Maximize bandwidth when selecting vectorization factor which " 179 "will be determined by the smallest type in loop."));
183 cl::desc(
"Enable vectorization on interleaved memory accesses in a loop"));
189 cl::desc(
"Enable vectorization on masked interleaved memory accesses in a loop"));
197 cl::desc(
"A flag that overrides the target's number of scalar registers."));
201 cl::desc(
"A flag that overrides the target's number of vector registers."));
205 cl::desc(
"A flag that overrides the target's max interleave factor for " 210 cl::desc(
"A flag that overrides the target's max interleave factor for " 211 "vectorized loops."));
215 cl::desc(
"A flag that overrides the target's expected cost for " 216 "an instruction to a single constant value. Mostly " 217 "useful for getting consistent testing."));
222 "The cost of a loop that is considered 'small' by the interleaver."));
226 cl::desc(
"Enable the use of the block frequency analysis to access PGO " 227 "heuristics minimizing code growth in cold regions and being more " 228 "aggressive in hot regions."));
234 "Enable runtime interleaving until load/store ports are saturated"));
239 cl::desc(
"Max number of stores to be predicated behind an if."));
243 cl::desc(
"Count the induction variable only once when interleaving"));
247 cl::desc(
"Enable if predication of stores during vectorization."));
251 cl::desc(
"The maximum interleave count to use when interleaving a scalar " 252 "reduction in a nested loop."));
256 cl::desc(
"Enable VPlan-native vectorization path with " 257 "support for outer loop vectorization."));
266 "Build VPlan for every supported loop nest in the function and bail " 267 "out right after the build (stress test the VPlan H-CFG construction " 268 "in the VPlan-native vectorization path)."));
281 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
282 "Expected Load or Store instruction");
283 if (
auto *LI = dyn_cast<LoadInst>(I))
284 return LI->getType();
285 return cast<StoreInst>(
I)->getValueOperand()->getType();
314 if (isa<FPMathOperator>(V)) {
317 cast<Instruction>(V)->setFastMathFlags(Flags);
354 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
355 AC(AC), ORE(ORE),
VF(VecWidth),
UF(UnrollFactor),
356 Builder(PSE.getSE()->getContext()),
388 bool IfPredicateInstr);
506 Instruction::BinaryOpsEnd);
551 Value *VectorLoopValue,
553 unsigned Lane = UINT_MAX);
642 std::unique_ptr<LoopVersioning>
LVer;
727 UnrollFactor, LVL, CM) {}
733 Instruction::BinaryOpsEnd)
override;
750 if (
Instruction *OpInst = dyn_cast<Instruction>(*OI))
751 if (OpInst->getDebugLoc() !=
Empty)
759 if (
const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
761 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
762 !isa<DbgInfoIntrinsic>(Inst)) {
768 <<
"Failed to create new discriminator: " 769 << DIL->getFilename() <<
" Line: " << DIL->getLine());
784 LoopDbgLoc.print(OS);
798 if (
LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
799 LVer->annotateInstWithNoAlias(To, Orig);
810 for (
Value *V : To) {
835 : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
836 AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
850 collectUniformsAndScalars(UserVF);
851 collectInstsToScalarize(UserVF);
857 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
863 unsigned selectInterleaveCount(
bool OptForSize,
unsigned VF,
873 void setCostBasedWideningDecision(
unsigned VF);
890 void collectValuesToIgnore();
902 assert(VF > 1 &&
"Profitable to scalarize relevant only for VF > 1.");
909 auto Scalars = InstsToScalarize.find(VF);
910 assert(Scalars != InstsToScalarize.end() &&
911 "VF not yet analyzed for scalarization profitability");
912 return Scalars->second.find(I) != Scalars->second.end();
925 auto UniformsPerVF = Uniforms.find(VF);
926 assert(UniformsPerVF != Uniforms.end() &&
927 "VF not yet analyzed for uniformity");
928 return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
941 auto ScalarsPerVF = Scalars.find(VF);
942 assert(ScalarsPerVF != Scalars.end() &&
943 "Scalar values are not calculated for VF");
944 return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
950 return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
951 !isProfitableToScalarize(I, VF) &&
952 !isScalarAfterVectorization(I, VF);
969 assert(VF >= 2 &&
"Expected VF >=2");
970 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
977 assert(VF >= 2 &&
"Expected VF >=2");
980 for (
unsigned i = 0; i < Grp->
getFactor(); ++i) {
983 WideningDecisions[std::make_pair(
I, VF)] = std::make_pair(W, Cost);
985 WideningDecisions[std::make_pair(
I, VF)] = std::make_pair(W, 0);
994 assert(VF >= 2 &&
"Expected VF >=2");
999 return CM_GatherScatter;
1001 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1002 auto Itr = WideningDecisions.find(InstOnVF);
1003 if (Itr == WideningDecisions.end())
1005 return Itr->second.first;
1011 assert(VF >= 2 &&
"Expected VF >=2");
1012 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1013 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1014 "The cost is not calculated");
1015 return WideningDecisions[InstOnVF].second;
1036 Value *
Op = Trunc->getOperand(0);
1046 void collectInstsToScalarize(
unsigned VF);
1053 if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1055 setCostBasedWideningDecision(VF);
1056 collectLoopUniforms(VF);
1057 collectLoopScalars(VF);
1087 bool LI = isa<LoadInst>(V);
1088 bool SI = isa<StoreInst>(V);
1092 return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1100 bool isScalarWithPredication(
Instruction *
I,
unsigned VF = 1);
1106 if (!blockNeedsPredication(I->
getParent()))
1110 if (isa<LoadInst>(I) || isa<StoreInst>(
I))
1112 return isScalarWithPredication(I);
1117 bool memoryInstructionCanBeWidened(
Instruction *I,
unsigned VF = 1);
1122 bool interleavedAccessCanBeWidened(
Instruction *I,
unsigned VF = 1);
1126 return InterleaveInfo.isInterleaved(Instr);
1132 return InterleaveInfo.getInterleaveGroup(Instr);
1139 return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
1153 unsigned NumPredStores = 0;
1157 unsigned computeFeasibleMaxVF(
bool OptForSize,
unsigned ConstTripCount);
1166 using VectorizationCostTy = std::pair<unsigned, bool>;
1172 VectorizationCostTy expectedCost(
unsigned VF);
1176 VectorizationCostTy getInstructionCost(
Instruction *I,
unsigned VF);
1180 unsigned getInstructionCost(
Instruction *I,
unsigned VF,
Type *&VectorTy);
1183 unsigned getMemoryInstructionCost(
Instruction *I,
unsigned VF);
1186 unsigned getMemInstScalarizationCost(
Instruction *I,
unsigned VF);
1189 unsigned getInterleaveGroupCost(
Instruction *I,
unsigned VF);
1192 unsigned getGatherScatterCost(
Instruction *I,
unsigned VF);
1196 unsigned getConsecutiveMemOpCost(
Instruction *I,
unsigned VF);
1202 unsigned getUniformMemOpCost(
Instruction *I,
unsigned VF);
1218 RemarkName, TheLoop);
1242 bool IsScalarEpilogueAllowed =
true;
1245 bool FoldTailByMasking =
false;
1282 void collectLoopUniforms(
unsigned VF);
1289 void collectLoopScalars(
unsigned VF);
1294 std::pair<InstWidening, unsigned>>;
1360 assert(!OuterLp->
empty() &&
"This is not an outer loop");
1369 if (!Hints.allowVectorization(Fn, OuterLp,
1371 LLVM_DEBUG(
dbgs() <<
"LV: Loop hints prevent outer loop vectorization.\n");
1375 if (!Hints.getWidth()) {
1376 LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing: No user vector width.\n");
1377 Hints.emitRemarkWithHints();
1381 if (Hints.getInterleave() > 1) {
1383 LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing: Interleave is not supported for " 1385 Hints.emitRemarkWithHints();
1403 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1413 for (
Loop *InnerL : L)
1426 explicit LoopVectorize(
bool InterleaveOnlyWhenForced =
false,
1427 bool VectorizeOnlyWhenForced =
false)
1435 if (skipFunction(F))
1438 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1439 auto *
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1440 auto *
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1441 auto *
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1442 auto *
BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1443 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1444 auto *
TLI = TLIP ? &TLIP->getTLI() :
nullptr;
1445 auto *
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1446 auto *
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1447 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1448 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1449 auto *
ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1451 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1454 return Impl.
runImpl(F, *SE, *
LI, *
TTI, *
DT, *
BFI, TLI, *DB, *
AA, *
AC,
1512 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1513 "Expected either an induction phi-node or a truncate of it!");
1519 if (isa<TruncInst>(EntryVal)) {
1521 "Truncation requires an integer type");
1522 auto *TruncType = cast<IntegerType>(EntryVal->
getType());
1527 Value *SteppedStart =
1536 MulOp = Instruction::Mul;
1539 MulOp = Instruction::FMul;
1552 Value *SplatVF = isa<Constant>(Mul)
1563 for (
unsigned Part = 0; Part <
UF; ++Part) {
1566 if (isa<TruncInst>(EntryVal))
1578 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1579 auto *ICmp = cast<Instruction>(Br->getCondition());
1581 LastInduction->
setName(
"vec.ind.next");
1584 VecInd->
addIncoming(LastInduction, LoopVectorLatch);
1595 auto isScalarInst = [&](
User *U) ->
bool {
1596 auto *
I = cast<Instruction>(U);
1604 Value *VectorLoopVal,
unsigned Part,
unsigned Lane) {
1605 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1606 "Expected either an induction phi-node or a truncate of it!");
1614 if (isa<TruncInst>(EntryVal))
1624 if (Lane < UINT_MAX)
1632 "Primary induction variable must have an integer type");
1637 auto ID = II->second;
1638 assert(IV->
getType() == ID.getStartValue()->getType() &&
"Types must match");
1642 Value *ScalarIV =
nullptr;
1646 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1649 auto VectorizedIV =
false;
1659 "Induction step should be loop invariant");
1661 Value *Step =
nullptr;
1664 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1667 Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1675 VectorizedIV =
true;
1683 if (!VectorizedIV || NeedsScalarIV) {
1691 ScalarIV->
setName(
"offset.idx");
1694 auto *TruncType = cast<IntegerType>(Trunc->
getType());
1696 "Truncation requires an integer step");
1705 if (!VectorizedIV) {
1707 for (
unsigned Part = 0; Part <
UF; ++Part) {
1735 "Induction Step must be an integer or FP");
1742 for (
int i = 0; i < VLen; ++i)
1749 assert(Step->getType() == Val->
getType() &&
"Invalid step vec");
1757 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1758 "Binary Opcode should be specified for FP induction");
1760 for (
int i = 0; i < VLen; ++i)
1773 if (isa<Instruction>(MulOp))
1775 cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1778 if (isa<Instruction>(BOp))
1779 cast<Instruction>(BOp)->setFastMathFlags(Flags);
1787 assert(
VF > 1 &&
"VF should be greater than one");
1792 "Val and Step should have the same type");
1800 MulOp = Instruction::Mul;
1803 MulOp = Instruction::FMul;
1813 for (
unsigned Part = 0; Part <
UF; ++Part) {
1814 for (
unsigned Lane = 0; Lane < Lanes; ++Lane) {
1825 assert(V !=
Induction &&
"The new induction variable should not be used.");
1845 auto *
I = cast<Instruction>(V);
1859 auto *LastInst = cast<Instruction>(
1874 Value *VectorValue =
nullptr;
1882 for (
unsigned Lane = 0; Lane <
VF; ++Lane)
1907 :
true &&
"Uniform values only have lane zero");
1920 if (!U->getType()->isVectorTy()) {
1921 assert(
VF == 1 &&
"Value not scalarized has non-vector type");
1933 assert(V !=
Induction &&
"The new induction variable should not be used.");
1947 for (
unsigned i = 0; i <
VF; ++i)
1998 assert(Group &&
"Fail to get an interleaved access group.");
2009 unsigned InterleaveFactor = Group->
getFactor();
2019 bool IsMaskForCondRequired = BlockInMask;
2020 if (IsMaskForCondRequired) {
2021 Mask = *BlockInMask;
2036 bool InBounds =
false;
2038 InBounds =
gep->isInBounds();
2040 for (
unsigned Part = 0; Part <
UF; Part++) {
2056 cast<GetElementPtrInst>(NewPtr)->setIsInBounds(
true);
2065 Value *MaskForGaps =
nullptr;
2068 assert(MaskForGaps &&
"Mask for Gaps is required but it is null");
2072 if (isa<LoadInst>(Instr)) {
2075 for (
unsigned Part = 0; Part <
UF; Part++) {
2077 if (IsMaskForCondRequired || MaskForGaps) {
2079 "masked interleaved groups are not allowed.");
2080 Value *GroupMask = MaskForGaps;
2081 if (IsMaskForCondRequired) {
2085 Mask[Part], Undefs, RepMask,
"interleaved.mask");
2086 GroupMask = MaskForGaps
2093 GroupMask, UndefVec,
"wide.masked.vec");
2104 for (
unsigned I = 0;
I < InterleaveFactor; ++
I) {
2112 for (
unsigned Part = 0; Part <
UF; Part++) {
2114 NewLoads[Part], UndefVec, StrideMask,
"strided.vec");
2117 if (Member->
getType() != ScalarTy) {
2135 for (
unsigned Part = 0; Part <
UF; Part++) {
2138 for (
unsigned i = 0; i < InterleaveFactor; i++) {
2141 assert(Member &&
"Fail to get a member from an interleaved store group");
2144 cast<StoreInst>(Member)->getValueOperand(), Part);
2150 if (StoredVec->
getType() != SubVT)
2165 if (IsMaskForCondRequired) {
2169 Mask[Part], Undefs, RepMask,
"interleaved.mask");
2171 IVec, NewPtrs[Part], Group->
getAlignment(), ShuffledMask);
2187 assert((LI || SI) &&
"Invalid Load/Store instruction");
2192 "CM decision should be taken at this point");
2210 bool ConsecutiveStride =
2212 bool CreateGatherScatter =
2217 assert((ConsecutiveStride || CreateGatherScatter) &&
2218 "The instruction should be scalarized");
2221 if (ConsecutiveStride)
2225 bool isMaskRequired = BlockInMask;
2227 Mask = *BlockInMask;
2229 bool InBounds =
false;
2230 if (
auto *
gep = dyn_cast<GetElementPtrInst>(
2232 InBounds =
gep->isInBounds();
2234 const auto CreateVecPtr = [&](
unsigned Part,
Value *Ptr) ->
Value * {
2241 PartPtr = cast<GetElementPtrInst>(
2244 PartPtr = cast<GetElementPtrInst>(
2250 PartPtr = cast<GetElementPtrInst>(
2262 for (
unsigned Part = 0; Part <
UF; ++Part) {
2265 if (CreateGatherScatter) {
2266 Value *MaskPart = isMaskRequired ? Mask[Part] :
nullptr;
2278 auto *VecPtr = CreateVecPtr(Part, Ptr);
2291 assert(LI &&
"Must have a load instruction");
2293 for (
unsigned Part = 0; Part <
UF; ++Part) {
2295 if (CreateGatherScatter) {
2296 Value *MaskPart = isMaskRequired ? Mask[Part] :
nullptr;
2299 nullptr,
"wide.masked.gather");
2302 auto *VecPtr = CreateVecPtr(Part, Ptr);
2306 "wide.masked.load");
2321 bool IfPredicateInstr) {
2348 if (
auto *II = dyn_cast<IntrinsicInst>(Cloned))
2353 if (IfPredicateInstr)
2393 assert(L &&
"Create Trip Count for null loop.");
2398 assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2399 "Invalid loop count");
2402 assert(IdxTy &&
"No type for induction");
2410 IdxTy->getPrimitiveSizeInBits())
2411 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2412 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2415 const SCEV *ExitCount = SE->getAddExpr(
2416 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->
getType()));
2454 "VF*UF must be a power of 2 when folding tail by masking");
2491 "Vector elements must have same size");
2502 "Only one type should be a pointer type");
2504 "Only one type should be a floating point type");
2556 if (
auto *
C = dyn_cast<ConstantInt>(SCEVCheck))
2561 "Cannot SCEV check stride or overflow when folding tail");
2563 BB->
setName(
"vector.scevcheck");
2589 std::tie(FirstCheckInst, MemRuntimeCheck) =
2591 if (!MemRuntimeCheck)
2596 BB->
setName(
"vector.memcheck");
2613 LVer->prepareNoAliasMetadata();
2624 "Index type does not match StepValue type");
2634 if (
auto *CX = dyn_cast<ConstantInt>(X))
2637 if (
auto *CY = dyn_cast<ConstantInt>(
Y))
2645 if (
auto *CX = dyn_cast<ConstantInt>(X))
2648 if (
auto *CY = dyn_cast<ConstantInt>(
Y))
2657 "Index type does not match StartValue type");
2665 assert(isa<SCEVConstant>(Step) &&
2666 "Expected constant step for pointer induction");
2668 nullptr, StartValue,
2673 assert(Step->getType()->isFloatingPointTy() &&
"Expected FP Step value");
2676 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2677 InductionBinOp->getOpcode() == Instruction::FSub) &&
2678 "Original bin op should be defined for FP induction");
2680 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2687 if (isa<Instruction>(MulExp))
2689 cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2691 Value *BOp = B.
CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2693 if (isa<Instruction>(BOp))
2694 cast<Instruction>(BOp)->setFastMathFlags(Flags);
2741 assert(VectorPH &&
"Invalid loop structure");
2742 assert(ExitBlock &&
"Must have an exit block");
2823 for (
auto &InductionEntry : *List) {
2824 PHINode *OrigPhi = InductionEntry.first;
2835 EndValue = CountRoundDown;
2841 Value *CRD =
B.CreateCast(CastOp, CountRoundDown, StepType,
"cast.crd");
2886 LLVMLoopVectorizeFollowupVectorized});
2929 assert(isa<PHINode>(UI) &&
"Expected LCSSA form");
2930 MissingVals[UI] = EndValue;
2938 auto *UI = cast<Instruction>(U);
2942 assert(isa<PHINode>(UI) &&
"Expected LCSSA form");
2945 Value *CountMinusOne =
B.CreateSub(
2949 ?
B.CreateCast(Instruction::SIToFP, CountMinusOne,
2954 Escape->
setName(
"ind.escape");
2955 MissingVals[UI] = Escape;
2959 for (
auto &
I : MissingVals) {
2960 PHINode *PHI = cast<PHINode>(
I.first);
2973 struct CSEDenseMapInfo {
2975 return isa<InsertElementInst>(
I) || isa<ExtractElementInst>(I) ||
2976 isa<ShuffleVectorInst>(
I) || isa<GetElementPtrInst>(I);
2987 static unsigned getHashValue(
const Instruction *I) {
2988 assert(canHandle(I) &&
"Unknown instruction!");
2994 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2995 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3010 if (!CSEDenseMapInfo::canHandle(In))
3035 (!isa<LoadInst>(
I) ||
3043 if (
CallInst *CI = dyn_cast<CallInst>(I)) {
3047 else if (!isa<StoreInst>(I) ||
3063 bool &NeedToScalarize) {
3077 return ScalarCallCost;
3081 for (
Type *ScalarTy : ScalarTys)
3088 unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3092 NeedToScalarize =
true;
3098 if (VectorCallCost < Cost) {
3099 NeedToScalarize =
false;
3100 return VectorCallCost;
3112 assert(ID &&
"Expected intrinsic call!");
3115 if (
auto *FPMO = dyn_cast<FPMathOperator>(CI))
3116 FMF = FPMO->getFastMathFlags();
3125 return I1->getBitWidth() < I2->getBitWidth() ?
T1 : T2;
3130 return I1->getBitWidth() > I2->getBitWidth() ?
T1 : T2;
3144 for (
unsigned Part = 0; Part <
UF; ++Part) {
3147 !isa<Instruction>(
I))
3150 Type *ScalarTruncatedTy =
3154 if (TruncatedTy == OriginalTy)
3158 auto ShrinkOperand = [&](
Value *V) ->
Value * {
3159 if (
auto *ZI = dyn_cast<ZExtInst>(V))
3160 if (ZI->getSrcTy() == TruncatedTy)
3161 return ZI->getOperand(0);
3167 Value *NewI =
nullptr;
3168 if (
auto *BO = dyn_cast<BinaryOperator>(I)) {
3169 NewI = B.
CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3170 ShrinkOperand(BO->getOperand(1)));
3175 cast<BinaryOperator>(NewI)->copyIRFlags(I,
false);
3176 }
else if (
auto *CI = dyn_cast<ICmpInst>(I)) {
3178 B.
CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3179 ShrinkOperand(CI->getOperand(1)));
3180 }
else if (
auto *
SI = dyn_cast<SelectInst>(I)) {
3182 ShrinkOperand(
SI->getTrueValue()),
3183 ShrinkOperand(
SI->getFalseValue()));
3184 }
else if (
auto *CI = dyn_cast<CastInst>(I)) {
3185 switch (CI->getOpcode()) {
3188 case Instruction::Trunc:
3189 NewI = ShrinkOperand(CI->getOperand(0));
3191 case Instruction::SExt:
3196 case Instruction::ZExt:
3202 }
else if (
auto *
SI = dyn_cast<ShuffleVectorInst>(I)) {
3203 auto Elements0 =
SI->getOperand(0)->getType()->getVectorNumElements();
3206 auto Elements1 =
SI->getOperand(1)->getType()->getVectorNumElements();
3211 }
else if (isa<LoadInst>(I) || isa<PHINode>(
I)) {
3214 }
else if (
auto *
IE = dyn_cast<InsertElementInst>(I)) {
3215 auto Elements =
IE->getOperand(0)->getType()->getVectorNumElements();
3220 }
else if (
auto *EE = dyn_cast<ExtractElementInst>(I)) {
3231 NewI->
takeName(cast<Instruction>(I));
3234 cast<Instruction>(
I)->eraseFromParent();
3247 for (
unsigned Part = 0; Part <
UF; ++Part) {
3268 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3377 auto *VectorInit = ScalarInit;
3392 auto *VecPhi =
Builder.
CreatePHI(VectorInit->getType(), 2,
"vector.recur");
3405 isa<PHINode>(PreviousLastPart))
3415 for (
unsigned I = 1;
I <
VF; ++
I)
3420 Value *Incoming = VecPhi;
3423 for (
unsigned Part = 0; Part <
UF; ++Part) {
3431 cast<Instruction>(PhiPart)->eraseFromParent();
3433 Incoming = PreviousPart;
3441 auto *ExtractForScalar = Incoming;
3452 Value *ExtractForPhiUsedOutsideLoop =
nullptr;
3467 auto *Incoming = BB ==
LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3468 Start->addIncoming(Incoming, BB);
3480 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3491 "Unable to find the reduction variable");
3496 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3498 RdxDesc.getMinMaxRecurrenceKind();
3518 VectorStart = Identity = ReductionStartValue;
3520 VectorStart = Identity =
3531 VectorStart = ReductionStartValue;
3548 for (
unsigned Part = 0; Part <
UF; ++Part) {
3553 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3555 cast<PHINode>(VecRdxPhi)
3570 if (
VF > 1 && Phi->
getType() != RdxDesc.getRecurrenceType()) {
3575 for (
unsigned Part = 0; Part <
UF; ++Part) {
3581 UI != RdxParts[Part]->user_end();)
3583 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3584 RdxParts[Part] = Extnd;
3590 for (
unsigned Part = 0; Part <
UF; ++Part) {
3600 for (
unsigned Part = 1; Part <
UF; ++Part) {
3602 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3606 ReducedPartRdx,
"bin.rdx"));
3618 if (Phi->
getType() != RdxDesc.getRecurrenceType())
3640 assert(LCSSAPhi.getNumIncomingValues() < 3 &&
"Invalid LCSSA PHI");
3644 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3650 int IncomingEdgeBlockIdx =
3652 assert(IncomingEdgeBlockIdx >= 0 &&
"Invalid block index");
3654 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3661 if (LCSSAPhi.getNumIncomingValues() == 1) {
3662 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3664 unsigned LastLane = 0;
3665 if (isa<Instruction>(IncomingValue))
3667 cast<Instruction>(IncomingValue),
VF)
3673 Value *lastIncomingValue =
3694 auto isBlockOfUsePredicated = [&](
Use &U) ->
bool {
3695 auto *
I = cast<Instruction>(U.getUser());
3697 if (
auto *Phi = dyn_cast<PHINode>(
I))
3698 BB = Phi->getIncomingBlock(
3700 return BB == PredBB;
3711 Worklist.insert(InstsToReanalyze.
begin(), InstsToReanalyze.
end());
3712 InstsToReanalyze.
clear();
3715 while (!Worklist.empty()) {
3720 if (!
I || isa<PHINode>(
I) ||
I->getParent() == PredBB ||
3721 !VectorLoop->contains(
I) ||
I->mayHaveSideEffects())
3728 InstsToReanalyze.push_back(
I);
3734 I->moveBefore(&*PredBB->getFirstInsertionPt());
3735 Worklist.insert(
I->op_begin(),
I->op_end());
3754 assert(ScalarBBPredecessors.
size() == VectorBBPredecessors.
size() &&
3755 "Scalar and Vector BB should have the same number of predecessors");
3765 for (
unsigned i = 0; i < NumIncomingValues; ++i) {
3766 BasicBlock *NewPredBB = VectorBBPredecessors[i];
3771 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3798 "Non-header phis should have been handled elsewhere");
3805 for (
unsigned Part = 0; Part <
UF; ++Part) {
3827 switch (II.getKind()) {
3845 for (
unsigned Part = 0; Part <
UF; ++Part) {
3846 for (
unsigned Lane = 0; Lane < Lanes; ++Lane) {
3872 "Unexpected instruction");
3875 return !CInt || CInt->
isZero();
3880 case Instruction::Br:
3881 case Instruction::PHI:
3883 case Instruction::GetElementPtr: {
3889 auto *
GEP = cast<GetElementPtrInst>(&
I);
3905 for (
unsigned Part = 0; Part <
UF; ++Part) {
3918 for (
unsigned Part = 0; Part <
UF; ++Part) {
3923 ?
GEP->getPointerOperand()
3938 auto *NewGEP =
GEP->isInBounds()
3941 assert((
VF == 1 || NewGEP->getType()->isVectorTy()) &&
3942 "NewGEP is not a pointer vector");
3950 case Instruction::UDiv:
3951 case Instruction::SDiv:
3952 case Instruction::SRem:
3953 case Instruction::URem:
3955 case Instruction::FAdd:
3956 case Instruction::Sub:
3957 case Instruction::FSub:
3958 case Instruction::Mul:
3959 case Instruction::FMul:
3960 case Instruction::FDiv:
3961 case Instruction::FRem:
3962 case Instruction::Shl:
3963 case Instruction::LShr:
3964 case Instruction::AShr:
3965 case Instruction::And:
3966 case Instruction::Or:
3967 case Instruction::Xor: {
3969 auto *BinOp = cast<BinaryOperator>(&
I);
3972 for (
unsigned Part = 0; Part <
UF; ++Part) {
3978 VecOp->copyIRFlags(BinOp);
3992 bool InvariantCond =
4003 for (
unsigned Part = 0; Part <
UF; ++Part) {
4016 case Instruction::ICmp:
4017 case Instruction::FCmp: {
4019 bool FCmp = (I.
getOpcode() == Instruction::FCmp);
4022 for (
unsigned Part = 0; Part <
UF; ++Part) {
4041 case Instruction::ZExt:
4042 case Instruction::SExt:
4043 case Instruction::FPToUI:
4044 case Instruction::FPToSI:
4045 case Instruction::FPExt:
4046 case Instruction::PtrToInt:
4047 case Instruction::IntToPtr:
4048 case Instruction::SIToFP:
4049 case Instruction::UIToFP:
4050 case Instruction::Trunc:
4051 case Instruction::FPTrunc:
4052 case Instruction::BitCast: {
4060 for (
unsigned Part = 0; Part <
UF; ++Part) {
4071 if (isa<DbgInfoIntrinsic>(I))
4076 auto *CI = cast<CallInst>(&
I);
4078 StringRef FnName = CI->getCalledFunction()->getName();
4082 for (
Value *ArgOperand : CI->arg_operands())
4090 bool NeedToScalarize;
4092 bool UseVectorIntrinsic =
4094 assert((UseVectorIntrinsic || !NeedToScalarize) &&
4095 "Instruction should be scalarized elsewhere.");
4097 for (
unsigned Part = 0; Part <
UF; ++Part) {
4099 for (
unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4109 if (UseVectorIntrinsic) {
4111 Type *TysForDecl[] = {CI->getType()};
4118 assert(!VFnName.
empty() &&
"Vector function name is empty.");
4125 VectorF->copyAttributesFrom(F);
4128 assert(VectorF &&
"Can't create vector function.");
4131 CI->getOperandBundlesAsDefs(OpBundles);
4134 if (isa<FPMathOperator>(V))
4146 LLVM_DEBUG(
dbgs() <<
"LV: Found an unhandled instruction: " << I);
4161 "Entry does not dominate exit.");
4171 void LoopVectorizationCostModel::collectLoopScalars(
unsigned VF) {
4175 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4176 "This function should not be visited twice for the same VF");
4190 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4191 assert(WideningDecision != CM_Unknown &&
4192 "Widening decision should be ready at this moment");
4193 if (
auto *
Store = dyn_cast<StoreInst>(MemAccess))
4194 if (Ptr ==
Store->getValueOperand())
4195 return WideningDecision == CM_Scalarize;
4197 "Ptr is neither a value or pointer operand");
4198 return WideningDecision != CM_GatherScatter;
4203 auto isLoopVaryingBitCastOrGEP = [&](
Value *V) {
4204 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4205 isa<GetElementPtrInst>(V)) &&
4206 !TheLoop->isLoopInvariant(V);
4216 if (!isLoopVaryingBitCastOrGEP(Ptr))
4221 auto *
I = cast<Instruction>(Ptr);
4222 if (Worklist.count(
I))
4229 return isa<LoadInst>(U) || isa<StoreInst>(U);
4233 PossibleNonScalarPtrs.
insert(
I);
4244 Worklist.insert(Uniforms[VF].
begin(), Uniforms[VF].
end());
4251 for (
auto *BB : TheLoop->blocks())
4252 for (
auto &
I : *BB) {
4253 if (
auto *
Load = dyn_cast<LoadInst>(&
I)) {
4254 evaluatePtrUse(
Load,
Load->getPointerOperand());
4255 }
else if (
auto *
Store = dyn_cast<StoreInst>(&
I)) {
4256 evaluatePtrUse(
Store,
Store->getPointerOperand());
4257 evaluatePtrUse(
Store,
Store->getValueOperand());
4260 for (
auto *
I : ScalarPtrs)
4261 if (PossibleNonScalarPtrs.
find(
I) == PossibleNonScalarPtrs.
end()) {
4271 auto *Latch = TheLoop->getLoopLatch();
4274 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4277 Worklist.insert(Ind);
4278 Worklist.insert(IndUpdate);
4279 LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *Ind <<
"\n");
4280 LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *IndUpdate
4287 auto ForcedScalar = ForcedScalars.find(VF);
4288 if (ForcedScalar != ForcedScalars.end())
4289 for (
auto *
I : ForcedScalar->second)
4297 while (Idx != Worklist.size()) {
4299 if (!isLoopVaryingBitCastOrGEP(Dst->
getOperand(0)))
4301 auto *Src = cast<Instruction>(Dst->
getOperand(0));
4303 auto *J = cast<Instruction>(U);
4304 return !TheLoop->contains(J) || Worklist.count(J) ||
4305 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4306 isScalarUse(J, Src));
4308 Worklist.insert(Src);
4309 LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *Src <<
"\n");
4317 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4330 auto *
I = cast<Instruction>(U);
4331 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4338 auto ScalarIndUpdate =
4340 auto *
I = cast<Instruction>(U);
4341 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4343 if (!ScalarIndUpdate)
4347 Worklist.insert(Ind);
4348 Worklist.insert(IndUpdate);
4349 LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *Ind <<
"\n");
4350 LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *IndUpdate
4354 Scalars[
VF].insert(Worklist.begin(), Worklist.end());
4358 if (!blockNeedsPredication(I->
getParent()))
4372 InstWidening WideningDecision = getWideningDecision(I, VF);
4373 assert(WideningDecision != CM_Unknown &&
4374 "Widening decision should be ready at this moment");
4375 return WideningDecision == CM_Scalarize;
4377 return isa<LoadInst>(
I) ?
4378 !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4379 : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4381 case Instruction::UDiv:
4382 case Instruction::SDiv:
4383 case Instruction::SRem:
4384 case Instruction::URem:
4392 assert(isAccessInterleaved(I) &&
"Expecting interleaved access.");
4393 assert(getWideningDecision(I, VF) == CM_Unknown &&
4394 "Decision should not be set yet.");
4395 auto *Group = getInterleavedAccessGroup(I);
4396 assert(Group &&
"Must have a group.");
4401 bool PredicatedAccessRequiresMasking =
4403 bool AccessWithGapsRequiresMasking =
4404 Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
4405 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4412 "Masked interleave-groups for predicated accesses are not enabled.");
4424 assert((LI || SI) &&
"Invalid memory instruction");
4434 if (isScalarWithPredication(I))
4440 auto *ScalarTy = LI ? LI->
getType() : SI->getValueOperand()->getType();
4447 void LoopVectorizationCostModel::collectLoopUniforms(
unsigned VF) {
4453 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4454 "This function should not be visited twice for the same VF");
4458 Uniforms[
VF].clear();
4466 auto isOutOfScope = [&](
Value *V) ->
bool {
4468 return (!I || !TheLoop->contains(I));
4478 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4480 LLVM_DEBUG(
dbgs() <<
"LV: Found uniform instruction: " << *Cmp <<
"\n");
4493 InstWidening WideningDecision = getWideningDecision(I, VF);
4494 assert(WideningDecision != CM_Unknown &&
4495 "Widening decision should be ready at this moment");
4497 return (WideningDecision == CM_Widen ||
4498 WideningDecision == CM_Widen_Reverse ||
4499 WideningDecision == CM_Interleave);
4509 for (
auto *BB : TheLoop->blocks())
4510 for (
auto &I : *BB) {
4518 auto UsersAreMemAccesses =
4527 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4528 PossibleNonUniformPtrs.
insert(Ptr);
4534 ConsecutiveLikePtrs.
insert(Ptr);
4539 for (
auto *V : ConsecutiveLikePtrs)
4540 if (PossibleNonUniformPtrs.
find(V) == PossibleNonUniformPtrs.
end()) {
4541 LLVM_DEBUG(
dbgs() <<
"LV: Found uniform instruction: " << *V <<
"\n");
4549 while (idx != Worklist.
size()) {
4554 if (isOutOfScope(OV))
4563 auto *OI = cast<Instruction>(OV);
4565 auto *J = cast<Instruction>(U);
4566 return Worklist.
count(J) ||
4568 isUniformDecision(J, VF));
4571 LLVM_DEBUG(
dbgs() <<
"LV: Found uniform instruction: " << *OI <<
"\n");
4590 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4595 auto *I = cast<Instruction>(U);
4596 return I == IndUpdate || !TheLoop->contains(I) || Worklist.
count(I) ||
4597 isVectorizedMemAccessUse(I, Ind);
4604 auto UniformIndUpdate =
4606 auto *I = cast<Instruction>(U);
4607 return I == Ind || !TheLoop->contains(I) || Worklist.
count(I) ||
4608 isVectorizedMemAccessUse(I, IndUpdate);
4610 if (!UniformIndUpdate)
4615 Worklist.
insert(IndUpdate);
4616 LLVM_DEBUG(
dbgs() <<
"LV: Found uniform instruction: " << *Ind <<
"\n");
4617 LLVM_DEBUG(
dbgs() <<
"LV: Found uniform instruction: " << *IndUpdate
4621 Uniforms[
VF].insert(Worklist.
begin(), Worklist.
end());
4629 dbgs() <<
"LV: Not inserting runtime ptr check for divergent target");
4632 createMissedAnalysis(
"CantVersionLoopWithDivergentTarget")
4633 <<
"runtime pointer checks needed. Not enabled for divergent target");
4640 return computeFeasibleMaxVF(OptForSize, TC);
4643 ORE->
emit(createMissedAnalysis(
"CantVersionLoopWithOptForSize")
4644 <<
"runtime pointer checks needed. Enable vectorization of this " 4645 "loop with '#pragma clang loop vectorize(enable)' when " 4646 "compiling with -Os/-Oz");
4649 <<
"LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
4654 ORE->
emit(createMissedAnalysis(
"CantVersionLoopWithOptForSize")
4655 <<
"runtime SCEV checks needed. Enable vectorization of this " 4656 "loop with '#pragma clang loop vectorize(enable)' when " 4657 "compiling with -Os/-Oz");
4660 <<
"LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
4666 ORE->
emit(createMissedAnalysis(
"CantVersionLoopWithOptForSize")
4667 <<
"runtime stride == 1 checks needed. Enable vectorization of " 4668 "this loop with '#pragma clang loop vectorize(enable)' when " 4669 "compiling with -Os/-Oz");
4672 <<
"LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
4680 ORE->
emit(createMissedAnalysis(
"SingleIterationLoop")
4681 <<
"loop trip count is one, irrelevant for vectorization");
4682 LLVM_DEBUG(
dbgs() <<
"LV: Aborting, single iteration (non) loop.\n");
4687 LLVM_DEBUG(
dbgs() <<
"LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4689 IsScalarEpilogueAllowed = !OptForSize;
4695 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4697 unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
4699 if (TC > 0 && TC % MaxVF == 0) {
4700 LLVM_DEBUG(
dbgs() <<
"LV: No tail will remain for any chosen VF.\n");
4709 FoldTailByMasking =
true;
4715 createMissedAnalysis(
"UnknownLoopCountComplexCFG")
4716 <<
"unable to calculate the loop count due to complex control flow");
4720 ORE->
emit(createMissedAnalysis(
"NoTailLoopWithOptForSize")
4721 <<
"cannot optimize for size and vectorize at the same time. " 4722 "Enable vectorization of this loop with '#pragma clang loop " 4723 "vectorize(enable)' when compiling with -Os/-Oz");
4728 LoopVectorizationCostModel::computeFeasibleMaxVF(
bool OptForSize,
4729 unsigned ConstTripCount) {
4731 unsigned SmallestType, WidestType;
4732 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4741 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4743 unsigned MaxVectorSize = WidestRegister / WidestType;
4745 LLVM_DEBUG(
dbgs() <<
"LV: The Smallest and Widest types: " << SmallestType
4746 <<
" / " << WidestType <<
" bits.\n");
4748 << WidestRegister <<
" bits.\n");
4750 assert(MaxVectorSize <= 256 &&
"Did not expect to pack so many elements" 4751 " into one vector!");
4752 if (MaxVectorSize == 0) {
4753 LLVM_DEBUG(
dbgs() <<
"LV: The target has no vector registers.\n");
4755 return MaxVectorSize;
4756 }
else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4760 LLVM_DEBUG(
dbgs() <<
"LV: Clamping the MaxVF to the constant trip count: " 4761 << ConstTripCount <<
"\n");
4762 MaxVectorSize = ConstTripCount;
4763 return MaxVectorSize;
4766 unsigned MaxVF = MaxVectorSize;
4772 unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4773 for (
unsigned VS = MaxVectorSize * 2;
VS <= NewMaxVectorSize;
VS *= 2)
4777 auto RUs = calculateRegisterUsage(VFs);
4782 for (
int i = RUs.size() - 1; i >= 0; --i) {
4783 if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4789 if (MaxVF < MinVF) {
4791 <<
") with target's minimum: " << MinVF <<
'\n');
4801 float Cost = expectedCost(1).first;
4802 const float ScalarCost =
Cost;
4804 LLVM_DEBUG(
dbgs() <<
"LV: Scalar loop costs: " << (
int)ScalarCost <<
".\n");
4807 if (ForceVectorization && MaxVF > 1) {
4814 for (
unsigned i = 2; i <= MaxVF; i *= 2) {
4818 VectorizationCostTy
C = expectedCost(i);
4819 float VectorCost = C.first / (float)i;
4821 <<
" costs: " << (
int)VectorCost <<
".\n");
4822 if (!C.second && !ForceVectorization) {
4824 dbgs() <<
"LV: Not considering vector loop of width " << i
4825 <<
" because it will not generate any vector instructions.\n");
4828 if (VectorCost < Cost) {
4835 ORE->
emit(createMissedAnalysis(
"ConditionalStore")
4836 <<
"store that is conditionally executed prevents vectorization");
4838 dbgs() <<
"LV: No vectorization. There are conditional stores.\n");
4843 LLVM_DEBUG(
if (ForceVectorization && Width > 1 && Cost >= ScalarCost)
dbgs()
4844 <<
"LV: Vectorization seems to be not beneficial, " 4845 <<
"but was forced by a user.\n");
4851 std::pair<unsigned, unsigned>
4853 unsigned MinWidth = -1U;
4854 unsigned MaxWidth = 8;
4855 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4860 for (
Instruction &
I : BB->instructionsWithoutDebug()) {
4864 if (ValuesToIgnore.find(&
I) != ValuesToIgnore.end())
4868 if (!isa<LoadInst>(
I) && !isa<StoreInst>(
I) && !isa<PHINode>(
I))
4873 if (
auto *PN = dyn_cast<PHINode>(&
I)) {
4881 if (
auto *
ST = dyn_cast<StoreInst>(&
I))
4882 T =
ST->getValueOperand()->getType();
4893 if (T->
isPointerTy() && !isConsecutiveLoadOrStore(&
I) &&
4894 !isAccessInterleaved(&
I) && !isLegalGatherOrScatter(&
I))
4897 MinWidth = std::min(MinWidth,
4904 return {MinWidth, MaxWidth};
4909 unsigned LoopCost) {
4934 if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
4988 LoopCost = expectedCost(VF).first;
4992 if (IC > MaxInterleaveCount)
4993 IC = MaxInterleaveCount;
5000 LLVM_DEBUG(
dbgs() <<
"LV: Interleaving because of reductions.\n");
5006 bool InterleavingRequiresRuntimePointerCheck =
5012 if (!InterleavingRequiresRuntimePointerCheck && LoopCost <
SmallLoopCost) {
5023 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5024 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5032 SmallIC = std::min(SmallIC, F);
5033 StoresIC = std::min(StoresIC, F);
5034 LoadsIC = std::min(LoadsIC, F);
5038 std::max(StoresIC, LoadsIC) > SmallIC) {
5040 dbgs() <<
"LV: Interleaving to saturate store or load ports.\n");
5041 return std::max(StoresIC, LoadsIC);
5044 LLVM_DEBUG(
dbgs() <<
"LV: Interleaving to reduce branch cost.\n");
5100 for (
Instruction &
I : BB->instructionsWithoutDebug()) {
5104 for (
Value *U :
I.operands()) {
5112 if (!TheLoop->contains(Instr)) {
5113 LoopInvariants.
insert(Instr);
5118 EndPoint[Instr] = IdxToInstr.
size();
5135 unsigned MaxSafeDepDist = -1U;
5138 unsigned WidestRegister =
5140 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5145 LLVM_DEBUG(
dbgs() <<
"LV(REG): Calculating max register usage:\n");
5148 auto GetRegUsage = [&DL, WidestRegister](
Type *Ty,
unsigned VF) {
5149 if (Ty->isTokenTy())
5151 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5152 return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5155 for (
unsigned int i = 0, s = IdxToInstr.
size(); i < s; ++i) {
5159 InstrList &
List = TransposeEnds[i];
5161 OpenIntervals.
erase(ToRemove);
5164 if (Ends.
find(I) == Ends.
end())
5168 if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5172 for (
unsigned j = 0, e = VFs.
size(); j < e; ++j) {
5174 MaxUsages[j] =
std::max(MaxUsages[j], OpenIntervals.
size());
5177 collectUniformsAndScalars(VFs[j]);
5180 for (
auto Inst : OpenIntervals) {
5182 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5183 isScalarAfterVectorization(Inst, VFs[j]))
5185 RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5187 MaxUsages[j] =
std::max(MaxUsages[j], RegUsage);
5191 << OpenIntervals.
size() <<
'\n');
5197 for (
unsigned i = 0, e = VFs.
size(); i < e; ++i) {
5198 unsigned Invariant = 0;
5200 Invariant = LoopInvariants.
size();
5202 for (
auto Inst : LoopInvariants)
5203 Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5207 LLVM_DEBUG(
dbgs() <<
"LV(REG): Found max usage: " << MaxUsages[i] <<
'\n');
5208 LLVM_DEBUG(
dbgs() <<
"LV(REG): Found invariant usage: " << Invariant
5219 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(
Instruction *
I){
5228 assert(isPredicatedInst(I) &&
"Expecting a scalar emulated instruction");
5229 return isa<LoadInst>(
I) ||
5230 (isa<StoreInst>(I) &&
5239 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5251 if (!blockNeedsPredication(BB))
5254 if (isScalarWithPredication(&I)) {
5258 if (!useEmulatedMaskMemRefHack(&I) &&
5259 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5262 PredicatedBBsAfterVectorization.insert(BB);
5267 int LoopVectorizationCostModel::computePredInstDiscount(
5270 assert(!isUniformAfterVectorization(PredInst, VF) &&
5271 "Instruction marked uniform-after-vectorization will be predicated");
5290 isScalarAfterVectorization(I, VF))
5295 if (isScalarWithPredication(I))
5309 if (
auto *J = dyn_cast<Instruction>(U.get()))
5310 if (isUniformAfterVectorization(J, VF))
5323 return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
5330 while (!Worklist.
empty()) {
5334 if (ScalarCosts.
find(I) != ScalarCosts.
end())
5339 unsigned VectorCost = getInstructionCost(I, VF).first;
5345 unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5360 if (
auto *J = dyn_cast<Instruction>(U.get())) {
5362 "Instruction has non-scalar type");
5363 if (canBeScalarized(J))
5365 else if (needsExtract(J))
5375 Discount += VectorCost - ScalarCost;
5376 ScalarCosts[
I] = ScalarCost;
5382 LoopVectorizationCostModel::VectorizationCostTy
5383 LoopVectorizationCostModel::expectedCost(
unsigned VF) {
5384 VectorizationCostTy
Cost;
5388 VectorizationCostTy BlockCost;
5391 for (
Instruction &I : BB->instructionsWithoutDebug()) {
5393 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5394 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5397 VectorizationCostTy
C = getInstructionCost(&I, VF);
5403 BlockCost.first += C.first;
5404 BlockCost.second |= C.second;
5405 LLVM_DEBUG(
dbgs() <<
"LV: Found an estimated cost of " << C.first
5406 <<
" for VF " << VF <<
" For instruction: " << I
5416 if (VF == 1 && blockNeedsPredication(BB))
5419 Cost.first += BlockCost.first;
5420 Cost.second |= BlockCost.second;
5435 const Loop *TheLoop) {
5443 auto SE = PSE.
getSE();
5444 unsigned NumOperands = Gep->getNumOperands();
5445 for (
unsigned i = 1; i < NumOperands; ++i) {
5446 Value *Opd = Gep->getOperand(i);
5447 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5461 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(
Instruction *I,
5463 assert(VF > 1 &&
"Scalarization cost of instruction implies vectorization.");
5492 if (isPredicatedInst(I)) {
5495 if (useEmulatedMaskMemRefHack(I))
5504 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(
Instruction *I,
5513 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5514 "Stride should be 1 or -1 for consecutive memory access");
5521 bool Reverse = ConsecutiveStride < 0;
5527 unsigned LoopVectorizationCostModel::getUniformMemOpCost(
Instruction *I,
5533 if (isa<LoadInst>(I)) {
5544 Instruction::ExtractElement,
5548 unsigned LoopVectorizationCostModel::getGatherScatterCost(
Instruction *I,
5560 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(
Instruction *I,
5566 auto Group = getInterleavedAccessGroup(I);
5567 assert(Group &&
"Fail to get an interleaved access group.");
5569 unsigned InterleaveFactor = Group->getFactor();
5575 if (isa<LoadInst>(I)) {
5576 for (
unsigned i = 0; i < InterleaveFactor; i++)
5577 if (Group->getMember(i))
5582 bool UseMaskForGaps =
5583 Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
5585 I->
getOpcode(), WideVecTy, Group->getFactor(), Indices,
5588 if (Group->isReverse()) {
5591 "Reverse masked interleaved access not supported.");
5592 Cost += Group->getNumMembers() *
5598 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(
Instruction *I,
5610 return getWideningCost(I, VF);
5613 LoopVectorizationCostModel::VectorizationCostTy
5614 LoopVectorizationCostModel::getInstructionCost(
Instruction *I,
unsigned VF) {
5617 if (isUniformAfterVectorization(I, VF))
5620 if (VF > 1 && isProfitableToScalarize(I, VF))
5621 return VectorizationCostTy(InstsToScalarize[VF][I],
false);
5624 auto ForcedScalar = ForcedScalars.find(VF);
5625 if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5626 auto InstSet = ForcedScalar->second;
5627 if (InstSet.find(I) != InstSet.end())
5628 return VectorizationCostTy((getInstructionCost(I, 1).
first * VF),
false);
5632 unsigned C = getInstructionCost(I, VF, VectorTy);
5634 bool TypeNotScalarized =
5636 return VectorizationCostTy(C, TypeNotScalarized);
5654 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5666 unsigned Cost = getUniformMemOpCost(&I, VF);
5667 setWideningDecision(&I, VF, CM_Scalarize, Cost);
5672 if (memoryInstructionCanBeWidened(&I, VF)) {
5673 unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5674 int ConsecutiveStride =
5676 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5677 "Expected consecutive stride.");
5679 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5680 setWideningDecision(&I, VF, Decision, Cost);
5686 unsigned NumAccesses = 1;
5687 if (isAccessInterleaved(&I)) {
5688 auto Group = getInterleavedAccessGroup(&I);
5689 assert(Group &&
"Fail to get an interleaved access group.");
5692 if (getWideningDecision(&I, VF) != CM_Unknown)
5695 NumAccesses = Group->getNumMembers();
5696 if (interleavedAccessCanBeWidened(&I, VF))
5697 InterleaveCost = getInterleaveGroupCost(&I, VF);
5700 unsigned GatherScatterCost =
5701 isLegalGatherOrScatter(&I)
5702 ? getGatherScatterCost(&I, VF) * NumAccesses
5705 unsigned ScalarizationCost =
5706 getMemInstScalarizationCost(&I, VF) * NumAccesses;
5712 if (InterleaveCost <= GatherScatterCost &&
5713 InterleaveCost < ScalarizationCost) {
5714 Decision = CM_Interleave;
5715 Cost = InterleaveCost;
5716 }
else if (GatherScatterCost < ScalarizationCost) {
5717 Decision = CM_GatherScatter;
5718 Cost = GatherScatterCost;
5720 Decision = CM_Scalarize;
5721 Cost = ScalarizationCost;
5726 if (
auto Group = getInterleavedAccessGroup(&I))
5727 setWideningDecision(Group, VF, Decision, Cost);
5729 setWideningDecision(&I, VF, Decision, Cost);
5747 if (PtrDef && TheLoop->contains(PtrDef) &&
5748 getWideningDecision(&I, VF) != CM_GatherScatter)
5754 for (
auto *I : AddrDefs)
5756 while (!Worklist.
empty()) {
5759 if (
auto *InstOp = dyn_cast<Instruction>(
Op))
5760 if ((InstOp->getParent() == I->
getParent()) && !isa<PHINode>(InstOp) &&
5761 AddrDefs.insert(InstOp).second)
5765 for (
auto *I : AddrDefs) {
5766 if (isa<LoadInst>(I)) {
5772 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5774 setWideningDecision(I, VF, CM_Scalarize,
5775 (VF * getMemoryInstructionCost(I, 1)));
5776 else if (
auto Group = getInterleavedAccessGroup(I)) {
5778 for (
unsigned I = 0; I < Group->getFactor(); ++
I) {
5780 setWideningDecision(Member, VF, CM_Scalarize,
5781 (VF * getMemoryInstructionCost(Member, 1)));
5787 ForcedScalars[
VF].insert(I);
5791 unsigned LoopVectorizationCostModel::getInstructionCost(
Instruction *I,
5795 if (canTruncateToMinimalBitwidth(I, VF))
5797 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy :
ToVectorTy(RetTy, VF);
5802 case Instruction::GetElementPtr:
5808 case Instruction::Br: {
5812 bool ScalarPredicatedBB =
false;
5815 (PredicatedBBsAfterVectorization.find(BI->
getSuccessor(0)) !=
5816 PredicatedBBsAfterVectorization.end() ||
5817 PredicatedBBsAfterVectorization.find(BI->
getSuccessor(1)) !=
5818 PredicatedBBsAfterVectorization.end()))
5819 ScalarPredicatedBB =
true;
5821 if (ScalarPredicatedBB) {
5827 }
else if (I->
getParent() == TheLoop->getLoopLatch() || VF == 1)
5837 case Instruction::PHI: {
5838 auto *Phi = cast<PHINode>(
I);
5849 if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
5850 return (Phi->getNumIncomingValues() - 1) *
5857 case Instruction::UDiv:
5858 case Instruction::SDiv:
5859 case Instruction::URem:
5860 case Instruction::SRem:
5865 if (VF > 1 && isScalarWithPredication(I)) {
5888 case Instruction::FAdd:
5889 case Instruction::Sub:
5890 case Instruction::FSub:
5891 case Instruction::Mul:
5892 case Instruction::FMul:
5893 case Instruction::FDiv:
5894 case Instruction::FRem:
5895 case Instruction::Shl:
5896 case Instruction::LShr:
5897 case Instruction::AShr:
5898 case Instruction::And:
5899 case Instruction::Or:
5900 case Instruction::Xor: {
5914 unsigned N = isScalarAfterVectorization(I, VF) ?
VF : 1;
5922 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
5929 case Instruction::ICmp:
5930 case Instruction::FCmp: {
5933 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
5940 unsigned Width =
VF;
5943 assert(Decision != CM_Unknown &&
5944 "CM decision should be taken at this point");
5945 if (Decision == CM_Scalarize)
5949 return getMemoryInstructionCost(I, VF);
5951 case Instruction::ZExt:
5952 case Instruction::SExt:
5953 case Instruction::FPToUI:
5954 case Instruction::FPToSI:
5955 case Instruction::FPExt:
5956 case Instruction::PtrToInt:
5957 case Instruction::IntToPtr:
5958 case Instruction::SIToFP:
5959 case Instruction::UIToFP:
5960 case Instruction::Trunc:
5961 case Instruction::FPTrunc:
5962 case Instruction::BitCast: {
5966 if (isOptimizableIVTruncate(I, VF)) {
5967 auto *Trunc = cast<TruncInst>(
I);
5969 Trunc->getSrcTy(), Trunc);
5975 if (canTruncateToMinimalBitwidth(I, VF)) {
5981 Type *MinVecTy = VectorTy;
5982 if (I->
getOpcode() == Instruction::Trunc) {
5986 }
else if (I->
getOpcode() == Instruction::ZExt ||
5994 unsigned N = isScalarAfterVectorization(I, VF) ?
VF : 1;
5998 bool NeedToScalarize;
6015 static const char lv_name[] =
"Loop Vectorization";
6035 bool VectorizeOnlyWhenForced) {
6036 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6041 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(
Instruction *Inst) {
6058 VecValuesToIgnore.insert(Casts.
begin(), Casts.
end());
6065 VecValuesToIgnore.insert(Casts.
begin(), Casts.
end());
6087 assert(UserVF &&
"Expected UserVF for outer loop vectorization.");
6090 buildVPlans(UserVF, UserVF);
6094 return NoVectorization;
6100 dbgs() <<
"LV: Not vectorizing. Inner loops aren't supported in the " 6101 "VPlan-native path.\n");
6102 return NoVectorization;
6112 return NoVectorization;
6119 <<
"LV: Invalidate all interleaved groups due to fold-tail by masking " 6120 "which requires masked-interleaved support.\n");
6121 CM.InterleaveInfo.reset();
6129 CM.selectUserVectorizationFactor(UserVF);
6130 buildVPlansWithVPRecipes(UserVF, UserVF);
6135 unsigned MaxVF = MaybeMaxVF.
getValue();
6136 assert(MaxVF != 0 &&
"MaxVF is zero.");
6138 for (
unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6140 CM.collectUniformsAndScalars(VF);
6145 CM.collectInstsToScalarize(VF);
6148 buildVPlansWithVPRecipes(1, MaxVF);
6151 return NoVectorization;
6154 return CM.selectVectorizationFactor(MaxVF);
6158 LLVM_DEBUG(
dbgs() <<
"Setting best plan to VF=" << VF <<
", UF=" << UF
6163 erase_if(VPlans, [VF](
const VPlanPtr &Plan) {
6164 return !Plan->hasVF(VF);
6166 assert(VPlans.size() == 1 &&
"Best VF has not a single VPlan.");
6174 VPCallbackILV CallbackILV(ILV);
6191 assert(VPlans.size() == 1 &&
"Not a single VPlan to execute.");
6192 VPlans.front()->execute(&State);
6207 if (Cmp && Cmp->hasOneUse())
6208 DeadInstructions.
insert(Cmp);
6217 return U == Ind || DeadInstructions.
find(cast<Instruction>(U)) !=
6218 DeadInstructions.
end();
6220 DeadInstructions.
insert(IndUpdate);
6236 Value *InnerLoopUnroller::reverseVector(
Value *Vec) {
return Vec; }
6238 Value *InnerLoopUnroller::getBroadcastInstrs(
Value *V) {
return V; }
6240 Value *InnerLoopUnroller::getStepVector(
Value *Val,
int StartIdx,
Value *Step,
6261 bool IsUnrollMetadata =
false;
6265 for (
unsigned i = 1, ie = LoopID->
getNumOperands(); i < ie; ++i) {
6270 S && S->getString().startswith(
"llvm.loop.unroll.disable");
6276 if (!IsUnrollMetadata) {
6281 MDString::get(Context,
"llvm.loop.unroll.runtime.disable"));
6293 assert(Range.
End > Range.
Start &&
"Trying to test an empty VF range.");
6296 for (
unsigned TmpVF = Range.
Start * 2; TmpVF < Range.
End; TmpVF *= 2)
6297 if (
Predicate(TmpVF) != PredicateAtRangeStart) {
6302 return PredicateAtRangeStart;
6311 for (
unsigned VF = MinVF; VF < MaxVF + 1;) {
6313 VPlans.push_back(buildVPlan(SubRange));
6323 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6325 if (ECEntryIt != EdgeMaskCache.end())
6326 return ECEntryIt->second;
6328 VPValue *SrcMask = createBlockInMask(Src, Plan);
6332 assert(BI &&
"Unexpected terminator found");
6335 return EdgeMaskCache[Edge] = SrcMask;
6338 assert(EdgeMask &&
"No Edge Mask found for condition");
6341 EdgeMask =
Builder.createNot(EdgeMask);
6344 EdgeMask =
Builder.createAnd(EdgeMask, SrcMask);
6346 return EdgeMaskCache[Edge] = EdgeMask;
6354 if (BCEntryIt != BlockMaskCache.end())
6355 return BCEntryIt->second;
6362 if (!CM.blockNeedsPredication(BB))
6363 return BlockMaskCache[BB] = BlockMask;
6368 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6370 return BlockMaskCache[BB] = BlockMask;
6375 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6377 return BlockMaskCache[BB] = EdgeMask;
6380 BlockMask = EdgeMask;
6384 BlockMask =
Builder.createOr(BlockMask, EdgeMask);
6387 return BlockMaskCache[BB] = BlockMask;
6399 return [=](
unsigned VF) ->
bool {
6401 CM.getWideningDecision(I, VF) ==
6412 "Generating a recipe for an adjunct member of an interleave group");
6416 Mask = createBlockInMask(I->
getParent(), Plan);
6424 if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6427 auto willWiden = [&](
unsigned VF) ->
bool {
6430 if (CM.isScalarAfterVectorization(I, VF) ||
6431 CM.isProfitableToScalarize(I, VF))
6434 CM.getWideningDecision(I, VF);
6436 "CM decision should be taken at this point.");
6438 "Interleave memory opportunity should be caught earlier.");
6447 Mask = createBlockInMask(I->
getParent(), Plan);
6454 if (
PHINode *Phi = dyn_cast<PHINode>(I)) {
6472 auto isOptimizableIVTruncate =
6475 [=](
unsigned VF) ->
bool {
return CM.isOptimizableIVTruncate(K, VF); };
6479 isOptimizableIVTruncate(I), Range))
6481 cast<TruncInst>(I));
6498 for (
unsigned In = 0;
In < NumIncoming;
In++) {
6501 assert((EdgeMask || NumIncoming == 1) &&
6502 "Multiple predecessors with one having a full mask");
6513 [&](
unsigned VF) {
return CM.isScalarWithPredication(I, VF); }, Range);
6518 auto IsVectorizableOpcode = [](
unsigned Opcode) {
6521 case Instruction::And:
6522 case Instruction::AShr:
6523 case Instruction::BitCast:
6524 case Instruction::Br:
6526 case Instruction::FAdd:
6527 case Instruction::FCmp:
6528 case Instruction::FDiv:
6529 case Instruction::FMul:
6530 case Instruction::FPExt:
6531 case Instruction::FPToSI:
6532 case Instruction::FPToUI:
6533 case Instruction::FPTrunc:
6534 case Instruction::FRem:
6535 case Instruction::FSub:
6536 case Instruction::GetElementPtr:
6537 case Instruction::ICmp:
6538 case Instruction::IntToPtr:
6540 case Instruction::LShr:
6541 case Instruction::Mul:
6542 case Instruction::Or:
6543 case Instruction::PHI:
6544 case Instruction::PtrToInt:
6545 case Instruction::SDiv:
6547 case Instruction::SExt:
6548 case Instruction::Shl:
6549 case Instruction::SIToFP:
6550 case Instruction::SRem:
6552 case Instruction::Sub:
6553 case Instruction::Trunc:
6554 case Instruction::UDiv:
6555 case Instruction::UIToFP:
6556 case Instruction::URem:
6557 case Instruction::Xor:
6558 case Instruction::ZExt:
  if (!IsVectorizableOpcode(I->getOpcode()))
    return false;

  if (CallInst *CI = dyn_cast<CallInst>(I)) {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
      return false;
  }

  auto willWiden = [&](unsigned VF) -> bool {
    if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
                             CM.isProfitableToScalarize(I, VF)))
      return false;
    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // Is an intrinsic call cheaper than a vectorized library call?
      bool NeedToScalarize;
      unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
      bool UseVectorIntrinsic =
          ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
      return UseVectorIntrinsic || !NeedToScalarize;
    }
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      assert(CM.getWideningDecision(I, VF) ==
                 LoopVectorizationCostModel::CM_Scalarize &&
             "Memory widening decisions should have been taken care by now");
    }
    return true;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return false;

  // Success: widen this instruction. Try to append I to the last widen
  // recipe, so consecutive widened instructions share a single recipe.
  if (!VPBB->empty()) {
    VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
    if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
      return true;
  }

  VPBB->appendRecipe(new VPWidenRecipe(I));
  return true;
}
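// Note: appending to the last VPWidenRecipe coalesces runs of consecutive
// widenable instructions into a single recipe; this keeps the VPlan compact
// without changing the generated vector code.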
VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);

  // If I uses a predicated instruction it will use that instruction's scalar
  // value, so avoid hoisting the insert-element that packs the scalar value
  // into a vector value.
  for (auto &Op : I->operands())
    if (auto *PredInst = dyn_cast<Instruction>(Op))
      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
        PredInst2Recipe[PredInst]->setAlsoPack(false);

  // Finalize the recipe for I, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for the packing optimization above.
  PredInst2Recipe[I] = Recipe;
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}
// In createReplicateRegion:
  assert(Instr->getParent() &&
         "Predicated instruction not in any basic block");
bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
                                        VPlanPtr &Plan, VPBasicBlock *VPBB) {
  VPRecipeBase *Recipe = nullptr;
  // Try the specific recipes first, from most to least specialized.
  if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan)) ||
      (Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
      (Recipe = tryToOptimizeInduction(Instr, Range)) ||
      (Recipe = tryToBlend(Instr, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }
  if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
    VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
    return true;
  }
  // Fall back to a general widen recipe.
  return tryToWiden(Instr, VPBB, Range);
}
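// Note: the order of the tryTo* calls matters; an interleave-group member,
// for instance, must be claimed by tryToInterleaveMemory before
// tryToWidenMemory would otherwise widen it as an ordinary load or store.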
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
                                                        unsigned MaxVF) {
  // Collect values needing a VPValue definition inside VPlan, such as the
  // conditions of internal conditional branches.
  SmallPtrSet<Value *, 1> NeedDef;

  // When folding the tail by masking, the primary induction variable must be
  // represented in VPlan to model the early-exit mask.
  if (CM.foldTailByMasking())
    NeedDef.insert(Legal->getPrimaryInduction());

  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
    VF = SubRange.End;
  }
}
LoopVectorizationPlanner::VPlanPtr
LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  // Map predicated instructions to their recipes, to fix their AlsoPack
  // behavior if a user later replicates and needs the scalar value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  DenseMap<Instruction *, Instruction *> SinkAfterInverse;

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  auto Plan = llvm::make_unique<VPlan>(VPBB);

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder);
  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Visit the basic blocks in topological order, so each block is processed
  // after its predecessors.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    std::vector<Instruction *> Ingredients;

    // Organize the ingredients to vectorize from the current basic block in
    // the right order.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // Filter out irrelevant instructions: no recipes are built for branches
      // or for instructions that will become trivially dead.
      if (isa<BranchInst>(Instr) ||
          DeadInstructions.find(Instr) != DeadInstructions.end())
        continue;

      // An adjunct member of an interleave group is handled via its group's
      // insert position, so skip it here; but first flush out any instruction
      // that was postponed to sink after it.
      const InterleaveGroup<Instruction> *IG =
          CM.getInterleavedAccessGroup(Instr);
      if (IG && Instr != IG->getInsertPos() &&
          Range.Start >= 2 && // Query is illegal for VF == 1.
          CM.getWideningDecision(Instr, Range.Start) ==
              LoopVectorizationCostModel::CM_Interleave) {
        auto SinkCandidate = SinkAfterInverse.find(Instr);
        if (SinkCandidate != SinkAfterInverse.end())
          Ingredients.push_back(SinkCandidate->second);
        continue;
      }

      // First-order recurrences, step 1: postpone an instruction that must
      // sink after another one until that one is reached.
      auto SAIt = SinkAfter.find(Instr);
      if (SAIt != SinkAfter.end()) {
        LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
                          << *SAIt->second
                          << " to vectorize a 1st order recurrence.\n");
        SinkAfterInverse[SAIt->second] = Instr;
        continue;
      }

      Ingredients.push_back(Instr);

      // Step 2: emit the postponed instruction at its sink point.
      auto SAInvIt = SinkAfterInverse.find(Instr);
      if (SAInvIt != SinkAfterInverse.end())
        Ingredients.push_back(SAInvIt->second);
    }

    // Introduce each ingredient into VPlan.
    for (Instruction *Instr : Ingredients) {
      if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
        continue;

      // Otherwise the instruction is to be replicated; this may create a
      // successor for VPBB.
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard the empty dummy pre-entry VPBasicBlock; its single successor
  // becomes the entry of the resulting VPlan.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  unsigned VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; VF < Range.End; VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}
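// Note: the plan name records every VF the VPlan covers; a plan built for
// VFs 4 and 8 is named "Initial VPlan for VF={4,8},UF>=1".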
LoopVectorizationPlanner::VPlanPtr
LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer-loop handling: the VPlan-native path builds the VPlan upfront from
  // the hierarchical CFG, since the incoming IR cannot be modified.
  assert(!OrigLoop->empty());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan.
  auto Plan = llvm::make_unique<VPlan>();

  // Build hierarchical CFG.
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
    Plan->addVF(VF);

  return Plan;
}
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
  O << " +\n"
    << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << "\\l\"";
  for (unsigned i = 0; i < IG->getFactor(); ++i)
    if (Instruction *I = IG->getMember(i))
      O << " +\n"
        << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
}
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, Trunc);
}
void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(State.Builder, Phi);
  // All PHIs in non-header blocks were converted into selects, so the
  // insertion order does not matter; generate the predication tree directly.
  unsigned NumIncoming = Phi->getNumIncomingValues();
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with predecessors having a full mask");
  // Generate a chain of selects of the form:
  //   SELECT(Mask3, In3, SELECT(Mask2, In2, In1))
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *In0 =
          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current and previous incoming values based on
        // this edge's mask.
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  if (!User)
    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());

  // The last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
}
void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert the scalar instance, packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case only lane 0 of each part is needed.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}
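// Note: a uniform replicated instruction computes the same value in every
// lane, so emitting only lane 0 of each unrolled part suffices.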
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // The block in-mask is all-ones.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose destinations will be set when the blocks are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}
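// Note: both successors of the new conditional branch are deliberately left
// null here; they are wired up later, once the target basic blocks of the
// replicate region have been created.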
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // If a vector value exists for the predicated instruction, it has vector
  // users only, and a phi for the packing insert-element is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update the cache.
  } else {
    // Otherwise a phi for the scalar value is needed.
    PHINode *Phi = State.Builder.CreatePHI(PredInst->getType(), 2);
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  if (!User)
    return State.ILV->vectorizeMemoryInstruction(&Instr);

  // The last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}
// In processLoopInVPlanNativePath (outer-loop vectorization):
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
  LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer-loop vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);

  // Get the user vectorization factor and plan with it.
  const unsigned UserVF = Hints.getWidth();
  const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, UserVF, 1, LVL,
                         &CM);
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  return true;
}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->empty()) &&
         "VPlan-native path is not enabled. Only process inner loops.");
  Function *F = L->getHeader()->getParent();
  const std::string DebugLocStr = getDebugLocString(L);

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" << F->getName()
                    << "\" from " << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");
  // Check that the loop's hints allow vectorization.
  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }
  // Check the function attributes to see if the function should be optimized
  // for size.
  bool OptForSize =
      Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();

  // Look for a trip-count estimate, preferring a constant trip count over
  // profile data, and profile data over the SCEV upper bound.
  unsigned ExpectedTC = 0;
  bool HasExpectedTC = false;
  if (const SCEVConstant *ConstExits =
          dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
    const APInt &ExitsCount = ConstExits->getAPInt();
    // Only small trip counts are interesting; skip values that cannot fit in
    // an unsigned.
    if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
      ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
      HasExpectedTC = true;
    }
  }
  if (!HasExpectedTC) {
    if (auto EstimatedTC = getLoopEstimatedTripCount(L)) {
      ExpectedTC = *EstimatedTC;
      HasExpectedTC = true;
    }
  }
  if (!HasExpectedTC) {
    ExpectedTC = SE->getSmallConstantMaxTripCount(L);
    HasExpectedTC = (ExpectedTC > 0);
  }
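// Note: the backedge-taken count is one less than the trip count, hence the
// "+ 1" above; e.g. a loop that runs 8 iterations has a BTC of 7.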
  if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      // Vectorize tiny-trip-count loops only under the size-optimizing rules,
      // so that any scalar overhead makes them unprofitable.
      OptForSize = true;
    }
  }
  // Under NoImplicitFloat we cannot widen to vector types, which may
  // implicitly use FP registers.
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
                         "attribute is used.\n");
    ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
                                     "NoImplicitFloat", L)
              << "loop not vectorized due to NoImplicitFloat attribute");
    Hints.emitRemarkWithHints();
    return false;
  }
  // If the loop may use FP operations under relaxed semantics that the target
  // does not support, bail out.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    LLVM_DEBUG(
        dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
    ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
                                     "UnsafeFP", L)
              << "loop not vectorized due to unsafe FP support.");
    Hints.emitRemarkWithHints();
    return false;
  }
  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved)
    IAI.analyzeInterleavedAccesses(useMaskedInterleavedAccesses(*TTI));

  // Use the cost model.
  LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);

  // Get the user vectorization factor and plan with it.
  unsigned UserVF = Hints.getWidth();
  VectorizationFactor VF = LVP.plan(OptForSize, UserVF);

  // Select the interleave count.
  unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);

  // Get the user interleave count.
  unsigned UserIC = Hints.getInterleave();
  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }
  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }
  if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }
  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;
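// Note: a user-provided interleave count always overrides the cost model,
// even when the diagnostics above deemed interleaving not beneficial.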
  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }
  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If it is not legal or profitable to vectorize the loop, interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks about strides and memory: a scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }
  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue())
    L->setLoopID(RemainderLoopID.getValue());
  else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }
  return true;
}
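// Note: if the original loop carried llvm.loop.vectorize.followup_epilogue
// metadata, it is transferred to the remaining scalar loop instead of the
// default unroll-disable / already-vectorized marks.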
  bool Changed = false;

  // Build up a worklist of inner loops to vectorize.
  SmallVector<Loop *, 8> Worklist;
  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
}
Legacy wrapper pass to provide the GlobalsAAResult object.
Pass interface - Implemented by all 'passes'.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value *> &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop)...
Value * CreateInBoundsGEP(Value *Ptr, ArrayRef< Value *> IdxList, const Twine &Name="")
Type * getVectorElementType() const
Value * getValueOperand()
unsigned getSmallConstantTripCount(const Loop *L)
Returns the maximum trip count of the loop if it is a single-exit loop and we can compute a small max...
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Value * getOrCreateScalarValue(Value *V, const VPIteration &Instance)
Return a value in the new loop corresponding to V from the original loop at unroll and vector indices...
Main class to build the VPlan H-CFG for an incoming IR.
A parsed version of the target data layout string in and methods for querying it. ...
const_iterator end(StringRef path)
Get end iterator over path.
void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction, which must be an operator which supports these flags.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
Type * getWidestInductionType()
Returns the widest induction type.
bool processLoop(Loop *L)
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
void ReplaceInstWithInst(BasicBlock::InstListType &BIL, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
This class is the base class for the comparison instructions.
bool appendInstruction(Instruction *Instr)
Augment the recipe to include Instr, if it lies at its End.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
unsigned getWidth() const
static bool runImpl(Function &F, TargetLibraryInfo &TLI, DominatorTree &DT)
This is the entry point for all transforms.
DemandedBits * DB
Demanded bits analysis.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
BinaryOperator * getInductionBinOp() const
static IntegerType * getInt1Ty(LLVMContext &C)
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
InstWidening
Decision that was taken during cost calculation for memory instruction.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
uint64_t getZExtValue() const
Get zero extended value.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
bool isLegalMaskedScatter(Type *DataType)
Returns true if the target machine supports masked scatter operation for the given DataType...
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock *> Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF=1)
Returns true if I is a memory instruction with consecutive memory access that can be widened...
DiagnosticInfoOptimizationBase::Argument NV
void setFast(bool B=true)
bool isMaskRequired(const Instruction *I)
Returns true if vector representation of the instruction I requires mask.
void execute(VPTransformState &State) override
Generate the wide load/store.
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
VPInterleaveRecipe * tryToInterleaveMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan)
Check if belongs to an Interleave Group within the given VF Range,.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
This class represents lattice values for constants.
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr)
Get the interleaved access group that Instr belongs to.
size_type size() const
Determine the number of elements in the SetVector.
This is the interface for a simple mod/ref and alias analysis over globals.
Pass * createLoopVectorizePass(bool InterleaveOnlyWhenForced=false, bool VectorizeOnlyWhenForced=false)
A Module instance is used to store all the information related to an LLVM module. ...
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, unsigned Align, const char *Name)
Provided to resolve 'CreateAlignedLoad(Ptr, Align, "...")' correctly, instead of converting the strin...
void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance, bool IfPredicateInstr)
A helper function to scalarize a single Instruction in the innermost loop.
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
std::unique_ptr< LoopVersioning > LVer
LoopVersioning.
void emitSCEVChecks(Loop *L, BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
static MDString * get(LLVMContext &Context, StringRef Str)
Interval Class - An Interval is a set of nodes defined such that every node in the interval has all o...
VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost...
ConstantInt * getConstIntStepValue() const
Min/max implemented in terms of select(cmp()).
This class represents zero extension of integer types.
Value * getScalarValue(Value *Key, const VPIteration &Instance)
Retrieve the existing scalar value that corresponds to Key and Instance.
Instruction::BinaryOps getInductionOpcode() const
Returns binary opcode of the induction operator.
void push_back(const T &Elt)
Value * createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL)
Returns a bitcasted value to the requested vector type.
static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, const TargetTransformInfo &TTI)
Estimate the overhead of scalarizing an instruction.
This provides a very simple, boring adaptor for a begin and end iterator into a range type...
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
value_op_iterator value_op_begin()
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value *> VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal, MD_access_group].
static void AddRuntimeUnrollDisableMetaData(Loop *L)
The main scalar evolution driver.
bool isUniform(Value *V)
Returns true if the value V is uniform within the loop.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class...
static std::string getDebugLocString(const Loop *L)
This is a helper struct for maintaining vectorization state.
This class represents a function call, abstracting a target machine's calling convention.
const ValueToValueMap & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF)
Vectorize a single PHINode in a block.
DominatorTree * DT
Dominator Tree.
void collectTriviallyDeadInstructions(SmallPtrSetImpl< Instruction *> &DeadInstructions)
Collect the instructions from the original loop that would be trivially dead in the vectorized loop i...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
An immutable pass that tracks lazily created AssumptionCache objects.
VPWidenMemoryInstructionRecipe * tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan)
Check if is a memory instruction to be widened for Range.Start and potentially masked.
static Type * largestIntegerVectorType(Type *T1, Type *T2)
void fixLCSSAPHIs()
The Loop exit block may have single value PHI nodes with some incoming value.
Helper class to create VPRecipies from IR instructions.
A cache of @llvm.assume calls within a function.
Analysis pass providing the TargetTransformInfo.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Instruction *EntryVal)
Create a vector induction phi node based on an existing scalar one.
Externally visible function.
Loop * TheLoop
The loop that we evaluate.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
bool tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan, VPBasicBlock *VPBB)
Check if a recipe can be create for I withing the given VF Range.
LLVMContext & getContext() const
All values hold a context through their type.
void initializeLoopVectorizePass(PassRegistry &)
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop *> &V)
A Recipe for widening load/store operations.
Value * TripCount
Trip count of the original loop.
void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID, const Instruction *EntryVal, Value *VectorLoopValue, unsigned Part, unsigned Lane=UINT_MAX)
If there is a cast involved in the induction variable ID, which should be ignored in the vectorized l...
This class implements a map that also provides access to all stored values in a deterministic order...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
value_op_iterator value_op_end()
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
BasicBlock * getSuccessor(unsigned i) const
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
InductionKind getKind() const
STATISTIC(NumFunctions, "Total number of functions")
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Analysis pass which computes a DominatorTree.
const MDOperand & getOperand(unsigned I) const
An instruction for reading from memory.
Value * getCondition() const
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
bool isVectorTy() const
True if this is an instance of VectorType.
bool hasAnyScalarValue(Value *Key) const
virtual Value * getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps Opcode=Instruction::BinaryOpsEnd)
This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) to each vector element of Val...
This defines the Use class.
VPBlockBase * getSingleSuccessor() const
Value * getStartValue() const
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
const TargetLibraryInfo * TLI
Target Library Info.
static const char *const LLVMLoopVectorizeFollowupAll
iterator end()
Get an iterator to the end of the SetVector.
LLVMContext & getContext() const
Get the context in which this basic block lives.
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
VPValue * createBlockInMask(BasicBlock *BB, VPlanPtr &Plan)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM)
static bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op...
iterator begin()
Instruction iterator methods.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, unsigned Align, bool isVolatile=false)
bool isIdenticalTo(const Instruction *I) const
Return true if the specified instruction is exactly identical to the current one. ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
bool formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
bool hasAnyVectorValue(Value *Key) const
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
virtual ~InnerLoopVectorizer()=default
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
bool isScalarWithPredication(Instruction *I, unsigned VF=1)
Returns true if I is an instruction that will be scalarized with predication.
AnalysisUsage & addRequired()
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
#define INITIALIZE_PASS_DEPENDENCY(depName)
void print(raw_ostream &O, const Twine &Indent) const override
Print the recipe.
bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range)
Check if I can be widened within the given VF Range.
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication, or in order to mask away gaps.
This file defines the LoopVectorizationLegality class.
This class represents the LLVM 'select' instruction.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal)
This is the base class for all instructions that perform data casts.
bool isFloatingPointTy() const
Return true if this is one of the six floating-point types.
SmallVector< PHINode *, 8 > OrigPHIsToFix
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
A Use represents the edge between a Value definition and its users.
iterator find(ConstPtrType Ptr) const
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
bool hasVectorValue(Value *Key, unsigned Part) const
void setIsInBounds(bool b=true)
Set or clear the inbounds flag on this GEP instruction.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
const TargetLibraryInfo * TLI
Target Library Info.
bool isIntegerTy() const
True if this is an instance of IntegerType.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Value * emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, const InductionDescriptor &ID) const
Compute the transformed value of Index at offset StartValue using step StepValue. ...
const Function * TheFunction
An analysis that produces DemandedBits for a function.
This file contains the simple types necessary to represent the attributes associated with functions a...
A recipe for handling all phi nodes except for integer and FP inductions.
Legacy analysis pass which computes BlockFrequencyInfo.
void setName(const Twine &newName)
void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, unsigned Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF...
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
void setName(const Twine &Name)
Change the name of the value.
Analysis pass that exposes the LoopInfo for a function.
RPOIterator endRPO() const
bool isOptimizableIVTruncate(Instruction *I, unsigned VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable...
uint64_t getNumElements() const
BasicBlock * LoopExitBlock
The ExitBlock of the scalar loop.
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize.
Value * getOrCreateVectorTripCount(Loop *NewLoop)
Returns (and creates if needed) the trip count of the widened loop.
CallInst * CreateMaskedGather(Value *Ptrs, unsigned Align, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
static const unsigned TinyTripCountInterleaveThreshold
We don't interleave loops with a known constant trip count below this number.
virtual Value * reverseVector(Value *Vec)
Generate a shuffle sequence that will reverse the vector Vec.
This file implements a class to represent arbitrary precision integral constant values and operations...
BlockT * getHeader() const
VPBlendRecipe * tryToBlend(Instruction *I, VPlanPtr &Plan)
Handle non-loop phi nodes.
int isConsecutivePtr(Value *Ptr)
Check if this pointer is consecutive when vectorizing.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following: ...
Fast - This calling convention attempts to make calls as fast as possible (e.g.
InstTy * getMember(unsigned Index) const
Get the member with the given index Index.
Class to represent function types.
static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI)
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Type * getType() const
All values are typed, get the type of this value.
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
VPRegionBlock * createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe, VPlanPtr &Plan)
Create a replicating region for instruction I that requires predication.
Drive the analysis of interleaved memory accesses in the loop.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool isInductionVariable(const Value *V)
Returns True if V can be considered as an induction variable in this loop.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static bool isEqual(const Function &Caller, const Function &Callee)
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
This file provides a LoopVectorizationPlanner class.
void fixNonInductionPHIs(void)
Fix the non-induction PHIs in the OrigPHIsToFix vector.
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
const T & getValue() const LLVM_LVALUE_FUNCTION
void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc=nullptr)
Widen an integer or floating-point induction variable IV.
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< unsigned > VFs)
void addTopLevelLoop(LoopT *New)
This adds the specified loop to the collection of top-level loops.
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE bool empty() const
empty - Check if the string is empty.
iterator begin()
Get an iterator to the beginning of the SetVector.
static Type * smallestIntegerVectorType(Type *T1, Type *T2)
The group of interleaved loads/stores sharing the same stride and close to each other.
This header provides classes for managing per-loop analyses.
static CmpInst * Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2, const Twine &Name="", Instruction *InsertBefore=nullptr)
Construct a compare instruction, given the opcode, the predicate and the two operands.
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
ReductionList * getReductionVars()
Returns the reduction variables found in the loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
iterator_range< User::op_iterator > arg_operands()
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
A struct that represents some properties of the register usage of a loop.
Not an induction variable.
Value * getOrCreateTripCount(Loop *NewLoop)
Returns (and creates if needed) the original loop trip count.
Value * getLoadStorePointerOperand(Value *V)
A helper function that returns the pointer operand of a load or store instruction.
An instruction for storing to memory.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction...
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool blockNeedsPredication(BasicBlock *BB)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="")
bool isLegalGatherOrScatter(Value *V)
Returns true if the target machine can represent V as a masked gather or scatter operation.
void takeName(Value *V)
Transfer the name from V to this value.
Optional< const DILocation * > cloneWithDuplicationFactor(unsigned DF) const
Returns a new DILocation with duplication factor DF * current duplication factor encoded in the discr...
void perform(LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Value * getOrCreateVectorValue(Value *V, unsigned Part)
getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a vector or scalar value on-...
iterator find(const KeyT &Key)
void truncateToMinimalBitwidths()
Shrinks vector element sizes to the smallest bitwidth they can be legally represented as...
VectorType * getType() const
Overload to return most specific vector type.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
BasicBlock * createVectorizedLoopSkeleton()
Create a new empty loop.
void execute(VPTransformState &State) override
Produce widened copies of all Ingredients.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type *> Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
This class represents a truncation of integer types.
void resetScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar)
Reset the scalar value associated with Key for Part and Lane.
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, Instruction *InsertBefore, Value *FlagsOp)
bool isPredicatedInst(Instruction *I)
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Value * getOperand(unsigned i) const
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, std::function< const LoopAccessInfo &(Loop &)> &GetLAA_, OptimizationRemarkEmitter &ORE)
Pointer induction var. Step = C / sizeof(elem).
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs as needed to retain SSA form.
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
iterator find(const_arg_type_t< KeyT > Val)
void widenInstruction(Instruction &I)
Widen a single instruction within the innermost loop.
Value * CreateFCmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
std::pair< Instruction *, Instruction * > addRuntimeChecks(Instruction *Loc) const
Add code that checks at runtime if the accessed arrays overlap.
bool isVoidTy() const
Return true if this is 'void'.
bool InterleaveOnlyWhenForced
If false, consider all loops for interleaving.
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe...
an instruction for type-safe pointer arithmetic to access elements of arrays and structs ...
void setBestPlan(unsigned VF, unsigned UF)
Finalize the best decision and dispose of all other VPlans.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata *> MDs)
static bool runOnFunction(Function &F, bool PostInlining)
enum ForceKind getForce() const
initializer< Ty > init(const Ty &Val)
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *CountRoundDown, Value *EndValue, BasicBlock *MiddleBlock)
Set up the values of the IVs correctly when exiting the vector loop.
This instruction inserts a single (scalar) element into a VectorType value.
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
Integer induction variable. Step = C.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
A set of analyses that are preserved following a run of a transformation pass.
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF)
A helper function that returns true if the given type is irregular.
Value * concatenateVectors(IRBuilder<> &Builder, ArrayRef< Value *> Vecs)
Concatenate a list of vectors.
LLVM Basic Block Representation.
The instances of the Type class are immutable: once they are created, they are never changed...
bool isProfitableToScalarize(Instruction *I, unsigned VF) const
This is an important class for using LLVM in a threaded context.
void fixVectorizedLoop()
Fix the vectorized code, taking care of header phi's, live-outs, and more.
ConstantInt * getTrue()
Get the constant value for i1 true.
Conditional or Unconditional Branch instruction.
Min/max implemented in terms of select(cmp()).
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Value handle that tracks a Value across RAUW.
BasicBlock * LoopScalarBody
The scalar loop body.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
bool ult(const APInt &RHS) const
Unsigned less than comparison.
size_t size() const
size - Get the array size.
const char * getOpcodeName() const
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static Instruction * getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or it's operands.
This is an important base class in LLVM.
bool hasScalarValue(Value *Key, const VPIteration &Instance) const
This analysis provides dependence information for the memory accesses of a loop.
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
InsertPoint saveIP() const
Returns the current insert point.
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr)
Set the debug location in the builder using the debug location in the instruction.
const SCEV * getStep() const
Constant * createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool getDecisionAndClampRange(const std::function< bool(unsigned)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Value * createMinMaxOp(IRBuilder<> &Builder, RecurrenceDescriptor::MinMaxRecurrenceKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
bool isPointerTy() const
True if this is an instance of PointerType.
static Value * addFastMathFlag(Value *V)
A helper function that adds a 'fast' flag to floating-point operations.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
A manager for alias analyses.
unsigned selectInterleaveCount(bool OptForSize, unsigned VF, unsigned LoopCost)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
static const char *const LLVMLoopVectorizeFollowupEpilogue
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
std::unique_ptr< VPlan > VPlanPtr
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
unsigned getAlignment() const
Represent the analysis usage information of a pass.
PHINode * Induction
The new Induction variable which was added to the new block.
OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I=nullptr)
Create an analysis remark that explains why vectorization failed.
unsigned getNumLoads() const
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly...
bool hasFunNoNaNAttr() const
bool optForSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
bool isReductionVariable(PHINode *PN)
Returns True if PN is a reduction variable in this loop.
Analysis pass providing a never-invalidated alias analysis result.
Value * expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I)
Insert code to directly compute the specified SCEV expression into the program.
void fixCrossIterationPHIs()
Handle all cross-iteration phis in the header.
void fixFirstOrderRecurrence(PHINode *Phi)
Fix a first-order recurrence.
FunctionPass class - This class is used to implement most global optimizations.
unsigned getIndex(const InstTy *Instr) const
Get the index for the given member.
static FunctionType * get(Type *Result, ArrayRef< Type *> Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
VectorizationFactor selectVectorizationFactor(unsigned MaxVF)
Constant * createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
unsigned UF
The vectorization unroll factor to use.
Optional< unsigned > getLoopEstimatedTripCount(Loop *L)
Get a loop's estimated trip count based on branch weight metadata.
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
static CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd)
Create a BitCast AddrSpaceCast, or a PtrToInt cast instruction.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch...
unsigned LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
const Value * getCondition() const
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
AssumptionCache * AC
Assumption cache.
bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
AssumptionCache * AC
Assumption Cache.
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs, and aliases.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
static void connectBlocks(VPBlockBase *From, VPBlockBase *To)
Connect VPBlockBases From and To bi-directionally.
static unsigned getIncomingValueNumForOperand(unsigned i)
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
static wasm::ValType getType(const TargetRegisterClass *RC)
RecurrenceKind getRecurrenceKind()
const std::string & getModuleIdentifier() const
Get the module identifier which is, essentially, the name of the module.
void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar)
Set a scalar value associated with Key and Instance.
bool requiresScalarEpilogue() const
Returns true if an interleaved group requires a scalar iteration to handle accesses with gaps...
static unsigned getRecurrenceBinOp(RecurrenceKind Kind)
Returns the opcode of binary operation corresponding to the RecurrenceKind.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="")
void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
static const char *const LLVMLoopVectorizeFollowupVectorized
BasicBlock * LoopVectorBody
The vector loop body.
void fixReduction(PHINode *Phi)
Fix a reduction cross-iteration phi.
void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT)
Generate the IR code for the body of the vectorized loop according to the best selected VPlan...
Planner drives the vectorization process after having passed Legality checks.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateGEP(Value *Ptr, ArrayRef< Value *> IdxList, const Twine &Name="")
A function analysis which provides an AssumptionCache.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
static Constant * getSplat(unsigned NumElts, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
bool isLegalMaskedGather(Type *DataType)
Returns true if the target machine supports masked gather operation for the given DataType...
const VPRecipeBase & back() const
A SetVector that performs no allocations if smaller than a certain size.
Analysis pass which computes BlockFrequencyInfo.
Iterator for intrusive lists based on ilist_node.
unsigned getNumOperands() const
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
static bool mayDivideByZero(Instruction &I)
A helper function for checking whether an integer division-related instruction may divide by zero (in...
LoopVectorizationLegality * Legal
Vectorization legality.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPValue *Condition, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
bool VectorizeOnlyWhenForced
If false, consider all loops for vectorization.
This is the shared class of boolean and integer constants.
const SCEVUnionPredicate & getUnionPredicate() const
Type * getType() const
Return the LLVM type of this SCEV expression.
A struct for saving information about induction variables.
BlockVerifier::State From
A range of powers-of-2 vectorization factors with fixed start and adjustable end. ...
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false...
unsigned MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
bool canFoldTailByMasking()
Return true if we can vectorize this loop while folding its tail by masking.
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
bool isFunctionVectorizable(StringRef F, unsigned VF) const
This holds vectorization requirements that must be verified late in the process.
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::init(false), cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
bool isAccessInterleaved(Instruction *Instr)
Check if Instr belongs to any interleaved access group.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
Module.h This file contains the declarations for the Module class.
Value * CreateInsertElement(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name="")
Provides information about what library functions are available for the current target.
void buildVPlans(unsigned MinVF, unsigned MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan)
A helper function that computes the predicate of the edge between SRC and DST.
Predicate
Predicate - These are "(BI << 5) | BO" for various predicates.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
size_type count(const KeyT &Key) const
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isAggregateType() const
Return true if the type is an aggregate type.
bool isFirstOrderRecurrence(const PHINode *Phi)
Returns True if Phi is a first-order recurrence in this loop.
LoopT * AllocateLoop(ArgsTy &&... Args)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID)
Compute scalar induction steps.
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, unsigned Align, Value *Mask)
Create a call to Masked Store intrinsic.
LLVM_NODISCARD T pop_back_val()
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getMaxSafeDepDistBytes()
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Drive the analysis of memory accesses in the loop.
static Constant * getSignedIntOrFpConstant(Type *Ty, int64_t C)
A helper function that returns an integer or floating-point constant with value C.
VectorizationFactor plan(bool OptForSize, unsigned UserVF)
Plan how to best vectorize, return the best VF and its cost.
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
static BranchInst * Create(BasicBlock *IfTrue, Instruction *InsertBefore=nullptr)
bool isConditional() const
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will have (use 0 if you really have no idea).
pred_range predecessors(BasicBlock *BB)
const VPBlocksTy & getSuccessors() const
static Constant * get(Type *Ty, double V)
This returns a ConstantFP, or a vector containing a splat of a ConstantFP, for the specified value in the specified type.
CallInst * CreateMaskedLoad(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
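A matching sketch for the load side (again, the helper and its operand assumptions are illustrative): Ptr is assumed to point to <4 x i32>; disabled lanes take their value from PassThru, here a zero splat.

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Sketch: emit a masked load of <4 x i32>; masked-off lanes read PassThru.
static Value *emitMaskedLoad(IRBuilder<> &Builder, Value *Ptr, Value *Mask) {
  Type *VecTy = VectorType::get(Builder.getInt32Ty(), 4);
  Value *PassThru = Constant::getNullValue(VecTy); // zero for disabled lanes
  return Builder.CreateMaskedLoad(Ptr, /*Align=*/4, Mask, PassThru,
                                  "wide.masked.load");
}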
bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the intrinsic has a scalar operand.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
unsigned getFactor() const
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
StringRef getVectorizedFunction(StringRef F, unsigned VF) const
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent to: C.erase(remove_if(C.begin(), C.end(), pred), C.end());
void setOperand(unsigned i, Value *Val)
Value * createTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI, RecurrenceDescriptor &Desc, Value *Src, bool NoNaN=false)
Create a generic target reduction using a recurrence descriptor Desc. The target is queried to determine if intrinsics or shuffle sequences are required to implement the reduction.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
unsigned getVectorNumElements() const
static Constant * getRecurrenceIdentity(RecurrenceKind K, Type *Tp)
Returns identity corresponding to the RecurrenceKind.
Store the result of a depth first search within basic blocks contained by a single loop.
void clear()
Completely clear the SetVector.
DenseMap< PHINode *, Value * > IVEndValues
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide load/store and shuffles.
Class to represent vector types.
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not have a module.
Class for arbitrary precision integers.
bool isInductionPhi(const Value *V)
Returns True if V is a Phi node of an induction variable in this loop.
void collectUniformsAndScalars(unsigned VF)
Collect Uniform and Scalar values for the given VF.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
void addMetadata(Instruction *To, Instruction *From)
Add metadata from one instruction to another.
const LoopAccessInfo * getLAI() const
void resetVectorValue(Value *Key, unsigned Part, Value *Vector)
Reset the vector value associated with Key for the given Part.
LoopInfo * LI
Loop Info analysis.
iterator_range< user_iterator > users()
static const char lv_name[]
This class uses information about analyzed scalars to rewrite expressions in canonical form.
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints)
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
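For example, with TripCount = 17, VF = 4 and UF = 2 the widened loop executes 16 iterations (17 - 17 % 8 = 16), and the one remaining iteration is left to the scalar epilogue.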
void vectorizeMemoryInstruction(Instruction *Instr, VectorParts *BlockInMask=nullptr)
Vectorize Load and Store instructions, optionally masking the vector operations if BlockInMask is non-null.
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
ValueT lookup(const KeyT &Key) const
const TargetTransformInfo & TTI
Vector target information.
ConstantInt * getFalse()
Get the constant value for i1 false.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant when guarded with a runtime SCEV overflow check.
LoopVectorizationCostModel * Cost
The profitability analysis.
Analysis pass that exposes the ScalarEvolution for a function.
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, unsigned VF, InstWidening W, unsigned Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector width VF.
Constant * createStrideMask(IRBuilder<> &Builder, unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
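createStrideMask(Builder, Start, Stride, VF) produces the constant mask <Start, Start+Stride, ..., Start+(VF-1)*Stride>; for Start = 0, Stride = 2, VF = 4 that is <0, 2, 4, 6>, which extracts member 0 of a factor-2 interleave group. A hedged usage sketch (helper and variable names are illustrative):

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Sketch: pick the even lanes (member 0 of a factor-2 group) out of WideVec,
// which is assumed to be an 8-lane wide load for a VF of 4.
static Value *extractEvenLanes(IRBuilder<> &Builder, Value *WideVec) {
  Constant *EvenMask =
      createStrideMask(Builder, /*Start=*/0, /*Stride=*/2, /*VF=*/4);
  Value *Undef = UndefValue::get(WideVec->getType());
  return Builder.CreateShuffleVector(WideVec, Undef, EvenMask, "strided.vec");
}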
uint64_t getTypeSizeInBits(Type *Ty) const
Size examples:
Type        SizeInBits  StoreSizeInBits  AllocSizeInBits[*]
----        ----------  ---------------  ---------------
i1            1           8                8
i8            8           8                8
i19          19          24               32
i32          32          32               32
i100        100         104              128
i128        128         128              128
Float        32          32               32
Double       64          64               64
X86_FP80     80          80               96
[*] The alloc size depends on the alignment, and thus on the target.
PHINode * OldInduction
The induction variable of the old basic block.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
bool blockNeedsPredication(BasicBlock *BB)
bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF=1)
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vectorized with wide vector loads/stores and shuffles.
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment padding.
LoopT * getParentLoop() const
static unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks. If we return X, we are assuming the predicated block will execute once for every X iterations of the loop header.
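A sketch of what such a helper can look like under the common assumption that a predicated block executes on roughly half of the header's iterations (the constant here is an assumption, not a verbatim quote of the source):

// Sketch: a predicated block assumed to run once every 2 header iterations
// has a reciprocal block probability of 2.
static unsigned getReciprocalPredBlockProb() { return 2; }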
Analysis pass providing a never-invalidated alias analysis result.
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration overheads are incurred.
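A hypothetical invocation that lowers this threshold so that shorter loops are still considered (the flag name comes from the declaration above; input.ll is a placeholder):

opt -loop-vectorize -vectorizer-min-trip-count=4 -S input.ll -o vectorized.ll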
This file provides various utilities for inspecting and working with the control flow graph in LLVM IR.
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
This analysis provides dependence information for the memory accesses of a loop.
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
uint64_t getMaxSafeRegisterWidth() const
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
static Type * getMemInstValueType(Value *I)
A helper function that returns the type of loaded or stored value.
void registerAssumption(CallInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
void packScalarIntoVectorValue(Value *V, const VPIteration &Instance)
Construct the vector value of a scalarized value V one lane at a time.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect ScalarEvolution's ability to compute a trip count.
bool areSafetyChecksAdded()
static unsigned getVectorCallCost(CallInst *CI, unsigned VF, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, bool &NeedToScalarize)
This class represents an analyzed expression in the program.
Value * getVectorValue(Value *Key, unsigned Part)
Retrieve the existing vector value that corresponds to Key and Part.
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original scalar type, one per lane, instead of producing a single copy of widened type for all lanes.
LLVM_NODISCARD bool empty() const
bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const
Floating point induction variable.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
Represents a single loop in the control flow graph.
VPBasicBlock * handleReplication(Instruction *I, VFRange &Range, VPBasicBlock *VPBB, DenseMap< Instruction *, VPReplicateRecipe *> &PredInst2Recipe, VPlanPtr &Plan)
Build a VPReplicateRecipe for I and enclose it within a Region if it is predicated.
InductionList * getInductionVars()
Returns the induction variables found in the loop.
StringRef getName() const
Return a constant reference to the value's name.
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation.
const Function * getParent() const
Return the enclosing method, or null if none.
bool isUniformAfterVectorization(Instruction *I, unsigned VF) const
Returns true if I is known to be uniform after vectorization.
void collectInstsToScalarize(unsigned VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
static Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
user_iterator_impl< User > user_iterator
bool isLegalMaskedStore(Type *DataType, Value *Ptr)
Returns true if the target machine supports masked store operation for the given DataType and kind of...
void execute(VPTransformState &State) override
Generate the vectorized and scalarized versions of the phi node as needed by their users.
PHINode * createInductionVariable(Loop *L, Value *Start, Value *End, Value *Step, Instruction *DL)
Create a new induction variable inside L.
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Optional< unsigned > computeMaxVF(bool OptForSize)
iterator_range< value_op_iterator > operand_values()
Loop * OrigLoop
The original loop.
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
unsigned getWideningCost(Instruction *I, unsigned VF)
Return the vectorization cost for the given instruction I and vector width VF.
void preserve()
Mark an analysis as preserved.
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value *> Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector and scalar values.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
InstWidening getWideningDecision(Instruction *I, unsigned VF)
Return the cost model decision for the given instruction I and vector width VF.
LLVM_NODISCARD bool empty() const
void setVectorValue(Value *Key, unsigned Part, Value *Vector)
Set a vector value associated with Key and Part.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
void updateAnalysis()
Insert the new loop to the loop hierarchy and pass manager and update the analysis passes.
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse post-order traversal of blocks in a loop body.
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Analysis pass providing the TargetLibraryInfo.
unsigned getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
AliasAnalysis * AA
Alias Analysis.
LoopVectorizationLegality * Legal
The legality analysis.
const SmallVectorImpl< const SCEVPredicate * > & getPredicates() const
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
Constant * createInterleaveMask(IRBuilder<> &Builder, unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
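createInterleaveMask(Builder, VF, NumVecs) builds the lane permutation that interleaves NumVecs member vectors of VF lanes each; for VF = 4 and NumVecs = 2 the mask is <0, 4, 1, 5, 2, 6, 3, 7>. A hedged usage sketch (helper and variable names are illustrative):

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Sketch: interleave two <4 x T> member vectors into one <8 x T> value,
// ready to be stored for a factor-2 interleave group.
static Value *interleaveTwoVectors(IRBuilder<> &Builder, Value *V0,
                                   Value *V1) {
  Constant *IMask = createInterleaveMask(Builder, /*VF=*/4, /*NumVecs=*/2);
  return Builder.CreateShuffleVector(V0, V1, IMask, "interleaved.vec");
}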
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
SmallPtrSet< Instruction *, 8 > & getCastInsts()
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned VF
The vectorization SIMD factor to use.
CallInst * CreateMaskedScatter(Value *Val, Value *Ptrs, unsigned Align, Value *Mask=nullptr)
Create a call to Masked Scatter intrinsic.
A raw_ostream that writes to an std::string.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
VPWidenRecipe is a recipe for producing a copy of vector type for each Instruction in its ingredients.
unsigned getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
IRBuilder Builder
The builder that we use.
uint64_t PowerOf2Floor(uint64_t A)
Returns the power of two which is less than or equal to the given value.
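A few concrete values, shown as a self-contained sketch:

#include "llvm/Support/MathExtras.h"
#include <cassert>

// PowerOf2Floor rounds down to a power of two; exact powers of two are
// returned unchanged.
static void powerOf2FloorExamples() {
  assert(llvm::PowerOf2Floor(17) == 16);
  assert(llvm::PowerOf2Floor(16) == 16);
  assert(llvm::PowerOf2Floor(1) == 1);
}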
InstTy * getInsertPos() const
Module * getParent()
Get the module that this global value is contained inside of.
LLVM Value Representation.
virtual Value * getBroadcastInstrs(Value *V)
Create a broadcast instruction.
A recipe for generating conditional branches on the bits of a mask.
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type. For example, returns 5 for i36 and 10 for x86_fp80.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr)
Returns true if the target machine supports masked load operation for the given DataType and kind of access to Ptr.
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct a VectorType.
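A minimal sketch of constructing the IR type <4 x i32> with this method (the helper name is illustrative):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

// Sketch: build <4 x i32> from a scalar element type.
static VectorType *makeV4I32(LLVMContext &Ctx) {
  return VectorType::get(Type::getInt32Ty(Ctx), /*NumElements=*/4);
}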
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
BasicBlock::iterator GetInsertPoint() const
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right before MovePos.
This class implements an extremely fast bulk output stream that can only output to a stream.
uint64_t getTypeAllocSizeInBits(Type *Ty) const
Returns the offset in bits between successive objects of the specified type, including alignment padding.
Type * getElementType() const
The legacy pass manager's analysis pass to compute loop information.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
static Type * ToVectorTy(Type *Scalar, unsigned VF)
A helper function for converting Scalar types to vector types.
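A sketch of what this helper plausibly reduces to, given its description (treat it as an assumption, not a verbatim quote of the source):

#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

// Sketch: widen Scalar to <VF x Scalar>; VF == 1 (and void) stay scalar.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}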
const TargetTransformInfo * TTI
Target Transform Info.
bool hasOneUse() const
Return true if there is exactly one user of this value.
Convenience struct for specifying and reasoning about fast-math flags.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
This is the interface for LLVM's primary stateless and local alias analysis.
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term "output IR" refers to code that is generated by the vectorizer.
A container for analyses that lazily runs them and caches their results.
VectorizerValueMap VectorLoopValueMap
Maps values from the original loop to their corresponding values in the vectorized loop.
Legacy analysis pass which computes a DominatorTree.
void selectUserVectorizationFactor(unsigned UserVF)
Setup cost-based decisions for user vectorization factor.
bool shouldScalarizeInstruction(Instruction *I) const
Returns true if an instruction I should be scalarized instead of vectorized for the chosen vectorization factor.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
void setIncomingValue(unsigned i, Value *V)
void perform(LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
unsigned getNumOperands() const
Return number of MDNode operands.
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, Instruction *InsertBefore, Value *FlagsOp)
static cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
unsigned getNumStores() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g. alias info) from the instructions in this group to NewInst.
iterator_range< block_iterator > blocks() const
VPWidenIntOrFpInductionRecipe * tryToOptimizeInduction(Instruction *I, VFRange &Range)
Check if an induction recipe should be constructed for I within the given VF Range.
void vectorizeInterleaveGroup(Instruction *Instr, VectorParts *BlockInMask=nullptr)
Try to vectorize the interleaved access group that Instr belongs to, optionally masking the vector operations if BlockInMask is non-null.
void setCostBasedWideningDecision(unsigned VF)
Memory access instruction may be vectorized in more than one way.
bool isScalarAfterVectorization(Instruction *I, unsigned VF) const
Returns true if I is known to be scalar after vectorization.
static Constant * get(ArrayRef< Constant *> V)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
DenseMap< Instruction *, Instruction * > & getSinkAfter()
Return the set of instructions to sink to handle first-order recurrences.
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
RecurrenceKind
This enum represents the kinds of recurrences that we support.
const BasicBlock * getParent() const
This class represents a constant integer value.
bool needsScalarInduction(Instruction *IV) const
Returns true if we should generate a scalar version of IV.
bool requiresScalarEpilogue() const
Returns true if this Group requires a scalar iteration to handle gaps.
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM)
Legacy wrapper pass to provide the BasicAAResult object.
bool Need
This flag indicates if we need to add the runtime check.
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR unit.
Type * getRecurrenceType()
Returns the type of the recurrence.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.