using namespace llvm;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
    MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
                           cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<int> ScheduleRegionSizeBudget(
    "slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

// ...

/// \returns True if all of the values in \p VL are in the same basic block.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  BasicBlock *BB = I0->getParent();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || I->getParent() != BB)
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
  for (Value *i : VL)
    if (!isa<Constant>(i))
      return false;
  return true;
}

/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
  for (unsigned i = 1, e = VL.size(); i < e; ++i)
    if (VL[i] != VL[0])
      return false;
  return true;
}
/// Checks whether a list of extractelement instructions can be represented as
/// a single shuffle of at most two source vectors.
/// \returns the kind of shuffle, or None if the list is not a valid shuffle.
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
  auto *EI0 = cast<ExtractElementInst>(VL[0]);
  unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = cast<ExtractElementInst>(VL[I]);
    auto *Vec = EI->getVectorOperand();
    // All vector operands must have the same number of vector elements.
    if (Vec->getType()->getVectorNumElements() != Size)
      return None;
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    if (!Idx)
      return None;
    // Undefined behavior if Idx is negative or >= Size.
    if (Idx->getValue().uge(Size))
      continue;
    unsigned IntIdx = Idx->getValue().getZExtValue();
    // We can extractelement from an undef vector.
    if (isa<UndefValue>(Vec))
      continue;
    // For correct shuffling we have to have at most 2 different vector
    // operands in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec)
      Vec1 = Vec;
    else if (!Vec2 || Vec2 == Vec)
      Vec2 = Vec;
    else
      return None;
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (IntIdx != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're blending two sources without crossing lanes, it is a select.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // Otherwise all extracts come from a single vector: an identity or some
  // permutation of one source.
  return TargetTransformInfo::SK_PermuteSingleSrc;
}
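// Illustrative sketch of the two shuffle kinds above (hypothetical IR, not
// from this file): the bundle
//   %e0 = extractelement <2 x i8> %v1, i32 0
//   %e1 = extractelement <2 x i8> %v2, i32 1
// blends two sources lane-for-lane and classifies as SK_Select, whereas
//   %e0 = extractelement <2 x i8> %v1, i32 1
//   %e1 = extractelement <2 x i8> %v1, i32 0
// crosses lanes of a single source and classifies as SK_PermuteSingleSrc.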
namespace {

/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main/alternate instruction.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const {
    return MainOp ? MainOp->getOpcode() : 0;
  }

  unsigned getAltOpcode() const {
    return AltOp ? AltOp->getOpcode() : 0;
  }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};

} // end anonymous namespace

/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
/// \p OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
    return Op;
  return S.OpValue;
}

/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState: the opcode under which the whole list could be
/// vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       unsigned BaseIndex = 0) {
  // Make sure these are all Instructions.
  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
      Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
      Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      continue;
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}
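// For example, for the bundle {%a = add i32 %x, %y; %b = sub i32 %z, %w}
// getSameOpcode() returns MainOp == %a (add) and AltOp == %b (sub), so
// isAltShuffle() is true and the bundle is later emitted as one vector add,
// one vector sub, and a blending shufflevector.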
/// \returns true if all of the values in \p VL have the same type.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return false;
  return true;
}

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static Optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return None;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return None;
  return *EI->idx_begin();
}

/// \returns True if the in-tree use of \p Scalar by \p UserInst also needs an
/// extract (i.e. the scalar operand stays scalar in the vectorized user).
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    // ...
    return false;
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
  // ...

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
        DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize = TTI->getRegisterBitWidth(true);

    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }
  /// Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  // ...

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    ExternalUses.clear();
    NumOpsWantToKeepOrder.clear();
    NumOpsWantToKeepOriginalOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
  }

  // ...

  /// Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// \returns The best order of scalars in the tree.
  Optional<ArrayRef<unsigned>> bestOrder() const {
    auto I = std::max_element(
        NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
        [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
           const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
          return D1.second < D2.second;
        });
    if (I == NumOpsWantToKeepOrder.end() ||
        I->getSecond() <= NumOpsWantToKeepOriginalOrder)
      return None;

    return makeArrayRef(I->getFirst());
  }

  /// \return The vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V.
  unsigned getVectorElementSize(Value *V);

  // ...

  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  /// \returns True if the tree is of "tiny" size and cannot be profitably
  /// vectorized.
  bool isTreeTinyAndNotFullyVectorizable();

private:
  /// \returns the cost of the vectorizable entry.
  int getEntryCost(TreeEntry *E);

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Set the Builder insert point to one after the last instruction in
  /// the bundle.
  void setInsertPointAfterBundle(ArrayRef<Value *> VL,
                                 const InstructionsState &S);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree();

  /// Reorder the operands of bundles with alternate opcodes.
  void reorderAltShuffleOperands(const InstructionsState &S,
                                 ArrayRef<Value *> VL,
                                 SmallVectorImpl<Value *> &Left,
                                 SmallVectorImpl<Value *> &Right);

  struct TreeEntry {
    TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      if (VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
      return VL.size() == ReuseShuffleIndices.size() &&
             std::equal(
                 VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
                 [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    Value *VectorizedValue = nullptr;

    /// Do we need to gather this sequence ?
    bool NeedToGather = false;

    // ...

    /// Points back to the VectorizableTree.
    std::vector<TreeEntry> &Container;
  };

  /// Create a new VectorizableTree entry.
  void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
                    ArrayRef<unsigned> ReuseShuffleIndices = None,
                    ArrayRef<unsigned> ReorderIndices = None) {
    VectorizableTree.emplace_back(VectorizableTree);
    int idx = VectorizableTree.size() - 1;
    TreeEntry *Last = &VectorizableTree[idx];
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
    Last->NeedToGather = !Vectorized;
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    Last->ReorderIndices = ReorderIndices;
    if (Vectorized) {
      for (int i = 0, e = VL.size(); i != e; ++i) {
        assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
        ScalarToTreeEntry[VL[i]] = idx;
      }
    } else {
      MustGather.insert(VL.begin(), VL.end());
    }

    if (UserTreeIdx >= 0)
      Last->UserTreeIndices.push_back(UserTreeIdx);
    UserTreeIdx = idx;
  }

  /// -- Vectorization State --
  /// Holds all of the tree entries.
  std::vector<TreeEntry> VectorizableTree;

  TreeEntry *getTreeEntry(Value *V) {
    auto I = ScalarToTreeEntry.find(V);
    if (I != ScalarToTreeEntry.end())
      return &VectorizableTree[I->second];
    return nullptr;
  }

  // ...

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, int L)
        : Scalar(S), User(U), Lane(L) {}

    /// Which scalar in our function.
    Value *Scalar;

    /// Which user that uses the scalar.
    llvm::User *User;

    /// Which lane does the scalar belong to.
    int Lane;
  };

  /// Checks if two instructions may access the same memory, caching the
  /// result of the alias query.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // ...
    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    Optional<bool> &result = AliasCache[key];
    if (result.hasValue())
      return result.getValue();
    MemoryLocation Loc2 = getLocation(Inst2, AA);
    bool aliased = true;
    if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
      // Do the alias check.
      aliased = AA->alias(Loc1, Loc2);
    }
    // Store the result in the cache.
    result = aliased;
    return aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  // ...

  /// Removes an instruction from its block and eventually deletes it.
  /// The actual deletion is delayed until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) {
    I->removeFromParent();
    I->dropAllReferences();
    DeletedInstructions.emplace_back(I);
  }
  /// Contains all scheduling-relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID, Value *OpVal) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      UnscheduledDepsInBundle = UnscheduledDeps;
      clearDependencies();
      OpValue = OpVal;
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return UnscheduledDepsInBundle == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies, also updating it for
    /// the whole bundle.
    int incrementUnscheduledDeps(int Incr) {
      UnscheduledDeps += Incr;
      return FirstInBundle->UnscheduledDepsInBundle += Incr;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst = nullptr;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle = nullptr;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;

    /// The number of dependencies: the number of users of the instruction
    /// plus the number of dependent memory instructions (if any). Calculated
    /// on demand; InvalidDeps means not calculated yet.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// already-scheduled instructions.
    int UnscheduledDeps = InvalidDeps;

    /// The sum of UnscheduledDeps in a bundle.
    int UnscheduledDepsInBundle = InvalidDeps;

    /// True if this instruction is scheduled (or considered as scheduled in
    /// the dry-run).
    bool IsScheduled = false;

    /// Opcode of the current instruction in the schedule data.
    Value *OpValue = nullptr;
  };

  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
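  // For example, a bundle {%a, %b} where %a has two unscheduled dependencies
  // and %b has one starts with UnscheduledDepsInBundle == 3; each time a
  // depending bundle is scheduled, incrementUnscheduledDeps(-1) runs on the
  // affected member, and the bundle becomes isReady() once the sum hits zero.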
  /// Contains all scheduling-relevant data for a basic block.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;

      // Reduce the maximum schedule region size by the size of the previous
      // scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Value *V) {
      ScheduleData *SD = ScheduleDataMap[V];
      if (SD && SD->SchedulingRegionID == SchedulingRegionID)
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V, Value *Key) {
      if (V == Key)
        return getScheduleData(V);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end()) {
        ScheduleData *SD = I->second[Key];
        if (SD && SD->SchedulingRegionID == SchedulingRegionID)
          return SD;
      }
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }

    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;

      ScheduleData *BundleMember = SD;
      while (BundleMember) {
        if (BundleMember->Inst != BundleMember->OpValue) {
          BundleMember = BundleMember->NextInBundle;
          continue;
        }
        // Handle the def-use chain dependencies.
        for (Use &U : BundleMember->Inst->operands()) {
          auto *I = dyn_cast<Instruction>(U.get());
          if (!I)
            continue;
          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
            if (OpDef && OpDef->hasValidDependencies() &&
                OpDef->incrementUnscheduledDeps(-1) == 0) {
              // There are no more unscheduled dependencies after decrementing,
              // so we can put the dependent instruction into the ready list.
              ScheduleData *DepBundle = OpDef->FirstInBundle;
              assert(!DepBundle->IsScheduled &&
                     "already scheduled bundle gets ready");
              ReadyList.insert(DepBundle);
              LLVM_DEBUG(dbgs()
                         << "SLP:    gets ready (def): " << *DepBundle << "\n");
            }
          });
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (mem): " << *DepBundle << "\n");
          }
        }
        BundleMember = BundleMember->NextInBundle;
      }
    }

    void doForAllOpcodes(Value *V,
                         function_ref<void(ScheduleData *SD)> Action) {
      if (ScheduleData *SD = getScheduleData(V))
        Action(SD);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end())
        for (auto &P : I->second)
          if (P.second->SchedulingRegionID == SchedulingRegionID)
            Action(P.second);
    }

    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        doForAllOpcodes(I, [&](ScheduleData *SD) {
          if (SD->isSchedulingEntity() && SD->isReady()) {
            ReadyList.insert(SD);
            LLVM_DEBUG(dbgs()
                       << "SLP:    initially in ready list: " << *I << "\n");
          }
        });
      }
    }

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies.
    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                           const InstructionsState &S);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);

    /// Allocates schedule data chunks.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all
    /// instructions/bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    // ...

    /// Attaches ScheduleData to an Instruction with a leading key.
    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
        ExtraScheduleDataMap;

    struct ReadyList : SmallVector<ScheduleData *, 8> {
      void insert(ScheduleData *SD) { push_back(SD); }
    };

    /// The ready-list for scheduling (only used for the dry-run).
    ReadyList ReadyInsts;

    // ...

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    // ...

    /// The ID of the scheduling region. For a new vectorization iteration
    /// this is incremented, which "removes" all ScheduleData from the region.
    /// The initial value must be greater than the initial SchedulingRegionID
    /// in ScheduleData (which is 0).
    int SchedulingRegionID = 1;
  };
  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  // ...

  using OrdersType = SmallVector<unsigned, 4>;

  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
  /// sorted SmallVectors of unsigned.
  struct OrdersTypeDenseMapInfo {
    // ...

    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }

    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
      return LHS == RHS;
    }
  };

  // ...

  /// Number of bundles that do not require reordering.
  unsigned NumOpsWantToKeepOriginalOrder = 0;

  // ...

  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
};

} // end namespace slpvectorizer

template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;
  using NodeRef = TreeEntry *;

  /// Add the VectorizableTree to the index iterator to be able to return
  /// TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<ChildIteratorType,
                                     SmallVector<int, 1>::iterator> {
    std::vector<TreeEntry> &VectorizableTree;

    ChildIteratorType(SmallVector<int, 1>::iterator W,
                      std::vector<TreeEntry> &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return &VectorizableTree[*I]; }
  };

  static ChildIteratorType child_begin(NodeRef N) {
    return {N->UserTreeIndices.begin(), N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {N->UserTreeIndices.end(), N->Container};
  }

  // ...

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};

template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}

  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    if (isSplat(Entry->Scalars)) {
      OS << "<splat> " << *Entry->Scalars[0];
      return Str;
    }
    for (auto V : Entry->Scalars) {
      OS << *V;
      if (std::any_of(
              R->ExternalUses.begin(), R->ExternalUses.end(),
              [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->NeedToGather)
      return "color=red";
    return "";
  }
};
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        ArrayRef<Value *> UserIgnoreLst) {
  ExtraValueToDebugLocsMap ExternallyUsedValues;
  buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        ExtraValueToDebugLocsMap &ExternallyUsedValues,
                        ArrayRef<Value *> UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, -1);

  // Collect the values that we need to extract from the tree.
  for (TreeEntry &EIdx : VectorizableTree) {
    TreeEntry *Entry = &EIdx;

    // No need to handle users of gathered values.
    if (Entry->NeedToGather)
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      int FoundLane = Lane;
      if (!Entry->ReuseShuffleIndices.empty()) {
        FoundLane =
            std::distance(Entry->ReuseShuffleIndices.begin(),
                          llvm::find(Entry->ReuseShuffleIndices, FoundLane));
      }

      // Check if the scalar is externally used as an extra arg.
      auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
      }
      for (User *U : Scalar->users()) {
        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst)
          continue;

        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in Lane 0 will
          // be used.
          if (UseScalar != U ||
              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(!UseEntry->NeedToGather && "Bad state");
            continue;
          }
        }

        // Ignore users in the user ignore list.
        if (is_contained(UserIgnoreList, UserInst))
          continue;

        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
      }
    }
  }
}
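// For example, if lane 1 of a vectorized bundle is also read by a scalar
// instruction outside the tree, an ExternalUser record (Scalar, U, FoundLane)
// is kept so that vectorizeTree() can later emit the matching extractelement
// and rewire the out-of-tree use to it.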
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            int UserTreeIdx) {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  InstructionsState S = getSameOpcode(VL);
  if (Depth == RecursionMaxDepth) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    newTreeEntry(VL, false, UserTreeIdx);
    return;
  }

  // Don't handle vectors.
  if (S.OpValue->getType()->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, false, UserTreeIdx);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, false, UserTreeIdx);
      return;
    }

  // If all of the operands are identical or constant we have a simple
  // solution.
  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
    newTreeEntry(VL, false, UserTreeIdx);
    return;
  }

  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Don't vectorize ephemeral values.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (EphValues.count(VL[i])) {
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
                        << ") is ephemeral.\n");
      newTreeEntry(VL, false, UserTreeIdx);
      return;
    }
  }

  // Check if this is a duplicate of another entry.
  if (TreeEntry *E = getTreeEntry(S.OpValue)) {
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
    if (!E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
      newTreeEntry(VL, false, UserTreeIdx);
      return;
    }
    // Record the reuse of the tree node.
    E->UserTreeIndices.push_back(UserTreeIdx);
    LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
                      << ".\n");
    return;
  }

  // Check that none of the instructions in the bundle are already in the
  // tree.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    auto *I = dyn_cast<Instruction>(VL[i]);
    if (!I)
      continue;
    if (getTreeEntry(I)) {
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
                        << ") is already in tree.\n");
      newTreeEntry(VL, false, UserTreeIdx);
      return;
    }
  }

  // If any of the scalars is marked as a value that needs to stay scalar,
  // then we need to gather the scalars.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
      newTreeEntry(VL, false, UserTreeIdx);
      return;
    }
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  auto *VL0 = cast<Instruction>(S.OpValue);
  BasicBlock *BB = VL0->getParent();

  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, false, UserTreeIdx);
    return;
  }

  // Check that every instruction appears once in this bundle.
  SmallVector<unsigned, 4> ReuseShuffleIndicies;
  SmallVector<Value *, 4> UniqueValues;
  DenseMap<Value *, unsigned> UniquePositions;
  for (Value *V : VL) {
    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
    ReuseShuffleIndicies.emplace_back(Res.first->second);
    if (Res.second)
      UniqueValues.emplace_back(V);
  }
  if (UniqueValues.size() == VL.size()) {
    ReuseShuffleIndicies.clear();
  } else {
    LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
    if (UniqueValues.size() <= 1 ||
        !llvm::isPowerOf2_32(UniqueValues.size())) {
      LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
      newTreeEntry(VL, false, UserTreeIdx);
      return;
    }
    VL = UniqueValues;
  }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = llvm::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef.get();

  if (!BS.tryScheduleBundle(VL, this, S)) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL0) ||
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
    return;
  }
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp = S.isAltShuffle() ?
                (unsigned) Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
    case Instruction::PHI: {
      PHINode *PH = dyn_cast<PHINode>(VL0);

      // Check for terminator values (e.g. invoke).
      for (unsigned j = 0; j < VL.size(); ++j)
        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
          Instruction *Term = dyn_cast<Instruction>(
              cast<PHINode>(VL[j])->getIncomingValueForBlock(
                  PH->getIncomingBlock(i)));
          if (Term && Term->isTerminator()) {
            LLVM_DEBUG(dbgs()
                       << "SLP: Need to swizzle PHINodes (terminator use).\n");
            BS.cancelScheduling(VL, VL0);
            newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
            return;
          }
        }

      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
              PH->getIncomingBlock(i)));

        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
      }
      return;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement: {
      OrdersType CurrentOrder;
      bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
      if (Reuse) {
        LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
        ++NumOpsWantToKeepOriginalOrder;
        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
                     ReuseShuffleIndicies);
        return;
      }
      if (!CurrentOrder.empty()) {
        LLVM_DEBUG({
          dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                    "with order";
          for (unsigned Idx : CurrentOrder)
            dbgs() << " " << Idx;
          dbgs() << "\n";
        });
        // Insert a new order with an initial count of 0, if it does not
        // exist; otherwise get the iterator to the existing one.
        auto StoredCurrentOrderAndNum =
            NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
        ++StoredCurrentOrderAndNum->getSecond();
        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
                     ReuseShuffleIndicies,
                     StoredCurrentOrderAndNum->getFirst());
        return;
      }
      LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
      newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx,
                   ReuseShuffleIndicies);
      BS.cancelScheduling(VL, VL0);
      return;
    }
    case Instruction::Load: {
      // Check that a vectorized load would load the same memory as a scalar
      // load.
      Type *ScalarTy = VL0->getType();

      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy)) {
        BS.cancelScheduling(VL, VL0);
        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
        return;
      }

      // Make sure all loads in the bundle are simple - we can't vectorize
      // atomic or volatile loads.
      SmallVector<Value *, 4> PointerOps(VL.size());
      auto POIter = PointerOps.begin();
      for (Value *V : VL) {
        auto *L = cast<LoadInst>(V);
        if (!L->isSimple()) {
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
          return;
        }
        *POIter = L->getPointerOperand();
        ++POIter;
      }

      OrdersType CurrentOrder;
      // Check the order of pointer operands.
      if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
        Value *Ptr0;
        Value *PtrN;
        if (CurrentOrder.empty()) {
          Ptr0 = PointerOps.front();
          PtrN = PointerOps.back();
        } else {
          Ptr0 = PointerOps[CurrentOrder.front()];
          PtrN = PointerOps[CurrentOrder.back()];
        }
        const SCEV *Scev0 = SE->getSCEV(Ptr0);
        const SCEV *ScevN = SE->getSCEV(PtrN);
        const auto *Diff =
            dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
        uint64_t Size = DL->getTypeAllocSize(ScalarTy);
        // Check that the sorted loads are consecutive.
        if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) {
          if (CurrentOrder.empty()) {
            // Original loads are consecutive and do not require reordering.
            ++NumOpsWantToKeepOriginalOrder;
            newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
                         ReuseShuffleIndicies);
            LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
          } else {
            // Need to reorder.
            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
            ++I->getSecond();
            newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
                         ReuseShuffleIndicies, I->getFirst());
            LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
          }
          return;
        }
      }

      LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
      BS.cancelScheduling(VL, VL0);
      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
      return;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      Type *SrcTy = VL0->getOperand(0)->getType();
      for (unsigned i = 0; i < VL.size(); ++i) {
        Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
        if (Ty != SrcTy || !isValidElementType(Ty)) {
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs()
                     << "SLP: Gathering casts with different src types.\n");
          return;
        }
      }
      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
      }
      return;
    }
    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Check that all of the compares have the same predicate.
      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      Type *ComparedTy = VL0->getOperand(0)->getType();
      for (unsigned i = 1, e = VL.size(); i < e; ++i) {
        CmpInst *Cmp = cast<CmpInst>(VL[i]);
        if (Cmp->getPredicate() != P0 ||
            Cmp->getOperand(0)->getType() != ComparedTy) {
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs()
                     << "SLP: Gathering cmp with different predicate.\n");
          return;
        }
      }

      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
      }
      return;
    }
    case Instruction::Select:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n");

      // Sort operands of the instructions so that each side is more likely
      // to have the same opcode.
      if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
        ValueList Left, Right;
        reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right);
        buildTree_rec(Left, Depth + 1, UserTreeIdx);
        buildTree_rec(Right, Depth + 1, UserTreeIdx);
        return;
      }

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
      }
      return;
    case Instruction::GetElementPtr: {
      // We don't combine GEPs with complicated (nested) indexes.
      for (unsigned j = 0; j < VL.size(); ++j) {
        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
          LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          return;
        }
      }

      // We can't combine several GEPs into one vector if they operate on
      // different types.
      Type *Ty0 = VL0->getOperand(0)->getType();
      for (unsigned j = 0; j < VL.size(); ++j) {
        Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
        if (Ty0 != CurTy) {
          LLVM_DEBUG(dbgs()
                     << "SLP: not-vectorizable GEP (different types).\n");
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          return;
        }
      }

      // We don't combine GEPs with non-constant indexes.
      for (unsigned j = 0; j < VL.size(); ++j) {
        auto Op = cast<Instruction>(VL[j])->getOperand(1);
        if (!isa<ConstantInt>(Op)) {
          LLVM_DEBUG(dbgs()
                     << "SLP: not-vectorizable GEP (non-constant indexes).\n");
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          return;
        }
      }

      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
      for (unsigned i = 0, e = 2; i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
      }
      return;
    }
    case Instruction::Store: {
      // Check if the stores are consecutive or if we need to swizzle them.
      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
          return;
        }

      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");

      ValueList Operands;
      for (Value *j : VL)
        Operands.push_back(cast<Instruction>(j)->getOperand(0));

      buildTree_rec(Operands, Depth + 1, UserTreeIdx);
      return;
    }
    case Instruction::Call: {
      // Check if the calls are all to the same vectorizable intrinsic.
      CallInst *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (!isTriviallyVectorizable(ID)) {
        BS.cancelScheduling(VL, VL0);
        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
        LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
        return;
      }
      Function *Int = CI->getCalledFunction();
      Value *A1I = nullptr;
      // ...
      for (unsigned i = 1, e = VL.size(); i != e; ++i) {
        CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
        if (!CI2 || CI2->getCalledFunction() != Int ||
            getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
            !CI->hasIdenticalOperandBundleSchema(*CI2)) {
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                            << "\n");
          return;
        }
        // Some intrinsics have a scalar argument - don't replace it with a
        // vector.
        if (A1I) {
          Value *A1J = CI2->getArgOperand(1);
          if (A1I != A1J) {
            BS.cancelScheduling(VL, VL0);
            newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
            LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                              << " argument " << A1I << "!=" << A1J << "\n");
            return;
          }
        }
        // Verify that the bundle operands are identical between the two
        // calls.
        if (CI->hasOperandBundles() &&
            !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                        CI->op_begin() + CI->getBundleOperandsEndIndex(),
                        CI2->op_begin() +
                            CI2->getBundleOperandsStartIndex())) {
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
                            << *CI << "!=" << *VL[i] << '\n');
          return;
        }
      }

      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL) {
          CallInst *CI2 = dyn_cast<CallInst>(j);
          Operands.push_back(CI2->getArgOperand(i));
        }
        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
      }
      return;
    }
    case Instruction::ShuffleVector:
      // If this is not an alternate sequence of opcodes like add-sub, then do
      // not vectorize this instruction.
      if (!S.isAltShuffle()) {
        BS.cancelScheduling(VL, VL0);
        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
        LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
        return;
      }
      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");

      // Reorder operands if reordering would enable vectorization.
      if (isa<BinaryOperator>(VL0)) {
        ValueList Left, Right;
        reorderAltShuffleOperands(S, VL, Left, Right);
        buildTree_rec(Left, Depth + 1, UserTreeIdx);
        buildTree_rec(Right, Depth + 1, UserTreeIdx);
        return;
      }

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
      }
      return;

    default:
      BS.cancelScheduling(VL, VL0);
      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
      return;
  }
}
unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
  unsigned N;
  Type *EltTy;
  auto *ST = dyn_cast<StructType>(T);
  if (ST) {
    N = ST->getNumElements();
    EltTy = *ST->element_begin();
  } else {
    N = cast<ArrayType>(T)->getNumElements();
    EltTy = cast<ArrayType>(T)->getElementType();
  }
  if (!isValidElementType(EltTy))
    return 0;
  uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL.getTypeStoreSizeInBits(T))
    return 0;
  if (ST) {
    // Check that struct is homogeneous.
    for (const auto *Ty : ST->elements())
      if (Ty != EltTy)
        return 0;
  }
  return N;
}

bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
                              SmallVectorImpl<unsigned> &CurrentOrder) const {
  Instruction *E0 = cast<Instruction>(OpValue);
  assert(E0->getOpcode() == Instruction::ExtractElement ||
         E0->getOpcode() == Instruction::ExtractValue);
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    const DataLayout &DL = E0->getModule()->getDataLayout();
    NElts = canMapToVector(Vec->getType(), DL);
    if (!NElts)
      return false;
    // Check if load can be rewritten as load of vector.
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
      return false;
  } else {
    NElts = Vec->getType()->getVectorNumElements();
  }

  if (NElts != VL.size())
    return false;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  unsigned E = VL.size();
  // Assign to all items the initial value E + 1 so we can check if the
  // extract instruction index was used already. Later we check that all the
  // indices are used and that we have a consecutive access in the extract
  // instructions, by checking that no element of CurrentOrder still has the
  // value E + 1.
  CurrentOrder.assign(E, E + 1);
  unsigned I = 0;
  for (; I < E; ++I) {
    auto *Inst = cast<Instruction>(VL[I]);
    if (Inst->getOperand(0) != Vec)
      break;
    Optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      break;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx != I) {
      if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
        break;
      ShouldKeepOrder = false;
      CurrentOrder[ExtIdx] = I;
    } else {
      if (CurrentOrder[I] != E + 1)
        break;
      CurrentOrder[I] = I;
    }
  }
  if (I < E) {
    CurrentOrder.clear();
    return false;
  }

  return ShouldKeepOrder;
}
bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
  return I->hasOneUse() ||
         std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
           return ScalarToTreeEntry.count(U) > 0;
         });
}

int BoUpSLP::getEntryCost(TreeEntry *E) {
  ArrayRef<Value *> VL = E->Scalars;

  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
    ScalarTy = CI->getOperand(0)->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  // If we have computed a smaller type for the expression, update VecTy so
  // that the costs will be accurate.
  if (MinBWs.count(VL[0]))
    VecTy = VectorType::get(
        IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());

  unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
  int ReuseShuffleCost = 0;
  if (NeedToShuffleReuses) {
    ReuseShuffleCost =
        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
  }
  if (E->NeedToGather) {
    if (allConstant(VL))
      return 0;
    if (isSplat(VL)) {
      return ReuseShuffleCost +
             TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
    }
    if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement) {
      Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
      if (ShuffleKind.hasValue()) {
        int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
        for (auto *V : VL) {
          // If all users of the instruction are going to be vectorized and
          // this instruction itself is not going to be vectorized, consider
          // this extractelement as dead and remove its cost from the final
          // cost of the vectorized tree.
          if (areAllUsersVectorized(cast<Instruction>(V)) &&
              !ScalarToTreeEntry.count(V)) {
            auto *IO = cast<ConstantInt>(
                cast<ExtractElementInst>(V)->getIndexOperand());
            Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                            IO->getZExtValue());
          }
        }
        return ReuseShuffleCost + Cost;
      }
    }
    return ReuseShuffleCost + getGatherCost(VL);
  }

  InstructionsState S = getSameOpcode(VL);
  assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
  Instruction *VL0 = cast<Instruction>(S.OpValue);
  unsigned ShuffleOrOp = S.isAltShuffle() ?
               (unsigned) Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
    case Instruction::PHI:
      return 0;

    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
      if (NeedToShuffleReuses) {
        unsigned Idx = 0;
        for (unsigned I : E->ReuseShuffleIndices) {
          if (ShuffleOrOp == Instruction::ExtractElement) {
            auto *IO = cast<ConstantInt>(
                cast<ExtractElementInst>(VL[I])->getIndexOperand());
            Idx = IO->getZExtValue();
            ReuseShuffleCost -= TTI->getVectorInstrCost(
                Instruction::ExtractElement, VecTy, Idx);
          } else {
            ReuseShuffleCost -= TTI->getVectorInstrCost(
                Instruction::ExtractElement, VecTy, Idx);
            ++Idx;
          }
        }
        Idx = ReuseShuffleNumbers;
        for (Value *V : VL) {
          if (ShuffleOrOp == Instruction::ExtractElement) {
            auto *IO = cast<ConstantInt>(
                cast<ExtractElementInst>(V)->getIndexOperand());
            Idx = IO->getZExtValue();
          } else {
            --Idx;
          }
          ReuseShuffleCost +=
              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
        }
      }
      if (!E->NeedToGather) {
        int DeadCost = ReuseShuffleCost;
        if (!E->ReorderIndices.empty()) {
          // TODO: Merge this shuffle with the ReuseShuffleCost.
          DeadCost += TTI->getShuffleCost(
              TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
        }
        for (unsigned i = 0, e = VL.size(); i < e; ++i) {
          Instruction *E = cast<Instruction>(VL[i]);
          // If all users are going to be vectorized, the instruction can be
          // considered as dead. The same holds if it has only one user: it
          // will be vectorized for sure.
          if (areAllUsersVectorized(E)) {
            // Take credit for an instruction that will become dead.
            if (E->hasOneUse()) {
              Instruction *Ext = E->user_back();
              if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
                  all_of(Ext->users(),
                         [](User *U) { return isa<GetElementPtrInst>(U); })) {
                // Use getExtractWithExtendCost() to calculate the cost of the
                // extractelement/ext pair.
                DeadCost -= TTI->getExtractWithExtendCost(
                    Ext->getOpcode(), Ext->getType(), VecTy, i);
                // Add back the cost of s|zext, which is subtracted
                // separately.
                DeadCost += TTI->getCastInstrCost(
                    Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
                continue;
              }
            }
            DeadCost -=
                TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
          }
        }
        return DeadCost;
      }
      return ReuseShuffleCost + getGatherCost(VL);
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      Type *SrcTy = VL0->getOperand(0)->getType();
      int ScalarEltCost =
          TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }

      // Calculate the cost of this instruction.
      int ScalarCost = VL.size() * ScalarEltCost;

      VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
      int VecCost = 0;
      // Check if the values are candidates to demote.
      if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
        VecCost = ReuseShuffleCost +
                  TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0);
      }
      return VecCost - ScalarCost;
    }
    case Instruction::FCmp:
    case Instruction::ICmp:
    case Instruction::Select: {
      // Calculate the cost of this instruction.
      int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy,
                                                  Builder.getInt1Ty(), VL0);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
      int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
      int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0);
      return ReuseShuffleCost + VecCost - ScalarCost;
    }
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Certain instructions can be cheaper to vectorize if they have a
      // constant second vector operand.
      TargetTransformInfo::OperandValueKind Op1VK =
          TargetTransformInfo::OK_AnyValue;
      TargetTransformInfo::OperandValueKind Op2VK =
          TargetTransformInfo::OK_UniformConstantValue;
      TargetTransformInfo::OperandValueProperties Op1VP =
          TargetTransformInfo::OP_None;
      TargetTransformInfo::OperandValueProperties Op2VP =
          TargetTransformInfo::OP_PowerOf2;

      // Scan the second operands: if any is not a power-of-2 constant, drop
      // the OP_PowerOf2 property; if any is not a constant at all, fall back
      // to OK_AnyValue.
      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
        const Instruction *I = cast<Instruction>(VL[i]);
        ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1));
        if (!CInt) {
          Op2VK = TargetTransformInfo::OK_AnyValue;
          Op2VP = TargetTransformInfo::OP_None;
          break;
        }
        if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
            !CInt->getValue().isPowerOf2())
          Op2VP = TargetTransformInfo::OP_None;
      }

      SmallVector<const Value *, 4> Operands(VL0->operand_values());
      int ScalarEltCost = TTI->getArithmeticInstrCost(
          S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
      int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK,
                                                Op2VK, Op1VP, Op2VP, Operands);
      return ReuseShuffleCost + VecCost - ScalarCost;
    }
    case Instruction::GetElementPtr: {
      TargetTransformInfo::OperandValueKind Op1VK =
          TargetTransformInfo::OK_AnyValue;
      TargetTransformInfo::OperandValueKind Op2VK =
          TargetTransformInfo::OK_UniformConstantValue;

      int ScalarEltCost =
          TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
      int VecCost =
          TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
      return ReuseShuffleCost + VecCost - ScalarCost;
    }
    case Instruction::Load: {
      // Cost of wide load - cost of scalar loads.
      unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
      int ScalarEltCost =
          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
      int VecLdCost =
          TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
      if (!E->ReorderIndices.empty()) {
        // TODO: Merge this shuffle with the ReuseShuffleCost.
        VecLdCost += TTI->getShuffleCost(
            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
      }
      return ReuseShuffleCost + VecLdCost - ScalarLdCost;
    }
    case Instruction::Store: {
      // We know that we can merge the stores. Calculate the cost.
      unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
      int ScalarEltCost =
          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
      int VecStCost =
          TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
      return ReuseShuffleCost + VecStCost - ScalarStCost;
    }
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

      // Calculate the cost of the scalar and vector calls.
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
        ScalarTys.push_back(CI->getArgOperand(op)->getType());

      FastMathFlags FMF;
      if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
        FMF = FPMO->getFastMathFlags();

      int ScalarEltCost =
          TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;

      SmallVector<Value *, 4> Args(CI->arg_operands());
      int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args,
                                                   FMF, VecTy->getNumElements());

      LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
                        << " (" << VecCallCost << "-" << ScalarCallCost << ")"
                        << " for " << *CI << "\n");

      return ReuseShuffleCost + VecCallCost - ScalarCallCost;
    }
    case Instruction::ShuffleVector: {
      assert(S.isAltShuffle() &&
             ((Instruction::isBinaryOp(S.getOpcode()) &&
               Instruction::isBinaryOp(S.getAltOpcode())) ||
              (Instruction::isCast(S.getOpcode()) &&
               Instruction::isCast(S.getAltOpcode()))) &&
             "Invalid Shuffle Vector Operand");
      int ScalarCost = 0;
      if (NeedToShuffleReuses) {
        for (unsigned Idx : E->ReuseShuffleIndices) {
          Instruction *I = cast<Instruction>(VL[Idx]);
          ReuseShuffleCost -= TTI->getInstructionCost(
              I, TargetTransformInfo::TCK_RecipThroughput);
        }
        for (Value *V : VL) {
          Instruction *I = cast<Instruction>(V);
          ReuseShuffleCost += TTI->getInstructionCost(
              I, TargetTransformInfo::TCK_RecipThroughput);
        }
      }
      for (Value *i : VL) {
        Instruction *I = cast<Instruction>(i);
        assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
        ScalarCost += TTI->getInstructionCost(
            I, TargetTransformInfo::TCK_RecipThroughput);
      }
      // VecCost is equal to the sum of the cost of creating 2 vectors
      // and the cost of creating the shuffle.
      int VecCost = 0;
      if (Instruction::isBinaryOp(S.getOpcode())) {
        VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy);
        VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy);
      } else {
        Type *Src0SclTy = S.MainOp->getOperand(0)->getType();
        Type *Src1SclTy = S.AltOp->getOperand(0)->getType();
        VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
        VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
        VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty);
        VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty);
      }
      VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
      return ReuseShuffleCost + VecCost - ScalarCost;
    }
    default:
      llvm_unreachable("Unknown instruction");
  }
}
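// For example, a 4-wide add/sub bundle is costed above as one vector add plus
// one vector sub plus one SK_Select shuffle, minus the cost of the four
// scalar instructions it replaces; getTreeCost() later sums these per-entry
// deltas with the extract and spill costs before comparing against
// -SLPCostThreshold.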
bool BoUpSLP::isFullyVectorizableTinyTree() {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores.
  if (!VectorizableTree[0].NeedToGather &&
      (allConstant(VectorizableTree[1].Scalars) ||
       isSplat(VectorizableTree[1].Scalars)))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
    return false;

  return true;
}

bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can still vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree())
    return false;

  assert(VectorizableTree.empty()
             ? ExternalUses.empty()
             : true && "We shouldn't have any external users");

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  // vectorizable.
  return true;
}
int BoUpSLP::getSpillCost() {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree, query
  // TTI to see if there is a cost to keeping values live over it (for
  // example, if spills and fills are required).
  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
  int Cost = 0;

  SmallPtrSet<Instruction *, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  for (const auto &N : VectorizableTree) {
    Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
    if (!Inst)
      continue;

    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    // Update LiveValues.
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }

    LLVM_DEBUG({
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
    });

    // Now find the sequence of instructions between PrevInst and Inst.
    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
                                 PrevInstIt =
                                     PrevInst->getIterator().getReverse();
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }

      // Debug information doesn't impact spill cost.
      if ((isa<CallInst>(&*PrevInstIt) &&
           !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
          &*PrevInstIt != PrevInst) {
        SmallVector<Type *, 4> V;
        for (auto *II : LiveValues)
          V.push_back(VectorType::get(II->getType(), BundleWidth));
        Cost += TTI->getCostOfKeepingLiveOverCall(V);
      }

      ++PrevInstIt;
    }

    PrevInst = Inst;
  }

  return Cost;
}
int BoUpSLP::getTreeCost() {
  int Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  unsigned BundleWidth = VectorizableTree[0].Scalars.size();

  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = VectorizableTree[I];

    // We create duplicate tree entries for gather sequences that have
    // multiple uses. However, we should not compute the cost of duplicate
    // sequences more than once.
    if (TE.NeedToGather &&
        std::any_of(std::next(VectorizableTree.begin(), I + 1),
                    VectorizableTree.end(), [TE](TreeEntry &Entry) {
                      return Entry.NeedToGather && Entry.isSame(TE.Scalars);
                    }))
      continue;

    int C = getEntryCost(&TE);
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                      << " for bundle that starts with " << *TE.Scalars[0]
                      << ".\n");
    Cost += C;
  }

  SmallPtrSet<Value *, 16> ExtractCostCalculated;
  int ExtractCost = 0;
  for (ExternalUser &EU : ExternalUses) {
    // We only add extract cost once for the same scalar.
    if (!ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // Uses by ephemeral values are free (because the ephemeral value will be
    // removed prior to code generation, and so the extraction will be
    // removed as well).
    if (EphValues.count(EU.User))
      continue;

    // If we plan to rewrite the tree in a smaller type, we will need to sign
    // extend the extracted value back to the original type. Here, we account
    // for the extract and the added cost of the sign extend if needed.
    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
    if (MinBWs.count(ScalarRoot)) {
      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
      auto Extend =
          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
      VecTy = VectorType::get(MinTy, BundleWidth);
      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
                                                   VecTy, EU.Lane);
    } else {
      ExtractCost +=
          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
    }
  }

  int SpillCost = getSpillCost();
  Cost += SpillCost + ExtractCost;

  std::string Str;
  {
    raw_string_ostream OS(Str);
    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
       << "SLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);

  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);

  return Cost;
}
int BoUpSLP::getGatherCost(Type *Ty,
                           const DenseSet<unsigned> &ShuffledIndices) {
  int Cost = 0;
  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
    if (!ShuffledIndices.count(i))
      Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
  if (!ShuffledIndices.empty())
    Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
  return Cost;
}

int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
  // Find the type of the operands in VL.
  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  DenseSet<unsigned> ShuffledElements;
  DenseSet<Value *> UniqueElements;
  // Iterate in reverse order to consider insert elements with the high cost.
  for (unsigned I = VL.size(); I > 0; --I) {
    unsigned Idx = I - 1;
    if (!UniqueElements.insert(VL[Idx]).second)
      ShuffledElements.insert(Idx);
  }
  return getGatherCost(VecTy, ShuffledElements);
}
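// For example, gathering {%a, %b, %a, %c} marks the earlier duplicate of %a
// as a shuffled index, so the cost is three insertelements (for %b, the later
// %a, and %c) plus one SK_PermuteSingleSrc shuffle, rather than four inserts.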
// Reorder commutative operations in alternate shuffle if the resulting
// vectors are consecutive loads. This would allow us to vectorize the tree.
// If we have something like:
//   load a[0] - load b[0]
//   load b[1] + load a[1]
//   load a[2] - load b[2]
//   load a[3] + load b[3]
// then reordering the second lane (load b[1], load a[1]) allows this code to
// be vectorized.
void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S,
                                        ArrayRef<Value *> VL,
                                        SmallVectorImpl<Value *> &Left,
                                        SmallVectorImpl<Value *> &Right) {
  // Push left and right operands of binary operation into Left and Right.
  for (Value *V : VL) {
    auto *I = cast<Instruction>(V);
    assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector");
    Left.push_back(I->getOperand(0));
    Right.push_back(I->getOperand(1));
  }

  // Reorder if we have a commutative operation and consecutive access
  // are on either side of the alternate instructions.
  for (unsigned j = 0; j < VL.size() - 1; ++j) {
    if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
      if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
        Instruction *VL1 = cast<Instruction>(VL[j]);
        Instruction *VL2 = cast<Instruction>(VL[j + 1]);
        if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
          std::swap(Left[j], Right[j]);
          continue;
        } else if (VL2->isCommutative() &&
                   isConsecutiveAccess(L, L1, *DL, *SE)) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
      }
    }
    if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
      if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
        Instruction *VL1 = cast<Instruction>(VL[j]);
        Instruction *VL2 = cast<Instruction>(VL[j + 1]);
        if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
          std::swap(Left[j], Right[j]);
          continue;
        } else if (VL2->isCommutative() &&
                   isConsecutiveAccess(L, L1, *DL, *SE)) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
      }
    }
  }
}

// Return true if I should be commuted before adding its left and right
// operands to the arrays Left and Right.
static bool shouldReorderOperands(
    int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
    ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
    bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
  VLeft = I.getOperand(0);
  VRight = I.getOperand(1);
  // If we have "SplatRight", try to see if commuting is needed to preserve
  // it.
  if (SplatRight) {
    if (VRight == Right[i - 1])
      // Preserve SplatRight.
      return false;
    if (VLeft == Right[i - 1]) {
      // Commuting would preserve SplatRight, but we don't want to break
      // SplatLeft either, i.e. preserve the original order if possible.
      if (SplatLeft && VLeft == Left[i - 1])
        return false;
      return true;
    }
  }
  // Symmetrically handle the left side.
  if (SplatLeft) {
    if (VLeft == Left[i - 1])
      // Preserve SplatLeft.
      return false;
    if (VRight == Left[i - 1])
      return true;
  }

  Instruction *ILeft = dyn_cast<Instruction>(VLeft);
  Instruction *IRight = dyn_cast<Instruction>(VRight);

  // If we have "AllSameOpcodeRight", try to see if the left operand preserves
  // it and the right one does not; in that case we want to commute.
  if (AllSameOpcodeRight) {
    unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
    if (IRight && RightPrevOpcode == IRight->getOpcode())
      // Do not commute, a match on the right preserves AllSameOpcodeRight.
      return false;
    if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
      // We have a match and may want to commute, but first check if there is
      // not also a match on the existing operands on the left, to preserve
      // AllSameOpcodeLeft, i.e. keep the original order if possible.
      if (AllSameOpcodeLeft && ILeft &&
          cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
        return false;
      return true;
    }
  }
  // Symmetrically handle the left side.
  if (AllSameOpcodeLeft) {
    unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
    if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
      return false;
    if (IRight && LeftPrevOpcode == IRight->getOpcode())
      return true;
  }
  return false;
}

void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
                                             ArrayRef<Value *> VL,
                                             SmallVectorImpl<Value *> &Left,
                                             SmallVectorImpl<Value *> &Right) {
  if (!VL.empty()) {
    // Peel the first iteration out of the loop since there's nothing
    // interesting to do anyway and it simplifies the checks in the loop.
    auto *I = cast<Instruction>(VL[0]);
    Value *VLeft = I->getOperand(0);
    Value *VRight = I->getOperand(1);
    if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
      // Favor having an instruction to the right.
      std::swap(VLeft, VRight);
    Left.push_back(VLeft);
    Right.push_back(VRight);
  }

  // Keep track if we changed the operation anywhere.
  bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
  bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
  // Keep track if one side has all the same value (broadcast).
  bool SplatLeft = true;
  bool SplatRight = true;

  for (unsigned i = 1, e = VL.size(); i != e; ++i) {
    Instruction *I = cast<Instruction>(VL[i]);
    assert(((I->getOpcode() == Opcode && I->isCommutative()) ||
            (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) &&
           "Can only process commutative instruction");
    // Commute to favor either a splat or maximizing having the same opcodes
    // on one side.
    Value *VLeft;
    Value *VRight;
    if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft,
                              AllSameOpcodeRight, SplatLeft, SplatRight, VLeft,
                              VRight)) {
      Left.push_back(VRight);
      Right.push_back(VLeft);
    } else {
      Left.push_back(VLeft);
      Right.push_back(VRight);
    }
    // Update Splat* and AllSameOpcode* after the insertion.
    SplatRight = SplatRight && (Right[i - 1] == Right[i]);
    SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
    AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
                        (cast<Instruction>(Left[i - 1])->getOpcode() ==
                         cast<Instruction>(Left[i])->getOpcode());
    AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
                         (cast<Instruction>(Right[i - 1])->getOpcode() ==
                          cast<Instruction>(Right[i])->getOpcode());
  }

  // If one operand ends up being a broadcast, return this operand order.
  if (SplatRight || SplatLeft)
    return;

  // Finally check if we can get a longer vectorizable chain by reordering
  // without breaking the good operand order detected above.
  for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) {
    if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
      if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
        if (isConsecutiveAccess(L, L1, *DL, *SE)) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
      }
    }
    if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
      if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
        if (isConsecutiveAccess(L, L1, *DL, *SE)) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
      }
    }
  }
}
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL,
                                        const InstructionsState &S) {
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block.
  auto *Front = cast<Instruction>(S.OpValue);
  auto *BB = Front->getParent();
  assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
    auto *I = cast<Instruction>(V);
    return !S.isOpcodeOrAlt(I) || I->getParent() == BB;
  }));

  // The last instruction in the bundle in program order.
  Instruction *LastInst = nullptr;

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (BlocksSchedules.count(BB)) {
    auto *Bundle =
        BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back()));
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        if (Bundle->OpValue == Bundle->Inst)
          LastInst = Bundle->Inst;
  }

  // LastInst can still be null at this point if there's either not an entry
  // for BB in BlocksSchedules or there's no ScheduleData available for
  // VL.back(). In that case fall back to a linear scan of the block.
  if (!LastInst) {
    SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
    for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
      if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I))
        LastInst = &I;
      if (Bundle.empty())
        break;
    }
  }

  // Set the insertion point after the last instruction in the bundle. Set the
  // debug location to Front.
  Builder.SetInsertPoint(BB, ++LastInst->getIterator());
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}

Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
  Value *Vec = UndefValue::get(Ty);
  // Generate the 'InsertElement' instructions.
  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
    if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
      GatherSeq.insert(Insrt);
      CSEBlocks.insert(Insrt->getParent());

      // Add to our 'need-to-extract' list.
      if (TreeEntry *E = getTreeEntry(VL[i])) {
        // Find which lane we need to extract.
        int FoundLane = -1;
        for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
          // Is this the lane of the scalar that we are looking for ?
          if (E->Scalars[Lane] == VL[i]) {
            FoundLane = Lane;
            break;
          }
        }
        assert(FoundLane >= 0 && "Could not find the correct lane");
        if (!E->ReuseShuffleIndices.empty()) {
          FoundLane =
              std::distance(E->ReuseShuffleIndices.begin(),
                            llvm::find(E->ReuseShuffleIndices, FoundLane));
        }
        ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
      }
    }
  }

  return Vec;
}
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
  InstructionsState S = getSameOpcode(VL);
  if (S.getOpcode()) {
    if (TreeEntry *E = getTreeEntry(S.OpValue)) {
      if (E->isSame(VL)) {
        Value *V = vectorizeTree(E);
        if (VL.size() == E->Scalars.size() &&
            !E->ReuseShuffleIndices.empty()) {
          // We need to get the vectorized value but without shuffle.
          if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
            V = SV->getOperand(0);
          } else {
            // Reshuffle to get only unique values.
            SmallVector<unsigned, 4> UniqueIdxs;
            SmallSet<unsigned, 4> UsedIdxs;
            for (unsigned Idx : E->ReuseShuffleIndices)
              if (UsedIdxs.insert(Idx).second)
                UniqueIdxs.emplace_back(Idx);
            V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
                                            UniqueIdxs);
          }
        }
        return V;
      }
    }
  }

  Type *ScalarTy = S.OpValue->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
    ScalarTy = SI->getValueOperand()->getType();

  // Check that every instruction appears once in this bundle.
  SmallVector<unsigned, 4> ReuseShuffleIndicies;
  SmallVector<Value *, 4> UniqueValues;
  if (VL.size() > 2) {
    DenseMap<Value *, unsigned> UniquePositions;
    for (Value *V : VL) {
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      ReuseShuffleIndicies.emplace_back(Res.first->second);
      if (Res.second || isa<Constant>(V))
        UniqueValues.emplace_back(V);
    }
    // Do not shuffle a single element or if the number of unique values is
    // not a power of 2.
    if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
        !llvm::isPowerOf2_32(UniqueValues.size()))
      ReuseShuffleIndicies.clear();
    else
      VL = UniqueValues;
  }
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  Value *V = Gather(VL, VecTy);
  if (!ReuseShuffleIndicies.empty()) {
    V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                    ReuseShuffleIndicies, "shuffle");
    if (auto *I = dyn_cast<Instruction>(V)) {
      GatherSeq.insert(I);
      CSEBlocks.insert(I->getParent());
    }
  }
  return V;
}

static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<unsigned> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
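// For example, Indices = {2, 0, 1} (lane I of the vector holds scalar
// Indices[I]) yields Mask = {1, 2, 0}, i.e. the shuffle mask that restores
// the original scalar order.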
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilder<>::InsertPointGuard Guard(Builder);

  if (E->VectorizedValue) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  InstructionsState S = getSameOpcode(E->Scalars);
  Instruction *VL0 = cast<Instruction>(S.OpValue);
  Type *ScalarTy = VL0->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());

  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();

  if (E->NeedToGather) {
    setInsertPointAfterBundle(E->Scalars, S);
    auto *V = Gather(E->Scalars, VecTy);
    if (NeedToShuffleReuses) {
      V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                      E->ReuseShuffleIndices, "shuffle");
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
    E->VectorizedValue = V;
    return V;
  }

  unsigned ShuffleOrOp = S.isAltShuffle() ?
           (unsigned) Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
    case Instruction::PHI: {
      PHINode *PH = dyn_cast<PHINode>(VL0);
      Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
      Value *V = NewPhi;
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;

      // PHINodes may have multiple entries from the same block. We want to
      // visit every block once.
      SmallPtrSet<BasicBlock *, 4> VisitedBBs;

      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        ValueList Operands;
        BasicBlock *IBB = PH->getIncomingBlock(i);

        if (!VisitedBBs.insert(IBB).second) {
          NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
          continue;
        }

        // Prepare the operand vector.
        for (Value *V : E->Scalars)
          Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));

        Builder.SetInsertPoint(IBB->getTerminator());
        Builder.SetCurrentDebugLocation(PH->getDebugLoc());
        Value *Vec = vectorizeTree(Operands);
        NewPhi->addIncoming(Vec, IBB);
      }

      assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
             "Invalid number of incoming values");
      return V;
    }

    case Instruction::ExtractElement: {
      if (!E->NeedToGather) {
        Value *V = VL0->getOperand(0);
        if (!E->ReorderIndices.empty()) {
          OrdersType Mask;
          inversePermutation(E->ReorderIndices, Mask);
          Builder.SetInsertPoint(VL0);
          V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
                                          "reorder_shuffle");
        }
        if (NeedToShuffleReuses) {
          // TODO: Merge this shuffle with the ReorderShuffleMask.
          if (E->ReorderIndices.empty())
            Builder.SetInsertPoint(VL0);
          V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                          E->ReuseShuffleIndices, "shuffle");
        }
        E->VectorizedValue = V;
        return V;
      }
      setInsertPointAfterBundle(E->Scalars, S);
      auto *V = Gather(E->Scalars, VecTy);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
      E->VectorizedValue = V;
      return V;
    }
    case Instruction::ExtractValue: {
      if (!E->NeedToGather) {
        LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
        Builder.SetInsertPoint(LI);
        PointerType *PtrTy =
            PointerType::get(VecTy, LI->getPointerAddressSpace());
        Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
        LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
        Value *NewV = propagateMetadata(V, E->Scalars);
        if (!E->ReorderIndices.empty()) {
          OrdersType Mask;
          inversePermutation(E->ReorderIndices, Mask);
          NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
                                             "reorder_shuffle");
        }
        if (NeedToShuffleReuses) {
          // TODO: Merge this shuffle with the ReorderShuffleMask.
          NewV = Builder.CreateShuffleVector(
              NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
        }
        E->VectorizedValue = NewV;
        return NewV;
      }
      setInsertPointAfterBundle(E->Scalars, S);
      auto *V = Gather(E->Scalars, VecTy);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
      E->VectorizedValue = V;
      return V;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      ValueList INVL;
      for (Value *V : E->Scalars)
        INVL.push_back(cast<Instruction>(V)->getOperand(0));

      setInsertPointAfterBundle(E->Scalars, S);

      Value *InVec = vectorizeTree(INVL);

      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      CastInst *CI = dyn_cast<CastInst>(VL0);
      Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::FCmp:
    case Instruction::ICmp: {
      ValueList LHSV, RHSV;
      for (Value *V : E->Scalars) {
        LHSV.push_back(cast<Instruction>(V)->getOperand(0));
        RHSV.push_back(cast<Instruction>(V)->getOperand(1));
      }

      setInsertPointAfterBundle(E->Scalars, S);

      Value *L = vectorizeTree(LHSV);
      Value *R = vectorizeTree(RHSV);

      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      Value *V;
      if (S.getOpcode() == Instruction::FCmp)
        V = Builder.CreateFCmp(P0, L, R);
      else
        V = Builder.CreateICmp(P0, L, R);

      propagateIRFlags(V, E->Scalars, VL0);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::Select: {
      ValueList TrueVec, FalseVec, CondVec;
      for (Value *V : E->Scalars) {
        CondVec.push_back(cast<Instruction>(V)->getOperand(0));
        TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
        FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
      }

      setInsertPointAfterBundle(E->Scalars, S);

      Value *Cond = vectorizeTree(CondVec);
      Value *True = vectorizeTree(TrueVec);
      Value *False = vectorizeTree(FalseVec);

      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      Value *V = Builder.CreateSelect(Cond, True, False);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      ValueList LHSVL, RHSVL;
      if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
        reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL,
                                       RHSVL);
      else
        for (Value *V : E->Scalars) {
          auto *I = cast<Instruction>(V);
          LHSVL.push_back(I->getOperand(0));
          RHSVL.push_back(I->getOperand(1));
        }

      setInsertPointAfterBundle(E->Scalars, S);

      Value *LHS = vectorizeTree(LHSVL);
      Value *RHS = vectorizeTree(RHSVL);

      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      Value *V = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = propagateMetadata(I, E->Scalars);

      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
    case Instruction::Load: {
      // Loads are inserted at the head of the tree because we don't want to
      // sink them all the way down past store instructions.
      bool IsReorder = !E->ReorderIndices.empty();
      if (IsReorder) {
        S = getSameOpcode(E->Scalars, E->ReorderIndices.front());
        VL0 = cast<Instruction>(S.OpValue);
      }
      setInsertPointAfterBundle(E->Scalars, S);

      LoadInst *LI = cast<LoadInst>(VL0);
      Type *ScalarLoadTy = LI->getType();
      unsigned AS = LI->getPointerAddressSpace();

      Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
                                            VecTy->getPointerTo(AS));

      // The pointer operand uses an in-tree scalar, so add the new BitCast to
      // ExternalUses to make sure that an extract will be generated in the
      // future.
      Value *PO = LI->getPointerOperand();
      if (getTreeEntry(PO))
        ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));

      unsigned Alignment = LI->getAlignment();
      LI = Builder.CreateLoad(VecPtr);
      if (!Alignment) {
        Alignment = DL->getABITypeAlignment(ScalarLoadTy);
      }
      LI->setAlignment(Alignment);
      Value *V = propagateMetadata(LI, E->Scalars);
      if (IsReorder) {
        OrdersType Mask;
        inversePermutation(E->ReorderIndices, Mask);
        V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
                                        Mask, "reorder_shuffle");
      }
      if (NeedToShuffleReuses) {
        // TODO: Merge this shuffle with the ReorderShuffleMask.
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::Store: {
      StoreInst *SI = cast<StoreInst>(VL0);
      unsigned Alignment = SI->getAlignment();
      unsigned AS = SI->getPointerAddressSpace();

      ValueList ScalarStoreValues;
      for (Value *V : E->Scalars)
        ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand());

      setInsertPointAfterBundle(E->Scalars, S);

      Value *VecValue = vectorizeTree(ScalarStoreValues);
      Value *ScalarPtr = SI->getPointerOperand();
      Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
      StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);

      // The pointer operand uses an in-tree scalar, so add the new BitCast to
      // ExternalUses to make sure that an extract will be generated in the
      // future.
      if (getTreeEntry(ScalarPtr))
        ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));

      if (!Alignment)
        Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());

      ST->setAlignment(Alignment);
      Value *V = propagateMetadata(ST, E->Scalars);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::GetElementPtr: {
      setInsertPointAfterBundle(E->Scalars, S);

      ValueList Op0VL;
      for (Value *V : E->Scalars)
        Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));

      Value *Op0 = vectorizeTree(Op0VL);

      std::vector<Value *> OpVecs;
      for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
           ++j) {
        ValueList OpVL;
        for (Value *V : E->Scalars)
          OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));

        Value *OpVec = vectorizeTree(OpVL);
        OpVecs.push_back(OpVec);
      }

      Value *V = Builder.CreateGEP(
          cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
      if (Instruction *I = dyn_cast<Instruction>(V))
        V = propagateMetadata(I, E->Scalars);

      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      setInsertPointAfterBundle(E->Scalars, S);

      Intrinsic::ID IID = Intrinsic::not_intrinsic;
      if (Function *FI = CI->getCalledFunction())
        IID = FI->getIntrinsicID();

      Value *ScalarArg = nullptr;
      std::vector<Value *> OpVecs;
      for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
        ValueList OpVL;
        // Some intrinsics have a scalar argument. Such an argument must not
        // be vectorized.
        if (hasVectorInstrinsicScalarOpd(IID, j)) {
          CallInst *CEI = cast<CallInst>(VL0);
          ScalarArg = CEI->getArgOperand(j);
          OpVecs.push_back(CEI->getArgOperand(j));
          continue;
        }

        for (Value *V : E->Scalars) {
          CallInst *CEI = cast<CallInst>(V);
          OpVL.push_back(CEI->getArgOperand(j));
        }

        Value *OpVec = vectorizeTree(OpVL);
        LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
        OpVecs.push_back(OpVec);
      }

      Module *M = F->getParent();
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      Type *Tys[] = {VectorType::get(CI->getType(), E->Scalars.size())};
      Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
      SmallVector<OperandBundleDef, 1> OpBundles;
      CI->getOperandBundlesAsDefs(OpBundles);
      Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

      // The scalar argument uses an in-tree scalar, so add the new vectorized
      // call to ExternalUses to make sure that an extract will be generated
      // in the future.
      if (ScalarArg && getTreeEntry(ScalarArg))
        ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));

      propagateIRFlags(V, E->Scalars, VL0);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::ShuffleVector: {
      ValueList LHSVL, RHSVL;
      assert(S.isAltShuffle() &&
             ((Instruction::isBinaryOp(S.getOpcode()) &&
               Instruction::isBinaryOp(S.getAltOpcode())) ||
              (Instruction::isCast(S.getOpcode()) &&
               Instruction::isCast(S.getAltOpcode()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(S.getOpcode())) {
        reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL);
        setInsertPointAfterBundle(E->Scalars, S);
        LHS = vectorizeTree(LHSVL);
        RHS = vectorizeTree(RHSVL);
      } else {
        ValueList INVL;
        for (Value *V : E->Scalars)
          INVL.push_back(cast<Instruction>(V)->getOperand(0));
        setInsertPointAfterBundle(E->Scalars, S);
        LHS = vectorizeTree(INVL);
      }

      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(S.getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS);
      } else {
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy);
      }

      // Create a shuffle to take the alternate operations from the vector.
      // Also, gather up the main and alt scalar ops to propagate IR flags to
      // each vector operation.
      ValueList OpScalars, AltScalars;
      unsigned e = E->Scalars.size();
      SmallVector<Constant *, 8> Mask(e);
      for (unsigned i = 0; i < e; ++i) {
        auto *OpInst = cast<Instruction>(E->Scalars[i]);
        assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
        if (OpInst->getOpcode() == S.getAltOpcode()) {
          Mask[i] = Builder.getInt32(e + i);
          AltScalars.push_back(E->Scalars[i]);
        } else {
          Mask[i] = Builder.getInt32(i);
          OpScalars.push_back(E->Scalars[i]);
        }
      }

      Value *ShuffleMask = ConstantVector::get(Mask);
      propagateIRFlags(V0, OpScalars);
      propagateIRFlags(V1, AltScalars);

      Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
      if (Instruction *I = dyn_cast<Instruction>(V))
        V = propagateMetadata(I, E->Scalars);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
    default:
      llvm_unreachable("unknown inst");
  }
  return nullptr;
}
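// In the alternate-opcode path above, a 4-wide add/sub bundle whose lanes 1
// and 3 use the alternate opcode gets the mask <0, 5, 2, 7>: lanes 0 and 2
// come from the vector main op (V0), lanes 1 and 3 from the vector alternate
// op (V1, offset by the bundle width e = 4).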
Value *BoUpSLP::vectorizeTree() {
  ExtraValueToDebugLocsMap ExternallyUsedValues;
  return vectorizeTree(ExternallyUsedValues);
}

Value *
BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }

  Builder.SetInsertPoint(&F->getEntryBlock().front());
  auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);

  // If the vectorized tree can be rewritten in a smaller type, we truncate
  // the vectorized root. InstCombine will then rewrite the entire expression.
  // We sign-extend the extracted values below.
  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
  if (MinBWs.count(ScalarRoot)) {
    if (auto *I = dyn_cast<Instruction>(VectorRoot))
      Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
    auto BundleWidth = VectorizableTree[0].Scalars.size();
    auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
    auto *VecTy = VectorType::get(MinTy, BundleWidth);
    auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
    VectorizableTree[0].VectorizedValue = Trunc;
  }

  LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
                    << " values .\n");

  // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
  // specified by ScalarType.
  auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
    if (!MinBWs.count(ScalarRoot))
      return Ex;
    if (MinBWs[ScalarRoot].second)
      return Builder.CreateSExt(Ex, ScalarType);
    return Builder.CreateZExt(Ex, ScalarType);
  };

  // Extract all of the elements with the external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already RAUW. This happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(!E->NeedToGather && "Extracting from a gather list");

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    // If User == nullptr, the Scalar is used as an extra arg. Generate an
    // ExtractElement instruction and update the record for this scalar in
    // ExternallyUsedValues.
    if (!User) {
      assert(ExternallyUsedValues.count(Scalar) &&
             "Scalar with nullptr as an external user must be registered in "
             "ExternallyUsedValues map");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        Builder.SetInsertPoint(VecI->getParent(),
                               std::next(VecI->getIterator()));
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock().front());
      }
      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
      Ex = extend(ScalarRoot, Ex, Scalar->getType());
      CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
      auto &Locs = ExternallyUsedValues[Scalar];
      ExternallyUsedValues.insert({Ex, Locs});
      ExternallyUsedValues.erase(Scalar);
      continue;
    }

    // Generate extracts for out-of-tree users.
    // Find the insertion point for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
          if (PH->getIncomingValue(i) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(i)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
            }
            Value *Ex = Builder.CreateExtractElement(Vec, Lane);
            Ex = extend(ScalarRoot, Ex, Scalar->getType());
            CSEBlocks.insert(PH->getIncomingBlock(i));
            PH->setOperand(i, Ex);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *Ex = Builder.CreateExtractElement(Vec, Lane);
        Ex = extend(ScalarRoot, Ex, Scalar->getType());
        CSEBlocks.insert(cast<Instruction>(User)->getParent());
        User->replaceUsesOfWith(Scalar, Ex);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock().front());
      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
      Ex = extend(ScalarRoot, Ex, Scalar->getType());
      CSEBlocks.insert(&F->getEntryBlock());
      User->replaceUsesOfWith(Scalar, Ex);
    }

    LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  }

  // For each vectorized value:
  for (TreeEntry &EIdx : VectorizableTree) {
    TreeEntry *Entry = &EIdx;

    // No need to handle users of gathered values.
    if (Entry->NeedToGather)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          // It is legal to replace users in the ignore list by undef.
          assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
                 "Replacing out-of-tree value with undef");
        }
        Value *Undef = UndefValue::get(Ty);
        Scalar->replaceAllUsesWith(Undef);
      }
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      eraseInstruction(cast<Instruction>(Scalar));
    }
  }

  Builder.ClearInsertionPoint();

  return VectorizableTree[0].VectorizedValue;
}
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherSeq) {
    if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are instructions
    // that are defined in this basic block then we can't hoist this
    // instruction.
    auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
    auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
    if (Op0 && L->contains(Op0))
      continue;
    if (Op1 && L->contains(Op1))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator());
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
                   [this](const DomTreeNode *A, const DomTreeNode *B) {
                     return DT->properlyDominates(A, B);
                   });

  // Perform an O(N^2) search over the gather sequences and merge identical
  // instructions. TODO: We can further optimize this scan if we split the
  // instructions into different buckets based on the insert lane.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
      Instruction *In = &*it++;
      if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
        continue;

      // Check if we can replace this instruction with any of the visited
      // instructions.
      for (Instruction *v : Visited) {
        if (In->isIdenticalTo(v) &&
            DT->dominates(v->getParent(), In->getParent())) {
          In->replaceAllUsesWith(v);
          eraseInstruction(In);
          In = nullptr;
          break;
        }
      }
      if (In) {
        assert(!is_contained(Visited, In));
        Visited.push_back(In);
      }
    }
  }
  CSEBlocks.clear();
  GatherSeq.clear();
}
3815 const InstructionsState &S) {
3816 if (isa<PHINode>(S.OpValue))
3821 ScheduleData *PrevInBundle =
nullptr;
3822 ScheduleData *Bundle =
nullptr;
3823 bool ReSchedule =
false;
3828 for (
Value *V : VL) {
3829 if (!extendSchedulingRegion(V, S))
3833 for (
Value *V : VL) {
3834 ScheduleData *BundleMember = getScheduleData(V);
3836 "no ScheduleData for bundle member (maybe not in same basic block)");
3837 if (BundleMember->IsScheduled) {
3841 LLVM_DEBUG(
dbgs() <<
"SLP: reset schedule because " << *BundleMember
3842 <<
" was already scheduled\n");
3845 assert(BundleMember->isSchedulingEntity() &&
3846 "bundle member already part of other bundle");
3848 PrevInBundle->NextInBundle = BundleMember;
3850 Bundle = BundleMember;
3852 BundleMember->UnscheduledDepsInBundle = 0;
3853 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
3856 BundleMember->FirstInBundle = Bundle;
3857 PrevInBundle = BundleMember;
3859 if (ScheduleEnd != OldScheduleEnd) {
3865 for (
auto *I = ScheduleStart; I != ScheduleEnd; I = I->
getNextNode()) {
3866 doForAllOpcodes(I, [](ScheduleData *SD) {
3867 SD->clearDependencies();
3874 initialFillReadyList(ReadyInsts);
3877 LLVM_DEBUG(
dbgs() <<
"SLP: try schedule bundle " << *Bundle <<
" in block " 3878 << BB->getName() <<
"\n");
3880 calculateDependencies(Bundle,
true, SLP);
3886 while (!Bundle->isReady() && !ReadyInsts.empty()) {
3888 ScheduleData *pickedSD = ReadyInsts.back();
3889 ReadyInsts.pop_back();
3891 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
3892 schedule(pickedSD, ReadyInsts);
3895 if (!Bundle->isReady()) {
3896 cancelScheduling(VL, S.OpValue);
3904 if (isa<PHINode>(OpValue))
3907 ScheduleData *Bundle = getScheduleData(OpValue);
3908 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
3909 assert(!Bundle->IsScheduled &&
3910 "Can't cancel bundle which is already scheduled");
3911 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
3912 "tried to unbundle something which is not a bundle");
3915 ScheduleData *BundleMember = Bundle;
3916 while (BundleMember) {
3917 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
3918 BundleMember->FirstInBundle = BundleMember;
3919 ScheduleData *Next = BundleMember->NextInBundle;
3920 BundleMember->NextInBundle = nullptr;
3921 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
3922 if (BundleMember->UnscheduledDepsInBundle == 0) {
3923 ReadyInsts.insert(BundleMember);
3925 BundleMember = Next;
3929 BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
3931 if (ChunkPos >= ChunkSize) {
3935 return &(ScheduleDataChunks.back()[ChunkPos++]);
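// extendSchedulingRegion() grows the per-block scheduling window until it
// covers V. It scans upwards and downwards from the current region at the
// same time and aborts once ScheduleRegionSizeLimit is exceeded (the
// per-block budget that caps the size of the SLP scheduling region).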
3938 bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
3939 const InstructionsState &S) {
3940 if (getScheduleData(V, isOneOf(S, V)))
3943 assert(I && "bundle member must be an instruction");
3944 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
3945 auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
3946 ScheduleData *ISD = getScheduleData(I);
3949 assert(isInSchedulingRegion(ISD) &&
3950 "ScheduleData not in scheduling region");
3951 ScheduleData *SD = allocateScheduleDataChunks();
3953 SD->init(SchedulingRegionID, S.OpValue);
3954 ExtraScheduleDataMap[I][S.OpValue] = SD;
3957 if (CheckSheduleForI(I))
3959 if (!ScheduleStart) {
3961 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
3965 CheckSheduleForI(I);
3966 assert(ScheduleEnd && "tried to vectorize a terminator?");
3967 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
3978 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
3979 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
3983 if (UpIter != UpperEnd) {
3984 if (&*UpIter == I) {
3985 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
3988 CheckSheduleForI(I);
3989 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
3995 if (DownIter != LowerEnd) {
3996 if (&*DownIter == I) {
3997 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
4001 CheckSheduleForI(I);
4002 assert(ScheduleEnd && "tried to vectorize a terminator?");
4009 assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
4010 "instruction not found in block");
4015 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
4017 ScheduleData *PrevLoadStore,
4018 ScheduleData *NextLoadStore) {
4019 ScheduleData *CurrentLoadStore = PrevLoadStore;
4021 ScheduleData *SD = ScheduleDataMap[I];
4023 SD = allocateScheduleDataChunks();
4024 ScheduleDataMap[I] = SD;
4027 assert(!isInSchedulingRegion(SD) &&
4028 "new ScheduleData already in scheduling region");
4029 SD->init(SchedulingRegionID, I);
4032 (!isa<IntrinsicInst>(I) ||
4035 if (CurrentLoadStore) {
4036 CurrentLoadStore->NextLoadStore = SD;
4038 FirstLoadStoreInRegion = SD;
4040 CurrentLoadStore = SD;
4043 if (NextLoadStore) {
4044 if (CurrentLoadStore)
4045 CurrentLoadStore->NextLoadStore = NextLoadStore;
4047 LastLoadStoreInRegion = CurrentLoadStore;
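// calculateDependencies() records two kinds of edges for each bundle member:
// ordinary def-use dependencies (every in-region user of Inst) and memory
// dependencies found by walking the NextLoadStore chain built in
// initScheduleData(). The memory walk is deliberately conservative: after
// AliasedCheckLimit alias queries, or past MaxMemDepDistance instructions,
// it simply assumes a dependency instead of querying alias analysis again.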
4051 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
4052 bool InsertInReadyList,
4054 assert(SD->isSchedulingEntity());
4059 while (!WorkList.empty()) {
4060 ScheduleData *SD = WorkList.back();
4061 WorkList.pop_back();
4063 ScheduleData *BundleMember = SD;
4064 while (BundleMember) {
4065 assert(isInSchedulingRegion(BundleMember));
4066 if (!BundleMember->hasValidDependencies()) {
4070 BundleMember->Dependencies = 0;
4071 BundleMember->resetUnscheduledDeps();
4074 if (BundleMember->OpValue != BundleMember->Inst) {
4075 ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
4076 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
4077 BundleMember->Dependencies++;
4078 ScheduleData *DestBundle = UseSD->FirstInBundle;
4079 if (!DestBundle->IsScheduled)
4080 BundleMember->incrementUnscheduledDeps(1);
4081 if (!DestBundle->hasValidDependencies())
4082 WorkList.push_back(DestBundle);
4085 for (User *U : BundleMember->Inst->users()) {
4086 if (isa<Instruction>(U)) {
4087 ScheduleData *UseSD = getScheduleData(U);
4088 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
4089 BundleMember->Dependencies++;
4090 ScheduleData *DestBundle = UseSD->FirstInBundle;
4091 if (!DestBundle->IsScheduled)
4092 BundleMember->incrementUnscheduledDeps(1);
4093 if (!DestBundle->hasValidDependencies())
4094 WorkList.push_back(DestBundle);
4100 BundleMember->Dependencies++;
4101 BundleMember->incrementUnscheduledDeps(1);
4107 ScheduleData *DepDest = BundleMember->NextLoadStore;
4111 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
4112 unsigned numAliased = 0;
4113 unsigned DistToSrc = 1;
4116 assert(isInSchedulingRegion(DepDest));
4125 if (DistToSrc >= MaxMemDepDistance ||
4126 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
4127 (numAliased >= AliasedCheckLimit ||
4128 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
4135 DepDest->MemoryDependencies.push_back(BundleMember);
4136 BundleMember->Dependencies++;
4137 ScheduleData *DestBundle = DepDest->FirstInBundle;
4138 if (!DestBundle->IsScheduled) {
4139 BundleMember->incrementUnscheduledDeps(1);
4141 if (!DestBundle->hasValidDependencies()) {
4142 WorkList.push_back(DestBundle);
4145 DepDest = DepDest->NextLoadStore;
4160 if (DistToSrc >= 2 * MaxMemDepDistance)
4166 BundleMember = BundleMember->NextInBundle;
4168 if (InsertInReadyList && SD->isReady()) {
4169 ReadyInsts.push_back(SD);
4176 void BoUpSLP::BlockScheduling::resetSchedule() {
4178 "tried to reset schedule on block which has not been scheduled");
4180 doForAllOpcodes(I, [&](ScheduleData *SD) {
4181 assert(isInSchedulingRegion(SD) &&
4182 "ScheduleData not in scheduling region");
4183 SD->IsScheduled = false;
4184 SD->resetUnscheduledDeps();
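// scheduleBlock() runs the actual list scheduling for one block. Priorities
// follow original instruction order (SchedulingPriority is assigned from
// Idx++ below), ReadyInsts is a std::set ordered by that priority, and each
// picked bundle is moved member-by-member so that it ends up contiguous
// right after LastScheduledInst.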
4190 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
4191 if (!BS->ScheduleStart)
4194 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
4196 BS->resetSchedule();
4201 struct ScheduleDataCompare {
4202 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
4203 return SD2->SchedulingPriority < SD1->SchedulingPriority;
4206 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
4211 int NumToSchedule = 0;
4212 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
4214 BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
4215 assert(SD->isPartOfBundle() ==
4216 (getTreeEntry(SD->Inst) != nullptr) &&
4217 "scheduler and vectorizer bundle mismatch");
4218 SD->FirstInBundle->SchedulingPriority = Idx++;
4219 if (SD->isSchedulingEntity()) {
4220 BS->calculateDependencies(SD, false, this);
4225 BS->initialFillReadyList(ReadyInsts);
4230 while (!ReadyInsts.empty()) {
4231 ScheduleData *picked = *ReadyInsts.begin();
4232 ReadyInsts.erase(ReadyInsts.begin());
4236 ScheduleData *BundleMember = picked;
4237 while (BundleMember) {
4239 if (LastScheduledInst->getNextNode() != pickedInst) {
4240 BS->BB->getInstList().remove(pickedInst);
4241 BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
4244 LastScheduledInst = pickedInst;
4245 BundleMember = BundleMember->NextInBundle;
4248 BS->schedule(picked, ReadyInsts);
4251 assert(NumToSchedule == 0 && "could not schedule all instructions");
4254 BS->ScheduleStart = nullptr;
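// getVectorElementSize() derives the element width used to compute the
// vectorization factor: for a store it is simply the width of the stored
// value; otherwise it walks the use-def chains through loads, phis, casts,
// GEPs, compares, selects and binary operators looking for the widest load,
// and falls back to the width of V's own type if the walk meets an
// instruction it does not understand.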
4257 unsigned BoUpSLP::getVectorElementSize(Value *V) {
4260 if (auto *Store = dyn_cast<StoreInst>(V))
4261 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
4269 if (auto *I = dyn_cast<Instruction>(V))
4275 auto FoundUnknownInst = false;
4276 while (!Worklist.empty() && !FoundUnknownInst) {
4283 if (isa<VectorType>(Ty))
4284 FoundUnknownInst = true;
4288 else if (isa<LoadInst>(I))
4289 MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
4294 else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
4295 isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
4297 if (auto *J = dyn_cast<Instruction>(U.get()))
4298 if (!Visited.count(J))
4304 FoundUnknownInst = true;
4309 if (!MaxWidth || FoundUnknownInst)
4310 return DL->getTypeSizeInBits(V->getType());
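// The fragments below belong to collectValuesToDemote() and
// computeMinimumValueSizes(): the former whitelists scalars (constants,
// truncations/extensions, the listed bitwise and arithmetic ops, and phis)
// that can be recomputed in a narrower type, and the latter combines
// DemandedBits with the computed sign-bit count to pick the smallest
// MaxBitWidth that preserves the result, stored per scalar in MinBWs.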
4323 if (isa<Constant>(V)) {
4338 case Instruction::Trunc:
4341 case Instruction::ZExt:
4342 case Instruction::SExt:
4348 case Instruction::Sub:
4349 case Instruction::Mul:
4350 case Instruction::And:
4351 case Instruction::Or:
4352 case Instruction::Xor:
4369 case Instruction::PHI: {
4390 if (ExternalUses.empty())
4394 auto &TreeRoot = VectorizableTree[0].Scalars;
4406 for (auto &EU : ExternalUses)
4407 if (!Expr.erase(EU.Scalar))
4415 for (auto &Entry : VectorizableTree)
4416 Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
4420 for (auto *Root : TreeRoot)
4421 if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
4429 for (auto *Root : TreeRoot)
4436 auto MaxBitWidth = 8u;
4440 for (auto *Root : TreeRoot) {
4441 auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
4442 MaxBitWidth = std::max<unsigned>(
4443 Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
4449 bool IsKnownPositive = true;
4460 if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
4463 return isa<GetElementPtrInst>(R->user_back());
4476 for (auto *Scalar : ToDemote) {
4478 auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
4479 MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
4497 if (!IsKnownPositive)
4507 if (MaxBitWidth >= TreeRootIT->getBitWidth())
4513 while (!Roots.empty())
4517 for (auto *Scalar : ToDemote)
4518 MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
4534 bool doInitialization(Module &M) override {
4539 if (skipFunction(F))
4542 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
4543 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
4544 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
4545 auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
4546 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
4547 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
4548 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
4549 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
4550 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
4551 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
4553 return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
4587 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
4616 bool Changed = false;
4620 if (!TTI->getNumberOfRegisters(true))
4631 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
4638 collectSeedInstructions(BB);
4641 if (!Stores.empty()) {
4643 << " underlying objects.\n");
4644 Changed |= vectorizeStoreChains(R);
4648 Changed |= vectorizeChainsInBlock(BB, R);
4653 if (!GEPs.empty()) {
4655 << " underlying objects.\n");
4656 Changed |= vectorizeGEPIndices(BB, R);
4661 R.optimizeGatherSequence();
4675 unsigned SliceSize) {
4676 VL = VL.slice(SliceBegin, SliceSize);
4677 VH = VH.slice(SliceBegin, SliceSize);
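// The helper above shrinks VL and VH in lockstep. vectorizeStoreChain()
// below then probes every window of VF consecutive stores in the chain,
// with VF = VecRegSize / Sz. As a worked example: with 128-bit registers
// and 32-bit stored values, VF is 4, so a chain of 6 stores is tried at
// offsets 0, 1 and 2.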
4681 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
4682 unsigned VecRegSize) {
4683 const unsigned ChainLen = Chain.size();
4684 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
4686 const unsigned Sz = R.getVectorElementSize(Chain[0]);
4687 const unsigned VF = VecRegSize / Sz;
4695 bool Changed = false;
4697 for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
4703 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
4707 R.buildTree(Operands);
4708 if (R.isTreeTinyAndNotFullyVectorizable())
4711 R.computeMinimumValueSizes();
4713 int Cost = R.getTreeCost();
4715 LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF
4718 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
4720 using namespace ore;
4723 cast<StoreInst>(Chain[i]))
4724 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
4725 << " and with tree size "
4726 << NV("TreeSize", R.getTreeSize()));
4747 BoUpSLP::ValueSet VectorizedStores;
4748 bool Changed = false;
4753 unsigned E = Stores.size();
4754 IndexQueue.resize(E - 1);
4755 for (unsigned I = E; I > 0; --I) {
4756 unsigned Idx = I - 1;
4763 for (unsigned J = 0; J < E - 1; ++J, ++Offset) {
4764 if (Idx >= Offset) {
4765 IndexQueue[Cnt] = Idx - Offset;
4768 if (Idx + Offset < E) {
4769 IndexQueue[Cnt] = Idx + Offset;
4774 for (auto K : IndexQueue) {
4776 Tails.insert(Stores[Idx]);
4778 ConsecutiveChain[Stores[K]] = Stores[Idx];
4791 BoUpSLP::ValueList Operands;
4794 while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) {
4795 Operands.push_back(I);
4797 I = ConsecutiveChain[I];
4802 for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
4804 if (vectorizeStoreChain(Operands, R, Size)) {
4806 VectorizedStores.insert(Operands.begin(), Operands.end());
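// vectorizeStores() builds chains first: IndexQueue enumerates candidate
// partners nearest-first (Idx - Offset, then Idx + Offset), consecutive
// pairs are recorded in Heads/Tails/ConsecutiveChain, and each chain is then
// handed to vectorizeStoreChain() repeatedly with register sizes stepping
// down from getMaxVecRegSize() to getMinVecRegSize() until a window sticks.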
4816 void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
4827 if (auto *SI = dyn_cast<StoreInst>(&I)) {
4828 if (!SI->isSimple())
4838 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
4839 auto Idx = GEP->idx_begin()->get();
4840 if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
4844 if (GEP->getType()->isVectorTy())
4846 GEPs[GEP->getPointerOperand()].push_back(GEP);
4851 bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
4854 Value *VL[] = { A, B };
4855 return tryToVectorizeList(VL, R, 0, true);
4859 int UserCost, bool AllowReorder) {
4863 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
4864 << VL.size() << ".\n");
4873 unsigned Sz = R.getVectorElementSize(I0);
4874 unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
4877 R.getORE()->emit([&]() {
4879 << "Cannot SLP vectorize list: vectorization factor "
4880 << "less than 2 is not supported";
4885 for (Value *V : VL) {
4890 R.getORE()->emit([&]() {
4891 std::string type_str;
4895 << "Cannot SLP vectorize list: type "
4896 << rso.str() + " is unsupported by vectorizer";
4902 bool Changed = false;
4903 bool CandidateFound = false;
4909 unsigned NextInst = 0, MaxInst = VL.size();
4910 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
4916 if (TTI->getNumberOfParts(VecTy) == VF)
4918 for (unsigned I = NextInst; I < MaxInst; ++I) {
4919 unsigned OpsWidth = 0;
4921 if (I + VF > MaxInst)
4922 OpsWidth = MaxInst - I;
4933 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
4940 if (AllowReorder && Order) {
4947 Value *ReorderedOps[] = {Ops[1], Ops[0]};
4948 R.buildTree(ReorderedOps, None);
4950 if (R.isTreeTinyAndNotFullyVectorizable())
4953 R.computeMinimumValueSizes();
4954 int Cost = R.getTreeCost() - UserCost;
4955 CandidateFound = true;
4956 MinCost = std::min(MinCost, Cost);
4959 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
4961 cast<Instruction>(Ops[0]))
4962 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
4963 << " and with tree size "
4964 << ore::NV("TreeSize", R.getTreeSize()));
4975 if (!Changed && CandidateFound) {
4976 R.getORE()->emit([&]() {
4978 << "List vectorization was possible but not beneficial with cost "
4979 << ore::NV("Cost", MinCost) << " >= "
4982 } else if (!Changed) {
4983 R.getORE()->emit([&]() {
4985 << "Cannot SLP vectorize list: vectorization was impossible"
4986 << " with available vectorization factors";
4992 bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
4996 if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
5004 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
5008 if (tryToVectorizePair(Op0, Op1, R))
5014 if (B && B->hasOneUse()) {
5017 if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
5019 if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
5027 if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
5029 if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
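// createRdxShuffleMask() produces the masks for the shuffle-based reduction
// tree. With VecLen = 8 and NumEltsToRdx = 4, the pairwise form yields
// <0,2,4,6> for the left operand and <1,3,5,7> for the right one
// (2 * i + !IsLeft), while the splitting form yields <4,5,6,7>
// (NumEltsToRdx + i), folding the upper half onto the lower half.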
5046 bool IsPairwise, bool IsLeft,
5048 assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
5055 for (unsigned i = 0; i != NumEltsToRdx; ++i)
5056 ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
5059 for (unsigned i = 0; i != NumEltsToRdx; ++i)
5060 ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
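// HorizontalReduction matches flat reduction trees (add/fadd/mul/and/or/xor
// plus the min/max select idioms) rooted at a phi or a store. Each node is
// summarized as an OperationData carrying the opcode, both operands, the
// reduction kind and NaN behaviour; when the cost model agrees, the scalar
// tree is replaced by a vectorized body followed by a single horizontal
// reduction step.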
5094 class HorizontalReduction {
5097 ReductionOpsListType ReductionOps;
5113 class OperationData {
5115 unsigned Opcode = 0;
5118 Value *LHS = nullptr;
5121 Value *RHS = nullptr;
5130 bool isVectorizable() const {
5131 return LHS && RHS &&
5133 ((Kind == RK_Arithmetic &&
5135 Opcode == Instruction::Mul || Opcode == Instruction::FMul ||
5136 Opcode == Instruction::And || Opcode == Instruction::Or ||
5137 Opcode == Instruction::Xor)) ||
5138 ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
5139 (Kind == RK_Min || Kind == RK_Max)) ||
5140 (Opcode == Instruction::ICmp &&
5141 (Kind == RK_UMin || Kind == RK_UMax)));
5146 assert(isVectorizable() &&
5147 "Expected add|fadd or min/max reduction operation.");
5154 Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
5158 Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
5162 assert(Opcode == Instruction::ICmp && "Expected integer types.");
5166 assert(Opcode == Instruction::ICmp && "Expected integer types.");
5176 explicit OperationData() = default;
5180 explicit OperationData(Value *V) {
5181 if (auto *I = dyn_cast<Instruction>(V))
5189 : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
5190 assert(Kind != RK_None && "One of the reduction operations is expected.");
5193 explicit operator bool() const { return Opcode; }
5196 unsigned getFirstOperandIndex() const {
5197 assert(!!*this && "The opcode is not set.");
5212 unsigned getNumberOfOperands() const {
5213 assert(Kind != RK_None && !!*this && LHS && RHS &&
5214 "Expected reduction operation.");
5231 assert(Kind != RK_None && !!*this && LHS && RHS &&
5232 "Expected reduction operation.");
5245 auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
5246 return I->getParent() == P && Cmp && Cmp->getParent() == P;
5254 bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
5255 assert(Kind != RK_None && !!*this && LHS && RHS &&
5256 "Expected reduction operation.");
5266 cast<SelectInst>(I)->getCondition()->hasOneUse());
5274 void initReductionOps(ReductionOpsListType &ReductionOps) {
5275 assert(Kind != RK_None && !!*this && LHS && RHS &&
5276 "Expected reduction operation.");
5279 ReductionOps.assign(1, ReductionOpsType());
5285 ReductionOps.assign(2, ReductionOpsType());
5292 void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
5293 assert(Kind != RK_None && !!*this && LHS && RHS &&
5294 "Expected reduction operation.");
5297 ReductionOps[0].emplace_back(I);
5303 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
5304 ReductionOps[1].emplace_back(I);
5313 assert(Kind != RK_None && *this && LHS && RHS &&
5314 "Expected reduction operation.");
5320 return Opcode == Instruction::ICmp ||
5321 cast<Instruction>(I->getOperand(0))->isFast();
5324 assert(Opcode == Instruction::ICmp &&
5325 "Only integer compare operation is expected.");
5341 assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
5342 "One of the comparing operations is incorrect.");
5343 return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
5345 bool operator!=(const OperationData &OD) { return !(*this == OD); }
5356 assert(isVectorizable() && "Expected vectorizable operation.");
5362 Value *getLHS() const { return LHS; }
5363 Value *getRHS() const { return RHS; }
5364 Type *getConditionType() const {
5382 const ReductionOpsListType &ReductionOps) const {
5383 assert(isVectorizable() &&
5384 "Expected add|fadd or min/max reduction operation.");
5385 auto *Op = createOp(Builder, Name);
5394 if (auto *SI = dyn_cast<SelectInst>(Op))
5407 assert(isVectorizable() &&
5408 "Expected add|fadd or min/max reduction operation.");
5409 auto *Op = createOp(Builder, Name);
5418 if (auto *SI = dyn_cast<SelectInst>(Op)) {
5420 cast<SelectInst>(I)->getCondition());
5432 Flags.NoNaN = NoNaN;
5437 Flags.IsSigned = Opcode == Instruction::ICmp;
5441 Flags.IsSigned = Opcode == Instruction::ICmp;
5462 OperationData ReductionData;
5465 OperationData ReducedValueData;
5469 bool IsPairwiseReduction = false;
5473 void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
5475 if (ExtraArgs.count(ParentStackElem.first)) {
5476 ExtraArgs[ParentStackElem.first] = nullptr;
5483 ParentStackElem.second = ParentStackElem.first->getNumOperands();
5487 ExtraArgs[ParentStackElem.first] = ExtraArg;
5491 static OperationData getOperationData(Value *V) {
5493 return OperationData();
5498 return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
5501 if (auto *Select = dyn_cast<SelectInst>(V)) {
5504 return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
5506 return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
5509 return OperationData(
5510 Instruction::FCmp, LHS, RHS, RK_Min,
5511 cast<Instruction>(Select->getCondition())->hasNoNaNs());
5513 return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
5515 return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
5518 return OperationData(
5519 Instruction::FCmp, LHS, RHS, RK_Max,
5520 cast<Instruction>(Select->getCondition())->hasNoNaNs());
5537 LHS = Select->getTrueValue();
5538 RHS = Select->getFalseValue();
5543 if (!isa<ExtractElementInst>(RHS) ||
5545 return OperationData(V);
5547 if (!isa<ExtractElementInst>(LHS) ||
5549 return OperationData(V);
5551 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
5552 return OperationData(V);
5556 return OperationData(V);
5560 return OperationData(V);
5564 return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
5568 return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
5574 return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
5575 cast<Instruction>(Cond)->hasNoNaNs());
5579 return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
5583 return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
5589 return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
5590 cast<Instruction>(Cond)->hasNoNaNs());
5594 return OperationData(V);
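// getOperationData() is the leaf pattern matcher. A plain binary operator
// maps to an RK_Arithmetic entry, while a select is tested against the usual
// min/max idioms; for example "%c = icmp slt %a, %b; %m = select %c, %a, %b"
// yields (Instruction::ICmp, %a, %b, RK_Min), and the FCmp variants also
// record whether the compare is marked no-NaNs.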
5598 HorizontalReduction() = default;
5603 "This phi needs to use the binary operator");
5605 ReductionData = getOperationData(B);
5611 if (ReductionData.getLHS() == Phi) {
5614 ReductionData = getOperationData(B);
5615 } else if (ReductionData.getRHS() == Phi) {
5618 ReductionData = getOperationData(B);
5622 if (!ReductionData.isVectorizable(B))
5631 ReducedValueData.clear();
5637 Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex()));
5638 ReductionData.initReductionOps(ReductionOps);
5639 while (!Stack.empty()) {
5641 unsigned EdgeToVist = Stack.back().second++;
5642 OperationData OpData = getOperationData(TreeN);
5643 bool IsReducedValue = OpData != ReductionData;
5646 if (IsReducedValue || EdgeToVist == OpData.getNumberOfOperands()) {
5650 auto I = ExtraArgs.find(TreeN);
5651 if (I != ExtraArgs.end() && !I->second) {
5653 if (Stack.size() <= 1) {
5661 markExtraArg(Stack[Stack.size() - 2], TreeN);
5662 ExtraArgs.erase(TreeN);
5664 ReductionData.addReductionOps(TreeN, ReductionOps);
5675 OpData = getOperationData(I);
5680 if (I && (!ReducedValueData || OpData == ReducedValueData ||
5681 OpData == ReductionData)) {
5682 const bool IsReductionOperation = OpData == ReductionData;
5684 if (!ReductionData.hasSameParent(I, B->getParent(),
5685 IsReductionOperation)) {
5687 markExtraArg(Stack.back(), I);
5693 if (!ReductionData.hasRequiredNumberOfUses(I,
5694 OpData == ReductionData) &&
5697 markExtraArg(Stack.back(), I);
5701 if (IsReductionOperation) {
5703 if (!OpData.isAssociative(I)) {
5705 markExtraArg(Stack.back(), I);
5708 } else if (ReducedValueData &&
5709 ReducedValueData != OpData) {
5713 markExtraArg(Stack.back(), I);
5715 } else if (!ReducedValueData)
5716 ReducedValueData = OpData;
5718 Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex()));
5723 markExtraArg(Stack.back(), NextV);
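// matchAssociativeReduction() walks the candidate tree with an explicit
// stack rather than recursion. Values that cannot continue the reduction
// (different parent block, too many uses, a non-associative FP op, or a
// second distinct leaf kind) are routed through markExtraArg() and stitched
// back later as "op.extra" operands instead of failing the whole match.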
5731 if (ReducedVals.empty())
5737 unsigned NumReducedVals = ReducedVals.size();
5738 if (NumReducedVals < 4)
5743 Value *VectorizedTree = nullptr;
5744 IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
5750 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
5753 for (auto &Pair : ExtraArgs) {
5754 assert(Pair.first && "DebugLoc must be set.");
5755 ExternallyUsedValues[Pair.second].push_back(Pair.first);
5759 ExternallyUsedValues[ReductionRoot];
5761 for (auto &V : ReductionOps)
5762 IgnoreList.append(V.begin(), V.end());
5763 while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
5765 V.buildTree(VL, ExternallyUsedValues, IgnoreList);
5768 if (Order && Order->size() == VL.size()) {
5772 [VL](const unsigned Idx) { return VL[Idx]; });
5773 V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
5775 if (V.isTreeTinyAndNotFullyVectorizable())
5778 V.computeMinimumValueSizes();
5781 int TreeCost = V.getTreeCost();
5782 int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth);
5783 int Cost = TreeCost + ReductionCost;
5785 V.getORE()->emit([&]() {
5787 SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
5788 << "Vectorizing horizontal reduction is possible "
5789 << "but not beneficial with cost "
5790 << ore::NV("Cost", Cost) << " and threshold "
5796 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
5797 << Cost << ". (HorRdx)\n");
5798 V.getORE()->emit([&]() {
5800 SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
5801 << "Vectorized horizontal reduction with cost "
5802 << ore::NV("Cost", Cost) << " and with tree size "
5803 << ore::NV("TreeSize", V.getTreeSize());
5808 Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
5812 Value *ReducedSubTree =
5813 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
5814 if (VectorizedTree) {
5816 OperationData VectReductionData(ReductionData.getOpcode(),
5817 VectorizedTree, ReducedSubTree,
5818 ReductionData.getKind());
5820 VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
5822 VectorizedTree = ReducedSubTree;
5827 if (VectorizedTree) {
5829 for (; i < NumReducedVals; ++i) {
5830 auto *I = cast<Instruction>(ReducedVals[i]);
5832 OperationData VectReductionData(ReductionData.getOpcode(),
5834 ReductionData.getKind());
5835 VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
5837 for (auto &Pair : ExternallyUsedValues) {
5839 for (auto *I : Pair.second) {
5841 OperationData VectReductionData(ReductionData.getOpcode(),
5842 VectorizedTree, Pair.first,
5843 ReductionData.getKind());
5844 VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I);
5848 ReductionRoot->replaceAllUsesWith(VectorizedTree);
5850 return VectorizedTree != nullptr;
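// tryToReduce() in brief: it bails out below four reduced values, builds an
// SLP tree over each window of ReduxWidth leaves, emits profitable windows
// through emitReduction(), chains the partial results with "op.rdx"
// operations, and finally folds the leftover scalar leaves and the recorded
// extra arguments back in before replacing the reduction root.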
5853 unsigned numReductionValues() const {
5854 return ReducedVals.size();
5860 unsigned ReduxWidth) {
5864 int PairwiseRdxCost;
5865 int SplittingRdxCost;
5866 switch (ReductionData.getKind()) {
5880 bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
5881 ReductionData.getKind() == RK_UMax;
5894 IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
5895 int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
5897 int ScalarReduxCost;
5898 switch (ReductionData.getKind()) {
5915 ScalarReduxCost *= (ReduxWidth - 1);
5917 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
5918 << " for reduction that starts with " << *FirstReducedVal
5920 << (IsPairwiseReduction ? "pairwise" : "splitting")
5921 << " reduction)\n");
5923 return VecReduxCost - ScalarReduxCost;
5929 assert(VectorizedValue && "Need to have a vectorized tree node");
5931 "We only handle power-of-two reductions for now");
5933 if (!IsPairwiseReduction)
5935 Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
5936 ReductionData.getFlags(), ReductionOps.back());
5938 Value *TmpVec = VectorizedValue;
5939 for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
5950 OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf,
5951 RightShuf, ReductionData.getKind());
5952 TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
5977 if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) {
5980 CI->getZExtValue());
5984 if (isa<UndefValue>(V))
5987 if (!LastInsertElem || !LastInsertElem->hasOneUse())
6003 if (isa<UndefValue>(V))
6029 auto DominatedReduxValue = [&](Value *R) {
6030 return isa<Instruction>(R) &&
6034 Value *Rdx = nullptr;
6043 if (Rdx && DominatedReduxValue(Rdx))
6062 if (Rdx && DominatedReduxValue(Rdx))
6089 if (Root->getParent() != BB || isa<PHINode>(Root))
6103 while (!Stack.empty()) {
6106 std::tie(V, Level) = Stack.pop_back_val();
6115 HorizontalReduction HorRdx;
6116 if (HorRdx.matchAssociativeReduction(P, Inst)) {
6117 if (HorRdx.tryToReduce(R, TTI)) {
6140 if (Vectorize(Inst, R)) {
6149 for (auto *Op : Inst->operand_values())
6150 if (VisitedInstrs.insert(Op).second)
6151 if (auto *I = dyn_cast<Instruction>(Op))
6152 if (!isa<PHINode>(I) && I->getParent() == BB)
6153 Stack.emplace_back(Op, Level);
6158 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
6167 if (!isa<BinaryOperator>(I))
6170 auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
6171 return tryToVectorize(I, R);
6174 ExtraVectorization);
6177 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
6180 if (!R.canMapToVector(IVI->getType(), DL))
6187 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
6190 return tryToVectorizeList(BuildVectorOpds, R);
6199 [](Value *V) { return isa<ExtractElementInst>(V); }) &&
6205 return tryToVectorizeList(BuildVectorOpds, R, UserCost);
6213 bool OpsChanged = false;
6214 for (int Idx = 0; Idx < 2; ++Idx) {
6216 vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
6221 bool SLPVectorizerPass::vectorizeSimpleInstructions(
6223 bool OpsChanged = false;
6224 for (auto &VH : reverse(Instructions)) {
6225 auto *I = dyn_cast_or_null<Instruction>(VH);
6228 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
6229 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
6230 else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
6231 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
6232 else if (auto *CI = dyn_cast<CmpInst>(I))
6233 OpsChanged |= vectorizeCmpInst(CI, BB, R);
6235 Instructions.clear();
6239 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
6240 bool Changed = false;
6244 bool HaveVectorizedPhiNodes = true;
6245 while (HaveVectorizedPhiNodes) {
6246 HaveVectorizedPhiNodes = false;
6255 if (!VisitedInstrs.count(P))
6269 while (SameTypeIt != E &&
6270 (*SameTypeIt)->getType() == (*IncIt)->getType()) {
6271 VisitedInstrs.insert(*SameTypeIt);
6276 unsigned NumElts = (SameTypeIt - IncIt);
6277 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
6278 << NumElts << ")\n");
6283 bool AllowReorder = NumElts == 2;
6284 if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
6287 HaveVectorizedPhiNodes = true;
6297 VisitedInstrs.clear();
6303 if (!VisitedInstrs.insert(&*it).second) {
6304 if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
6305 vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
6315 if (isa<DbgInfoIntrinsic>(it))
6319 if (PHINode *P = dyn_cast<PHINode>(it)) {
6338 if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
6339 isa<InvokeInst>(it))) {
6341 bool OpsChanged = false;
6343 for (auto *V : it->operand_values()) {
6345 OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
6351 OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
6362 if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
6363 isa<InsertValueInst>(it))
6364 PostProcessInstructions.push_back(&*it);
6370 bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
6371 auto Changed = false;
6372 for (auto &Entry : GEPs) {
6375 if (Entry.second.size() < 2)
6378 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
6379 << Entry.second.size() << ".\n");
6383 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
6384 auto Len = std::min<unsigned>(BE - BI, 16);
6397 Candidates.remove(nullptr);
6404 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
6405 auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
6406 if (!Candidates.count(GEPI))
6408 auto *SCEVI = SE->getSCEV(GEPList[I]);
6409 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
6410 auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
6411 auto *SCEVJ = SE->getSCEV(GEPList[J]);
6412 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
6413 Candidates.remove(GEPList[I]);
6414 Candidates.remove(GEPList[J]);
6415 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
6416 Candidates.remove(GEPList[J]);
6423 if (Candidates.size() < 2)
6430 auto BundleIndex = 0u;
6431 for (auto *V : Candidates) {
6432 auto *GEP = cast<GetElementPtrInst>(V);
6433 auto *GEPIdx = GEP->idx_begin()->get();
6434 assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
6435 Bundle[BundleIndex++] = GEPIdx;
6447 Changed |= tryToVectorizeList(Bundle, R);
6453 bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
6454 bool Changed = false;
6456 for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
6458 if (it->second.size() < 2)
6462 << it->second.size() << ".\n");
6468 for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
6469 unsigned Len = std::min<unsigned>(CE - CI, 16);
6470 Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
Legacy wrapper pass to provide the GlobalsAAResult object.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
Pass interface - Implemented by all 'passes'.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value *> &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop)...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, AliasSetTracker *AST, MemorySSAUpdater *MSSAU)
bool hasOperandBundles() const
Return true if this User has any operand bundles.
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Value * getValueOperand()
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
A parsed version of the target data layout string in and methods for querying it. ...
uint64_t getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
This class is the base class for the comparison instructions.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, TargetLibraryInfo &TLI, DominatorTree &DT)
This is the entry point for all transforms.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
static IntegerType * getInt1Ty(LLVMContext &C)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
static bool allConstant(ArrayRef< Value *> VL)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock *> Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
const T & back() const
back - Get the last element.
Value * getAggregateOperand()
void setFast(bool B=true)
DiagnosticInfoOptimizationBase::Argument NV
void dropAllReferences()
Drop all references to operands.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
OptimizationRemarkEmitter * getORE()
This class represents lattice values for constants.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
This is the interface for a simple mod/ref and alias analysis over globals.
A Module instance is used to store all the information related to an LLVM module. ...
static InstructionsState getSameOpcode(ArrayRef< Value *> VL, unsigned BaseIndex=0)
static Value * getReductionValue(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction value from a phi node.
void push_back(const T &Elt)
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
This provides a very simple, boring adaptor for a begin and end iterator into a range type...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value *> VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal, MD_access_group].
The main scalar evolution driver.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
This class represents a function call, abstracting a target machine's calling convention.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
An immutable pass that tracks lazily created AssumptionCache objects.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space...
const Value * getTrueValue() const
An efficient, type-erasing, non-owning reference to a callable.
static const unsigned MaxMemDepDistance
A cache of @llvm.assume calls within a function.
Analysis pass providing the TargetTransformInfo.
0 1 0 0 True if ordered and less than
static bool collectValuesToDemote(Value *V, SmallPtrSetImpl< Value *> &Expr, SmallVectorImpl< Value *> &ToDemote, SmallVectorImpl< Value *> &Roots)
Value * createSimpleTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI, unsigned Opcode, Value *Src, TargetTransformInfo::ReductionFlags Flags=TargetTransformInfo::ReductionFlags(), ArrayRef< Value *> RedOps=None)
Create a target reduction of the given vector.
bool isTerminator() const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
This class implements a map that also provides access to all stored values in a deterministic order...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
STATISTIC(NumFunctions, "Total number of functions")
Analysis pass which computes a DominatorTree.
An instruction for reading from memory.
reverse_iterator rbegin()
MaxMin_match< FCmpInst, LHS, RHS, ufmax_pred_ty > m_UnordFMax(const LHS &L, const RHS &R)
Match an 'unordered' floating point maximum function.
VectorType::iterator erase(typename VectorType::iterator Iterator)
Remove the element given by Iterator.
std::enable_if<!std::is_array< T >::value, std::unique_ptr< T > >::type make_unique(Args &&... args)
Constructs a new T() with the given args and returns a unique_ptr<T> which owns the object...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
DOTGraphTraits(bool isSimple=false)
This defines the Use class.
void reserve(size_type N)
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
*ViewGraph Emit a dot run run gv on the postscript *then cleanup For use from the debugger *void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
iterator begin()
Instruction iterator methods.
bool isIdenticalTo(const Instruction *I) const
Return true if the specified instruction is exactly identical to the current one. ...
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
static nodes_iterator nodes_end(BoUpSLP *R)
Value * getArgOperand(unsigned i) const
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
void dump() const
Support for debugging, callable in GDB: V->dump()
bool match(Val *V, const Pattern &P)
AnalysisUsage & addRequired()
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
#define INITIALIZE_PASS_DEPENDENCY(depName)
static uint32_t getAlignment(const MCSectionCOFF &Sec)
amdgpu Simplify well known AMD library false Value Value const Twine & Name
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< unsigned > &Mask)
static Optional< TargetTransformInfo::ShuffleKind > isShuffle(ArrayRef< Value *> VL)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
This class represents the LLVM 'select' instruction.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
static bool findBuildAggregate(InsertValueInst *IV, SmallVectorImpl< Value *> &BuildVectorOpds)
Like findBuildVector, but looks for construction of aggregate.
This is the base class for all instructions that perform data casts.
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
Class to represent struct types.
A Use represents the edge between a Value definition and its users.
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
static Optional< unsigned > getOpcode(ArrayRef< VPValue *> Values)
Returns the opcode of Values or ~0 if they do not all agree.
unsigned getTreeSize() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
An analysis that produces DemandedBits for a function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
0 1 0 1 True if ordered and less than or equal
This file contains the simple types necessary to represent the attributes associated with functions a...
static bool shouldReorderOperands(int i, unsigned Opcode, Instruction &I, ArrayRef< Value *> Left, ArrayRef< Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight)
MaxMin_match< FCmpInst, LHS, RHS, ufmin_pred_ty > m_UnordFMin(const LHS &L, const RHS &R)
Match an 'unordered' floating point minimum function.
static const unsigned AliasedCheckLimit
Analysis pass that exposes the LoopInfo for a function.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
uint64_t getNumElements() const
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
std::vector< TreeEntry > & VectorizableTree
bool remove(const value_type &X)
Remove an item from the set vector.
void assign(size_type NumElts, const T &Elt)
static ChildIteratorType child_end(NodeRef N)
static bool allSameBlock(ArrayRef< Value *> VL)
static bool isSimple(Instruction *I)
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Type * getType() const
All values are typed, get the type of this value.
Value handle that is nullable, but tries to track the Value.
bool insert(const value_type &X)
Insert a new element into the SetVector.
static bool isEqual(const Function &Caller, const Function &Callee)
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
BoUpSLP::TreeEntry TreeEntry
const T & getValue() const LLVM_LVALUE_FUNCTION
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
MaxMin_match< FCmpInst, LHS, RHS, ofmin_pred_ty > m_OrdFMin(const LHS &L, const RHS &R)
Match an 'ordered' floating point minimum function.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
void insert(ScheduleData *SD)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Value * getInsertedValueOperand()
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
iterator_range< User::op_iterator > arg_operands()
static Value * createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, bool IsPairwise, bool IsLeft, IRBuilder<> &Builder)
Generate a shuffle mask to be used in a reduction tree.
An instruction for storing to memory.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator find(const KeyT &Key)
VectorType * getType() const
Overload to return most specific vector type.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Function * getDeclaration(Module *M, ID id, ArrayRef< Type *> Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Value * getOperand(unsigned i) const
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Class to represent pointers.
void replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
static ChildIteratorType child_begin(NodeRef N)
unsigned getMinVecRegSize() const
static bool findBuildVector(InsertElementInst *LastInsertElem, TargetTransformInfo *TTI, SmallVectorImpl< Value *> &BuildVectorOpds, int &UserCost)
Recognize construction of vectors like ra = insertelement <4 x float> undef, float s0...
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool isVoidTy() const
Return true if this is 'void'.
const BasicBlock & getEntryBlock() const
static bool runOnFunction(Function &F, bool PostInlining)
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits...
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
initializer< Ty > init(const Ty &Val)
This instruction inserts a single (scalar) element into a VectorType value.
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled...
typename GraphType::UnknownGraphTypeError NodeRef
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
A set of analyses that are preserved following a run of a transformation pass.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
bool hasNUses(unsigned N) const
Return true if this Value has exactly N users.
BoUpSLP::TreeEntry TreeEntry
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
LLVM Basic Block Representation.
The instances of the Type class are immutable: once they are created, they are never changed...
static bool tryToVectorizeHorReductionOrInstOperands(PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI, const function_ref< bool(Instruction *, BoUpSLP &)> Vectorize)
Attempt to reduce a horizontal reduction.
CRTP base class for adapting an iterator to a different type.
size_t size() const
size - Get the array size.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Value * getIncomingValueForBlock(const BasicBlock *BB) const
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
A manager for alias analyses.
std::pair< iterator, bool > insert(const ValueT &V)
Value * CreateFCmpOGT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
bool isAssociative() const LLVM_READONLY
Return true if the instruction is associative:
Represent the analysis usage information of a pass.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly...
Analysis pass providing a never-invalidated alias analysis result.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
FunctionPass class - This class is used to implement most global optimizations.
Value * getPointerOperand()
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
iterator_range< po_iterator< T > > post_order(const T &G)
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
self_iterator getIterator()
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Class to represent integer types.
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
void setAlignment(unsigned Align)
Used in the streaming interface as the general argument type.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static Optional< unsigned > getExtractIndex(Instruction *E)
static wasm::ValType getType(const TargetRegisterClass *RC)
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
auto find(R &&Range, const T &Val) -> decltype(adl_begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
1 1 0 1 True if unordered, less than, or equal
Value * GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value...
std::string & str()
Flushes the stream contents to the target string and returns the string's reference.
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
void propagateIRFlags(Value *I, ArrayRef< Value *> VL, Value *OpValue=nullptr)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
const Value * Ptr
The address of the start of the location.
Representation for a specific memory location.
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
0 0 1 0 True if ordered and greater than
A function analysis which provides an AssumptionCache.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Predicate getPredicate(unsigned Condition, unsigned Hint)
Return predicate consisting of specified condition and hint bits.
This is the common base class for memset/memcpy/memmove.
Iterator for intrusive lists based on ilist_node.
This is the shared class of boolean and integer constants.
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false...
1 1 0 0 True if unordered or less than
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Module.h This file contains the declarations for the Module class.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Provides information about what library functions are available for the current target.
size_type count(const KeyT &Key) const
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
LLVM_NODISCARD T pop_back_val()
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
static nodes_iterator nodes_begin(BoUpSLP *R)
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
Optional< ArrayRef< unsigned > > bestOrder() const
void setPreservesCFG()
This function should be called by the pass iff it does not: (1) add or remove basic blocks from the function, or (2) modify terminator instructions in any way.
bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the intrinsic has a scalar operand.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
bool isCommutative() const
Return true if the instruction is commutative: commutative operators satisfy (x op y) == (y op x).
Intrinsic::ID getIntrinsicID() const LLVM_READONLY
getIntrinsicID - This method returns the ID number of the specified function, or Intrinsic::not_intrinsic if the function is not an intrinsic.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Implements a dense probed hash-table based set with some number of buckets stored inline.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Class to represent vector types.
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not have a module.
static bool allSameType(ArrayRef< Value *> VL)
iterator_range< user_iterator > users()
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
unsigned getMaxVecRegSize() const
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array...
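A sketch of walking a candidate list in fixed-size windows, roughly how store chains are chunked (VL, VF, and tryChunk are assumptions):

  for (unsigned I = 0, E = VL.size(); I + VF <= E; I += VF) {
    ArrayRef<Value *> Chunk = VL.slice(I, VF); // drop I, keep VF
    tryChunk(Chunk);                           // hypothetical consumer
  }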
Represents analyses that only rely on functions' control flow.
iterator insert(iterator I, T &&Elt)
const Value * getFalseValue() const
void append(in_iter in_start, in_iter in_end)
Add the specified range to the end of the SmallVector.
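A minimal sketch:

  SmallVector<int, 8> Dst = {1, 2};
  int More[] = {3, 4, 5};
  Dst.append(std::begin(More), std::end(More)); // Dst is now 1,2,3,4,5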
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Analysis pass that exposes the ScalarEvolution for a function.
bool operator!=(uint64_t V1, const APInt &V2)
Predicate getPredicate() const
Return the predicate for this instruction.
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
static unsigned size(BoUpSLP *R)
unsigned getNumArgOperands() const
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void initializeSLPVectorizerPass(PassRegistry &)
unsigned getAlignment() const
Return the alignment of the access that is being performed.
void emplace_back(ArgTypes &&... Args)
This class represents an analyzed expression in the program.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_NODISCARD bool empty() const
ICMP_UGE: unsigned greater or equal.
Represents a single loop in the control flow graph.
This file provides utility analysis objects describing memory locations.
void preserveSet()
Mark an analysis set as preserved.
StringRef getName() const
Return a constant reference to the value's name.
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation.
const Function * getParent() const
Return the enclosing method, or null if none.
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
ilist_iterator< OptionsT, !IsReverse, IsConst > getReverse() const
Get a reverse iterator to the same node.
void preserve()
Mark an analysis as preserved.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
FCMP_UGT (U L G E = 1 0 1 0): true if unordered or greater than.
unsigned getAlignment() const
Return the alignment of the access that is being performed.
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on each one of them as this OperandBundleUser.
Pass * createSLPVectorizerPass()
OutputIt transform(R &&Range, OutputIt d_first, UnaryPredicate P)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
ChildIteratorType(SmallVector< int, 1 >::iterator W, std::vector< TreeEntry > &VT)
Analysis pass providing the TargetLibraryInfo.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA)
static NodeRef getEntryNode(BoUpSLP &R)
user_iterator user_begin()
static const char lv_name[]
static bool PhiTypeSorterFunc(Value *V, Value *V2)
A raw_ostream that writes to an std::string.
MaxMin_match< FCmpInst, LHS, RHS, ofmax_pred_ty > m_OrdFMax(const LHS &L, const RHS &R)
Match an 'ordered' floating point maximum function.
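A sketch of recognizing the select-based fmax idiom with PatternMatch (V is an assumed Value*):

  using namespace llvm::PatternMatch;
  Value *A, *B;
  // Matches e.g. select (fcmp ogt %a, %b), %a, %b.
  if (match(V, m_OrdFMax(m_Value(A), m_Value(B)))) {
    // A and B are now bound to the operands of the max.
  }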
uint64_t PowerOf2Floor(uint64_t A)
Returns the power of two which is less than or equal to the given value.
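For example, PowerOf2Floor(24) == 16 and PowerOf2Floor(32) == 32. A sketch of the typical clamp of a vectorization factor (the names are illustrative; the helper is in llvm/Support/MathExtras.h):

  unsigned MaxVF = PowerOf2Floor(MaxVecRegBits / ScalarSizeInBits);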
Module * getParent()
Get the module that this global value is contained inside of.
LLVM Value Representation.
void setAlignment(unsigned Align)
FCMP_UGE (U L G E = 1 0 1 1): true if unordered, greater than, or equal.
A vector that has set insertion semantics.
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct a VectorType.
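A minimal sketch (the fixed-width, pre-scalable-vector API), assuming an LLVMContext Ctx:

  // <8 x i32>: eight lanes of i32.
  VectorType *VecTy = VectorType::get(IntegerType::get(Ctx, 32), 8);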
static bool hasValueBeenRAUWed(ArrayRef< Value *> VL, ArrayRef< WeakTrackingVH > VH, unsigned SliceBegin, unsigned SliceSize)
Check that the Values in the slice in VL array are still existent in the WeakTrackingVH array.
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
static const Function * getParent(const Value *V)
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right before MovePos.
This class implements an extremely fast bulk output stream that can only output to a stream.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits methods.
The legacy pass manager's analysis pass to compute loop information.
bool hasOneUse() const
Return true if there is exactly one user of this value.
Convenience struct for specifying and reasoning about fast-math flags.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
A container for analyses that lazily runs them and caches their results.
static bool isSplat(ArrayRef< Value *> VL)
Legacy analysis pass which computes a DominatorTree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
bool operator==(uint64_t V1, const APInt &V2)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
This header defines various interfaces for pass management in LLVM.
op_range incoming_values()
FCMP_OGE (U L G E = 0 0 1 1): true if ordered and greater than or equal.
Bottom Up SLP Vectorizer.
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
bool sortPtrAccesses(ArrayRef< Value *> VL, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
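A hedged sketch of the calling convention (PointerOps, DL, and SE are assumed to exist):

  SmallVector<unsigned, 4> SortedIndices;
  if (sortPtrAccesses(PointerOps, DL, SE, SortedIndices)) {
    // Success: SortedIndices describes the permutation (if any) that
    // puts the pointers into increasing address order.
  }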
Value * getPointerOperand()
static Constant * get(ArrayRef< Constant *> V)
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
bool empty() const
empty - Check if the array is empty.
const BasicBlock * getParent() const
This class represents a constant integer value.
This instruction inserts a struct field or array element value into an aggregate value.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
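A minimal sketch (llvm/ADT/STLExtras.h):

  SmallVector<StringRef, 3> Names = {"add", "mul", "sub"};
  bool HasMul = is_contained(Names, StringRef("mul")); // true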
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.