50 #define DEBUG_TYPE "interleaved-load-combine" 55 STATISTIC(NumInterleavedLoadCombine,
"Number of combined loads");
60 cl::desc(
"Disable combining of interleaved loads"));
64 struct InterleavedLoadCombineImpl {
68 :
F(F), DT(DT), MSSA(MSSA),
94 LoadInst *findFirstLoad(
const std::set<LoadInst *> &LIs);
99 bool combine(std::list<VectorInfo> &InterleavedLoad,
104 bool findPattern(std::list<VectorInfo> &Candidates,
105 std::list<VectorInfo> &InterleavedLoad,
unsigned Factor,
187 Polynomial(
Value *V) : ErrorMSBs((
unsigned)-1), V(V),
B(), A() {
196 Polynomial(
const APInt &A,
unsigned ErrorMSBs = 0)
197 : ErrorMSBs(ErrorMSBs), V(NULL),
B(), A(A) {}
199 Polynomial(
unsigned BitWidth, uint64_t A,
unsigned ErrorMSBs = 0)
200 : ErrorMSBs(ErrorMSBs), V(NULL),
B(), A(BitWidth, A) {}
202 Polynomial() : ErrorMSBs((
unsigned)-1), V(NULL),
B(), A() {}
205 void incErrorMSBs(
unsigned amt) {
206 if (ErrorMSBs == (
unsigned)-1)
210 if (ErrorMSBs > A.getBitWidth())
211 ErrorMSBs = A.getBitWidth();
215 void decErrorMSBs(
unsigned amt) {
216 if (ErrorMSBs == (
unsigned)-1)
253 Polynomial &mul(
const APInt &C) {
325 pushBOperation(Mul, C);
330 Polynomial &lshr(
const APInt &C) {
479 if (A.countTrailingZeros() < shiftAmt)
480 ErrorMSBs = A.getBitWidth();
482 incErrorMSBs(shiftAmt);
485 pushBOperation(LShr, C);
486 A = A.lshr(shiftAmt);
492 Polynomial &sextOrTrunc(
unsigned n) {
493 if (n < A.getBitWidth()) {
496 decErrorMSBs(A.getBitWidth() - n);
498 pushBOperation(Trunc,
APInt(
sizeof(n) * 8, n));
500 if (n > A.getBitWidth()) {
503 incErrorMSBs(n - A.getBitWidth());
505 pushBOperation(SExt,
APInt(
sizeof(n) * 8, n));
512 bool isFirstOrder()
const {
return V !=
nullptr; }
515 bool isCompatibleTo(
const Polynomial &o)
const {
517 if (A.getBitWidth() != o.A.getBitWidth())
521 if (!isFirstOrder() && !o.isFirstOrder())
529 if (B.
size() != o.B.size())
532 auto ob = o.B.begin();
544 Polynomial
operator-(
const Polynomial &o)
const {
546 if (!isCompatibleTo(o))
552 return Polynomial(A - o.A,
std::max(ErrorMSBs, o.ErrorMSBs));
557 Polynomial Result(*
this);
564 Polynomial Result(*
this);
570 bool isProvenEqualTo(
const Polynomial &o) {
572 Polynomial r = *
this - o;
573 return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isNullValue());
578 OS <<
"[{#ErrBits:" << ErrorMSBs <<
"} ";
583 OS <<
"(" << *V <<
") ";
601 OS << b.second <<
") ";
605 OS <<
"+ " << A <<
"]";
614 void pushBOperation(
const BOps
Op,
const APInt &C) {
615 if (isFirstOrder()) {
638 VectorInfo(
const VectorInfo &c) : VTy(c.VTy) {
640 "Copying VectorInfo is neither implemented nor necessary,");
653 ElementInfo(Polynomial
Offset = Polynomial(),
LoadInst *LI =
nullptr)
664 std::set<LoadInst *> LIs;
667 std::set<Instruction *> Is;
679 : BB(
nullptr), PV(
nullptr), LIs(), Is(), SVI(
nullptr), VTy(VTy) {
683 virtual ~VectorInfo() {
delete[] EI; }
694 bool isInterleaved(
unsigned Factor,
const DataLayout &DL)
const {
696 for (
unsigned i = 1; i < getDimension(); i++) {
697 if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) {
712 static bool compute(
Value *V, VectorInfo &Result,
const DataLayout &DL) {
715 return computeFromSVI(SVI, Result, DL);
718 return computeFromLI(LI, Result, DL);
721 return computeFromBCI(BCI, Result, DL);
731 static bool computeFromBCI(
BitCastInst *BCI, VectorInfo &Result,
746 unsigned Factor = Result.VTy->getNumElements() / VTy->
getNumElements();
750 if (NewSize * Factor != OldSize)
754 if (!compute(Op, Old, DL))
757 for (
unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
758 for (
unsigned j = 0; j < Factor; j++) {
760 ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
761 j == 0 ? Old.EI[i / Factor].LI :
nullptr);
767 Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
768 Result.Is.insert(Old.Is.begin(), Old.Is.end());
769 Result.Is.insert(BCI);
770 Result.SVI =
nullptr;
789 assert(ArgTy &&
"ShuffleVector Operand is not a VectorType");
792 VectorInfo LHS(ArgTy);
797 VectorInfo RHS(ArgTy);
802 if (!LHS.BB && !RHS.BB)
815 else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) {
826 Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end());
827 Result.Is.insert(LHS.Is.begin(), LHS.Is.end());
830 Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end());
831 Result.Is.insert(RHS.Is.begin(), RHS.Is.end());
833 Result.Is.insert(SVI);
839 "Invalid ShuffleVectorInst (index out of bounds)");
842 Result.EI[j] = ElementInfo();
845 Result.EI[j] = LHS.EI[i];
847 Result.EI[j] = ElementInfo();
852 Result.EI[j] = ElementInfo();
868 static bool computeFromLI(
LoadInst *LI, VectorInfo &Result,
884 Result.LIs.insert(LI);
885 Result.Is.insert(LI);
887 for (
unsigned i = 0; i < Result.getDimension(); i++) {
893 Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI :
nullptr);
903 static void computePolynomialBinOp(
BinaryOperator &BO, Polynomial &Result) {
920 computePolynomial(*LHS, Result);
924 case Instruction::LShr:
928 computePolynomial(*LHS, Result);
936 Result = Polynomial(&BO);
943 static void computePolynomial(
Value &V, Polynomial &Result) {
944 if (isa<BinaryOperator>(&V))
945 computePolynomialBinOp(*dyn_cast<BinaryOperator>(&V), Result);
947 Result = Polynomial(&V);
956 static void computePolynomialFromPointer(
Value &Ptr, Polynomial &Result,
962 Result = Polynomial();
965 unsigned PointerBits =
969 if (isa<CastInst>(&Ptr)) {
970 CastInst &CI = *cast<CastInst>(&Ptr);
972 case Instruction::BitCast:
973 computePolynomialFromPointer(*CI.
getOperand(0), Result, BasePtr, DL);
977 Polynomial(PointerBits, 0);
982 else if (isa<GetElementPtrInst>(&Ptr)) {
985 APInt BaseOffset(PointerBits, 0);
989 Result = Polynomial(BaseOffset);
995 unsigned idxOperand, e;
1006 if (idxOperand + 1 != e) {
1007 Result = Polynomial();
1013 computePolynomial(*GEP.
getOperand(idxOperand), Result);
1022 Result.sextOrTrunc(PointerBits);
1023 Result.mul(
APInt(PointerBits, ResultSize));
1024 Result.add(BaseOffset);
1043 for (
unsigned i = 0; i < getDimension(); i++)
1044 OS << ((i == 0) ?
"[" :
", ") << EI[i].Ofs;
1052 bool InterleavedLoadCombineImpl::findPattern(
1053 std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
1055 for (
auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
1061 std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());
1063 for (
auto C = Candidates.begin(),
E = Candidates.end();
C !=
E;
C++) {
1064 if (
C->VTy != C0->VTy)
1066 if (
C->BB != C0->BB)
1068 if (
C->PV != C0->PV)
1072 for (i = 1; i < Factor; i++) {
1073 if (
C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {
1078 for (i = 1; i < Factor; i++) {
1079 if (Res[i] == Candidates.end())
1088 if (Res[0] != Candidates.end()) {
1090 for (
unsigned i = 0; i < Factor; i++) {
1091 InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
1101 InterleavedLoadCombineImpl::findFirstLoad(
const std::set<LoadInst *> &LIs) {
1102 assert(!LIs.empty() &&
"No load instructions given.");
1110 assert(FLI != BB->end());
1112 return cast<LoadInst>(FLI);
1115 bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
1122 LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
1125 if (!InsertionPoint)
1128 std::set<LoadInst *> LIs;
1129 std::set<Instruction *> Is;
1130 std::set<Instruction *> SVIs;
1132 unsigned InterleavedCost;
1133 unsigned InstructionCost = 0;
1136 unsigned Factor = InterleavedLoad.size();
1139 for (
auto &
VI : InterleavedLoad) {
1141 LIs.insert(
VI.LIs.begin(),
VI.LIs.end());
1146 Is.insert(
VI.Is.begin(),
VI.Is.end());
1149 SVIs.insert(
VI.SVI);
1159 for (
auto &
I : Is) {
1165 if (SVIs.find(
I) != SVIs.end())
1170 for (
const auto &U :
I->users()) {
1171 if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
1178 LoadInst *First = findFirstLoad(LIs);
1183 auto FMA = MSSA.getMemoryAccess(First);
1184 for (
auto LI : LIs) {
1185 auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
1186 if (!MSSA.dominates(MADef,
FMA))
1189 assert(!LIs.empty() &&
"There are no LoadInst to combine");
1192 for (
auto &
VI : InterleavedLoad) {
1193 if (!DT.dominates(InsertionPoint,
VI.SVI))
1200 Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
1201 unsigned ElementsPerSVI =
1202 InterleavedLoad.front().SVI->getType()->getNumElements();
1206 for (
unsigned i = 0; i < Factor; i++)
1208 InterleavedCost = TTI.getInterleavedMemoryOpCost(
1212 if (InterleavedCost >= InstructionCost) {
1219 "interleaved.wide.ptrcast");
1223 "interleaved.wide.load");
1225 MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
1226 LI,
nullptr, MSSA.getMemoryAccess(InsertionPoint)));
1227 MSSAU.insertUse(MSSALoad);
1231 for (
auto &
VI : InterleavedLoad) {
1233 for (
unsigned j = 0; j < ElementsPerSVI; j++)
1238 Mask,
"interleaved.shuffle");
1239 VI.SVI->replaceAllUsesWith(SVI);
1243 NumInterleavedLoadCombine++;
1246 <<
"Load interleaved combined with factor " 1253 bool InterleavedLoadCombineImpl::run() {
1255 bool changed =
false;
1256 unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();
1258 auto &DL =
F.getParent()->getDataLayout();
1261 for (
unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
1262 std::list<VectorInfo> Candidates;
1266 if (
auto SVI = dyn_cast<ShuffleVectorInst>(&
I)) {
1268 Candidates.emplace_back(SVI->getType());
1270 if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
1271 Candidates.pop_back();
1275 if (!Candidates.back().isInterleaved(Factor, DL)) {
1276 Candidates.pop_back();
1282 std::list<VectorInfo> InterleavedLoad;
1283 while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
1284 if (combine(InterleavedLoad, ORE)) {
1289 Candidates.splice(Candidates.begin(), InterleavedLoad,
1290 std::next(InterleavedLoad.begin()),
1291 InterleavedLoad.end());
1293 InterleavedLoad.clear();
1310 StringRef getPassName()
const override {
1311 return "Interleaved Load Combine Pass";
1315 if (DisableInterleavedLoadCombine)
1318 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1325 return InterleavedLoadCombineImpl(
1326 F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
1327 getAnalysis<MemorySSAWrapperPass>().getMSSA(),
1346 "Combine interleaved loads into wide loads and shufflevector instructions",
1352 "Combine interleaved loads into wide loads and shufflevector instructions",
1357 auto P =
new InterleavedLoadCombine();
A parsed version of the target data layout string in and methods for querying it. ...
Value * getPointerOperand()
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
uint64_t getZExtValue() const
Get zero extended value.
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
DiagnosticInfoOptimizationBase::Argument NV
APInt operator+(APInt a, const APInt &b)
This class represents lattice values for constants.
BinaryOps getOpcode() const
bool isAtomic() const
Return true if this instruction has an AtomicOrdering of unordered or higher.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, unsigned Align, const char *Name)
Provided to resolve 'CreateAlignedLoad(Ptr, Align, "...")' correctly, instead of converting the strin...
void push_back(const T &Elt)
virtual const TargetLowering * getTargetLowering() const
This instruction constructs a fixed permutation of two input vectors.
LLVMContext & getContext() const
All values hold a context through their type.
APInt trunc(unsigned width) const
Truncate to new width.
STATISTIC(NumFunctions, "Total number of functions")
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
An instruction for reading from memory.
unsigned getBitWidth() const
Return the number of bits in the APInt.
Represents read-only accesses to memory.
unsigned countTrailingZeros() const
Count the number of trailing zero bits.
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Legacy analysis pass which computes MemorySSA.
bool isVolatile() const
Return true if this is a load from a volatile memory location.
This is the base class for all instructions that perform data casts.
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Encapsulates MemorySSA, including all data associated with memory accesses.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
uint64_t getNumElements() const
Type * getSourceElementType() const
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Type * getType() const
All values are typed, get the type of this value.
FunctionPass * createInterleavedLoadCombinePass()
InterleavedLoadCombines Pass - This pass identifies interleaved loads and combines them into wide loa...
This class represents a no-op cast from one type to another.
const APInt & getValue() const
Return the constant as an APInt value reference.
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
virtual TargetTransformInfo getTargetTransformInfo(const Function &F)
Return a TargetTransformInfo for a given function.
Value * getOperand(unsigned i) const
Class to represent pointers.
an instruction for type-safe pointer arithmetic to access elements of arrays and structs ...
static bool runOnFunction(Function &F, bool PostInlining)
initializer< Ty > init(const Ty &Val)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
LLVM Basic Block Representation.
The instances of the Type class are immutable: once they are created, they are never changed...
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
bool isOneValue() const
Determine if this is a value of 1.
Represent the analysis usage information of a pass.
FunctionPass class - This class is used to implement most global optimizations.
Value * getPointerOperand()
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Class to represent integer types.
auto find_if(R &&Range, UnaryPredicate P) -> decltype(adl_begin(Range))
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly...
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Iterator for intrusive lists based on ilist_node.
unsigned getNumOperands() const
This is the shared class of boolean and integer constants.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Module.h This file contains the declarations for the Module class.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
bool isCommutative() const
Return true if the instruction is commutative:
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Class to represent vector types.
void initializeInterleavedLoadCombinePass(PassRegistry &)
Class for arbitrary precision integers.
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
unsigned getAlignment() const
Return the alignment of the access that is being performed.
static IntegerType * getInt32Ty(LLVMContext &C)
bool accumulateConstantOffset(const DataLayout &DL, APInt &Offset) const
Accumulate the constant address offset of this GEP if possible.
This file provides utility analysis objects describing memory locations.
StringRef getName() const
Return a constant reference to the value's name.
Type * getResultElementType() const
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
raw_ostream & operator<<(raw_ostream &OS, const APInt &I)
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
INITIALIZE_PASS_BEGIN(InterleavedLoadCombine, DEBUG_TYPE, "Combine interleaved loads into wide loads and shufflevector instructions", false, false) INITIALIZE_PASS_END(InterleavedLoadCombine
LLVM Value Representation.
FMA - Perform a * b + c with no intermediate rounding step.
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct an VectorType.
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
static const Function * getParent(const Value *V)
This class implements an extremely fast bulk output stream that can only output to a stream...
Primary interface to the complete machine description for the target machine.
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
Type * getElementType() const
StringRef - Represent a constant reference to a string, i.e.
Legacy analysis pass which computes a DominatorTree.
int64_t getIndexedOffsetInType(Type *ElemTy, ArrayRef< Value *> Indices) const
Returns the offset from the beginning of the type for the specified indices.
bool isNullValue() const
Determine if all bits are clear.
This file describes how to lower LLVM code to machine code.
const BasicBlock * getParent() const
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.