50 class X86InterleavedAccessGroup {
62 const unsigned Factor;
93 unsigned NumSubVecElems);
98 unsigned NumSubVecElems);
101 unsigned NumSubVecElems);
115 : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
116 DL(Inst->getModule()->getDataLayout()), Builder(B) {}
120 bool isSupported()
const;
124 bool lowerIntoOptimizedSequence();
129 bool X86InterleavedAccessGroup::isSupported()
const {
130 VectorType *ShuffleVecTy = Shuffles[0]->getType();
132 unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
133 unsigned WideInstSize;
141 if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
144 if (isa<LoadInst>(Inst)) {
145 WideInstSize = DL.getTypeSizeInBits(Inst->getType());
146 if (cast<LoadInst>(Inst)->getPointerAddressSpace())
149 WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->
getType());
153 if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
156 if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
157 (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
158 WideInstSize == 2048))
161 if (ShuffleElemSize == 8 && Factor == 3 &&
162 (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
168 void X86InterleavedAccessGroup::decompose(
171 assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
172 "Expected Load or Shuffle");
177 DL.getTypeSizeInBits(VecWidth) >=
178 DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
179 "Invalid Inst-size!!!");
181 if (
auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
182 Value *Op0 = SVI->getOperand(0);
183 Value *Op1 = SVI->getOperand(1);
186 for (
unsigned i = 0; i < NumSubVectors; ++i)
188 cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
196 LoadInst *LI = cast<LoadInst>(VecInst);
199 unsigned int NumLoads = NumSubVectors;
203 unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
204 if (VecLength == 768 || VecLength == 1536) {
208 NumLoads = NumSubVectors * (VecLength / 384);
212 for (
unsigned i = 0; i < NumLoads; i++) {
214 Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
216 Builder.CreateAlignedLoad(NewBasePtr, LI->
getAlignment());
230 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
231 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
232 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
233 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 };
255 "This function doesn't accept width smaller then 256");
257 for (
unsigned i = 0; i < Mask.
size(); i++)
259 for (
unsigned i = 0; i < Mask.
size(); i++)
260 Out.
push_back(Mask[i] + HighOffset + NumOfElm);
283 unsigned VecElems,
unsigned Stride,
286 if (VecElems == 16) {
287 for (
unsigned i = 0; i < Stride; i++)
296 for (
unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
298 (i + 1) / Stride * 16);
300 Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
301 OptimizeShuf.
clear();
304 if (VecElems == 32) {
309 for (
unsigned i = 0; i < Stride; i++)
310 TransposedMatrix[i] =
314 void X86InterleavedAccessGroup::interleave8bitStride4VF8(
324 TransposedMatrix.
resize(2);
329 for (
unsigned i = 0; i < 8; ++i) {
334 createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1,
true,
false);
335 createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1,
false,
false);
336 scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
337 scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
341 Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
343 Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
348 TransposedMatrix[0] =
349 Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
350 TransposedMatrix[1] =
351 Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
354 void X86InterleavedAccessGroup::interleave8bitStride4(
366 TransposedMatrix.
resize(4);
376 createUnpackShuffleMask<uint32_t>(VT, MaskLow,
true,
false);
377 createUnpackShuffleMask<uint32_t>(VT, MaskHigh,
false,
false);
382 createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp,
true,
false);
383 createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp,
false,
false);
384 scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]);
385 scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]);
393 IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
394 IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
395 IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
396 IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);
404 for (
int i = 0; i < 4; i++)
405 VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
419 NumOfElm, 4, Builder);
436 int LaneCount =
std::max(VectorSize / 128, 1);
437 for (
int Lane = 0; Lane < LaneCount; Lane++)
438 for (
int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
439 Mask.
push_back((i * Stride) % LaneSize + LaneSize * Lane);
449 for (
int i = 0, FirstGroupElement = 0; i < 3; i++) {
450 int GroupSize =
std::ceil((VF - FirstGroupElement) / 3.0);
452 FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
471 bool AlignDirection =
true,
bool Unary =
false) {
474 unsigned NumLaneElts = NumElts / NumLanes;
476 Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
479 for (
unsigned l = 0; l != NumElts; l += NumLaneElts) {
480 for (
unsigned i = 0; i != NumLaneElts; ++i) {
484 if (Base >= NumLaneElts)
485 Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
520 if (VecElems == 16) {
521 for (
int i = 0; i < 3; i++)
526 for (
unsigned j = 0; j < VecElems / 32; j++)
527 for (
int i = 0; i < 3; i++)
529 InVec[j * 6 + i], InVec[j * 6 + i + 3],
makeArrayRef(Concat, 32));
534 for (
int i = 0; i < 3; i++)
538 void X86InterleavedAccessGroup::deinterleave8bitStride3(
546 TransposedMatrix.
resize(3);
552 Value *Vec[6], *TempVector[3];
559 for (
int i = 0; i < 2; i++)
570 for (
int i = 0; i < 3; i++)
571 Vec[i] = Builder.CreateShuffleVector(
578 for (
int i = 0; i < 3; i++)
580 Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);
586 for (
int i = 0; i < 3; i++)
587 Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
594 Value *TempVec = Builder.CreateShuffleVector(
596 TransposedMatrix[0] = Builder.CreateShuffleVector(
598 TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
599 TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
607 int IndexGroup[3] = {0, 0, 0};
612 int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
613 for (
int i = 0; i < 3; i++) {
614 IndexGroup[(Index * 3) % (VF / Lane)] =
Index;
618 for (
int i = 0; i < VF / Lane; i++) {
624 void X86InterleavedAccessGroup::interleave8bitStride3(
632 TransposedMatrix.
resize(3);
639 Value *Vec[3], *TempVector[3];
644 for (
int i = 0; i < 3; i++)
654 Vec[0] = Builder.CreateShuffleVector(
656 Vec[1] = Builder.CreateShuffleVector(
664 for (
int i = 0; i < 3; i++)
666 Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);
672 for (
int i = 0; i < 3; i++)
673 Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
685 void X86InterleavedAccessGroup::transpose_4x4(
688 assert(Matrix.
size() == 4 &&
"Invalid matrix size");
689 TransposedMatrix.
resize(4);
694 Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
695 Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
700 Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
701 Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
706 TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
707 TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
712 TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
713 TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
718 bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
721 VectorType *ShuffleTy = Shuffles[0]->getType();
723 if (isa<LoadInst>(Inst)) {
725 decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
727 Type *ShuffleEltTy = Inst->getType();
733 switch (NumSubVecElems) {
737 transpose_4x4(DecomposedVectors, TransposedVectors);
743 deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
750 for (
unsigned i = 0, e = Shuffles.size(); i < e; ++i)
751 Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
762 decompose(Shuffles[0], Factor,
VectorType::get(ShuffleEltTy, NumSubVecElems),
767 switch (NumSubVecElems) {
769 transpose_4x4(DecomposedVectors, TransposedVectors);
772 interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
778 interleave8bitStride4(DecomposedVectors, TransposedVectors,
781 interleave8bitStride3(DecomposedVectors, TransposedVectors,
806 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
807 "Invalid interleave factor");
808 assert(!Shuffles.
empty() &&
"Empty shufflevector input");
810 "Unmatched number of shufflevectors and indices");
814 X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
817 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
822 unsigned Factor)
const {
823 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
824 "Invalid interleave factor");
827 "Invalid interleaved store");
833 for (
unsigned i = 0; i < Factor; i++)
834 Indices.push_back(
Mask[i]);
840 X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
843 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
static void setGroupSize(MVT VT, SmallVectorImpl< uint32_t > &SizeInfo)
Type * getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
A parsed version of the target data layout string, and methods for querying it. ...
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
This class represents lattice values for constants.
static MVT getVectorVT(MVT VT, unsigned NumElements)
void push_back(const T &Elt)
unsigned getVectorNumElements() const
This instruction constructs a fixed permutation of two input vectors.
LLVMContext & getContext() const
All values hold a context through their type.
An instruction for reading from memory.
static void genShuffleBland(MVT VT, ArrayRef< uint32_t > Mask, SmallVectorImpl< uint32_t > &Out, int LowOffset, int HighOffset)
bool isVectorTy() const
True if this is an instance of VectorType.
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst *> Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower interleaved load(s) into target specific instructions/intrinsics.
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
unsigned getSizeInBits() const
Constant * createSequentialMask(IRBuilder<> &Builder, unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
Type * getType() const
All values are typed, get the type of this value.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
An instruction for storing to memory.
MVT getVectorElementType() const
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Value * concatenateVectors(IRBuilder<> &Builder, ArrayRef< Value *> Vecs)
Concatenate a list of vectors.
The instances of the Type class are immutable: once they are created, they are never changed...
unsigned getScalarSizeInBits() const
size_t size() const
size - Get the array size.
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Value * getPointerOperand()
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
static wasm::ValType getType(const TargetRegisterClass *RC)
static void concatSubVector(Value **Vec, ArrayRef< Instruction *> InVec, unsigned VecElems, IRBuilder<> Builder)
static void reorderSubVector(MVT VT, SmallVectorImpl< Value *> &TransposedMatrix, ArrayRef< Value *> Vec, ArrayRef< uint32_t > VPShuf, unsigned VecElems, unsigned Stride, IRBuilder<> Builder)
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower interleaved store(s) into target specific instructions/intrinsics.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Module.h This file contains the declarations for the Module class.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static MVT scaleVectorType(MVT VT)
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
unsigned getVectorNumElements() const
Class to represent vector types.
unsigned getAlignment() const
Return the alignment of the access that is being performed.
static void group2Shuffle(MVT VT, SmallVectorImpl< uint32_t > &Mask, SmallVectorImpl< uint32_t > &Output)
unsigned getAlignment() const
Return the alignment of the access that is being performed.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
static void createShuffleStride(MVT VT, int Stride, SmallVectorImpl< uint32_t > &Mask)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM Value Representation.
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct a VectorType.
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
OutputIt copy(R &&Range, OutputIt Out)
VectorType * getType() const
Overload to return most specific vector type.
Value * getPointerOperand()
static IntegerType * getInt8Ty(LLVMContext &C)
bool empty() const
empty - Check if the array is empty.