//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

using namespace llvm;

namespace {

/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
/// other element)
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
  /// Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;

  IRBuilder<> &Builder;

  /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, reference to the first indices of each interleaved-vector
  /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getModule()->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};

} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //   1. Store and load of 4-element vectors of 64 bits on AVX.
  //   2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //   1. Load of 16/32-element vectors of 8 bits on AVX.
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
    if (cast<LoadInst>(Inst)->getPointerAddressSpace())
      return false;
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // We support shuffles that represent stride 4 for 64-bit elements, with a
  // total width of WideInstSize.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
       WideInstSize == 2048))
    return true;

  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
    return true;

  return false;
}
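
// Illustrative example (hand-worked, not from the original source): a load
// of <16 x i64> split with Factor = 4 passes the first check above
// (ShuffleElemSize == 64, WideInstSize == 1024), so on AVX it is lowered
// through the 4x4 transpose path in lowerIntoOptimizedSequence().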

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  (void)VecWidth;
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Builder, Indices[i],
                                   SubVecTy->getVectorNumElements(), 0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
  Value *VecBasePtr;
  unsigned int NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements load the information
  // in the following way:
  // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  if (VecLength == 768 || VecLength == 1536) {
    Type *VecTran =
        VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
    NumLoads = NumSubVectors * (VecLength / 384);
  } else
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
  // Generate N loads of T type.
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
    DecomposedVectors.push_back(NewLoad);
  }
}
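
// Hand-traced example of the load path (illustrative, not from the original
// source): for a stride-4 load of <16 x i64> with SubVecTy = <4 x i64>, the
// code above emits IR along these lines (value names are hypothetical):
//   %base = bitcast <16 x i64>* %ptr to <4 x i64>*
//   %gep0 = getelementptr <4 x i64>, <4 x i64>* %base, i32 0
//   %v0   = load <4 x i64>, <4 x i64>* %gep0   ; alignment of the wide load
// and likewise for offsets 1..3, producing NumSubVectors = 4 narrow loads.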

// Changes the scale of the vector type: halves the number of elements and
// doubles the scalar size.
static MVT scaleVectorType(MVT VT) {
  unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
  return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
                          VT.getVectorNumElements() / 2);
}
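
// For example (illustrative): scaleVectorType(MVT::v32i8) returns
// MVT::v16i16, and scaleVectorType(MVT::v16i16) returns MVT::v8i32.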

static uint32_t Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};

// genShuffleBland - Creates a shuffle mask that blends two vectors. This
// function only works on instructions whose lanes live inside 256-bit
// registers. From the mask 'Mask' it creates a new mask 'Out' by adding an
// offset to each element: 'LowOffset' for elements taken from the first
// vector and 'HighOffset' for elements taken from the second vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror to the load, we must consider the
// element order as above. In this function we combine two kinds of shuffles:
// the first is a per-lane shuffle (a vpshufb-style pattern) and the second is
// a "blend" shuffle. By computing the shuffle on a sequence of 16 elements
// (one lane) and adding the correct offset, we create a shuffle + blend
// sequence between two shuffles.
static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
                            SmallVectorImpl<uint32_t> &Out, int LowOffset,
                            int HighOffset) {
  assert(VT.getSizeInBits() >= 256 &&
         "This function doesn't accept width smaller than 256");
  unsigned NumOfElm = VT.getVectorNumElements();
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + LowOffset);
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + HighOffset + NumOfElm);
}
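
// Hand-traced example (not from the original source): for VT = v32i8
// (NumOfElm = 32), Mask = {0, 1, 2}, LowOffset = 0 and HighOffset = 16,
// Out becomes {0, 1, 2, 48, 49, 50}: the first copy of the mask indexes the
// first vector, the second copy indexes the second vector (indices >= 32)
// shifted by HighOffset.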

// reorderSubVector returns the data to its original state, and is de facto
// the inverse of concatSubVector.

// For VecElems = 16
// Invec[0] - |0|          TransposedMatrix[0] - |0|
// Invec[1] - |1|    =>    TransposedMatrix[1] - |1|
// Invec[2] - |2|          TransposedMatrix[2] - |2|

// For VecElems = 32
// Invec[0] - |0|3|        TransposedMatrix[0] - |0|1|
// Invec[1] - |1|4|   =>   TransposedMatrix[1] - |2|3|
// Invec[2] - |2|5|        TransposedMatrix[2] - |4|5|

// For VecElems = 64
// Invec[0] - |0|3|6|9 |      TransposedMatrix[0] - |0|1|2 |3 |
// Invec[1] - |1|4|7|10|  =>  TransposedMatrix[1] - |4|5|6 |7 |
// Invec[2] - |2|5|8|11|      TransposedMatrix[2] - |8|9|10|11|

static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> Builder) {

  if (VecElems == 16) {
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(
          Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
    return;
  }

  SmallVector<uint32_t, 32> OptimizeShuf;
  Value *Temp[8];

  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32)
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
  else
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] =
          Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<uint32_t, 16> MaskLow;
  SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;

  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
  scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}
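
// Hand-computed mask values for the VF8 case above (illustrative): for
// VT = v8i16 the low unpack mask is {0, 8, 1, 9, 2, 10, 3, 11}, and scaling
// it by 2 gives MaskLowWord = {0, 1, 16, 17, 2, 3, 18, 19, ...}, which picks
// the byte pairs c0 m0 / y0 k0 alternately and yields the interleaved cmyk
// order shown in the comments above.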

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<uint32_t, 32> MaskHigh;
  SmallVector<uint32_t, 32> MaskLow;
  SmallVector<uint32_t, 32> LowHighMask[2];
  SmallVector<uint32_t, 32> MaskHighTemp;
  SmallVector<uint32_t, 32> MaskLowTemp;

  // MaskHighTemp and MaskLowTemp are built following the vpunpckhbw and
  // vpunpcklbw X86 shuffle patterns.

  createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);

  // MaskHighTemp1 and MaskLowTemp1 are built following the vpunpckhdw and
  // vpunpckldw X86 shuffle patterns.

  createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false);
  createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false);
  scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]);

  // IntrVec1Low  = c0 m0 c1 m1 ... c7 m7   | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0 k0 y1 k1 ... y7 k7   | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
  Value *IntrVec[4];

  IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6  cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14 cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31

  if (VT == MVT::v16i8) {
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
                   NumOfElm, 4, Builder);
}

// createShuffleStride returns a shuffle mask of size N.
// The shuffle pattern is as follows:
// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
//  (VF/Lane), (VF/Lane)+Stride%(VF/Lane),...,
//  (VF/Lane)+(VF*Stride/Lane)%(VF/Lane)}
// Where Lane is the # of lanes in a register:
// VectorSize = 128 => Lane = 1
// VectorSize = 256 => Lane = 2
// For example, the shuffle pattern for VF 16 with register size 256
// (-> lanes = 2) is:
// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<uint32_t> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}
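
// Hand-traced example (not from the original source): for VT = v8i8
// (VectorSize = 64, so LaneCount = 1) and Stride = 3, the loop pushes
// (i * 3) % 8 for i = 0..7, producing Mask = {0, 3, 6, 1, 4, 7, 2, 5}.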

// setGroupSize sets 'SizeInfo' to the size (number of elements) of each group
// inside a shuffle mask. A mask contains exactly 3 groups, where each group
// is a monotonically increasing sequence with stride 3.
// For example, shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
  }
}
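
// Tracing the example above (VF = 8): ceil(8 / 3.0) = 3, and the next group
// starts at (3 * 3) % 8 = 1; then ceil((8 - 1) / 3.0) = 3 with the next
// start at (3 * 3 + 1) % 8 = 2; finally ceil((8 - 2) / 3.0) = 2. Hence
// SizeInfo = {3, 3, 2}.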

// DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
// vpalignr works on lanes.
// Where Lane is the # of lanes in a register:
// VectorWide = 128 => Lane = 1
// VectorWide = 256 => Lane = 2
// For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
// For Lane = 2 the shuffle pattern is:
// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
// The Imm variable sets the offset amount. The result of the function is
// stored inside the ShuffleMask vector and is built as described at the
// beginning of this comment. AlignDirection is a boolean that indicates the
// direction of the alignment (false - align to the "right" side while true -
// align to the "left" side).
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<uint32_t> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // If i+Offset is out of this lane then we actually need the other
      // source. If Unary, the other source is the first source.
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}
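
// Hand-worked example (illustrative): for VT = v16i8, Imm = 3,
// AlignDirection = true and Unary = false, Offset = 3 and the resulting mask
// is {3, 4, ..., 15, 16, 17, 18}; the last three indices select from the
// second source, mirroring how vpalignr concatenates its two inputs.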

// concatSubVector - The function rebuilds the data to the correct expected
// order. An assumption about the shape of the matrix was made so that the
// deinterleave step can work with in-lane instructions like 'vpalignr' or
// 'vpshufb'. This function ensures that the data is built the correct way
// for those lane instructions. Each lane inside the vector is 128 bits long.
//
// The 'InVec' argument contains the data in increasing order. In InVec[0] you
// can find the first 128 bits of data. The number of different lanes inside a
// vector depends on 'VecElems'; in general, the formula is
// VecElems * type / 128. The size of the array 'InVec' is derived from, and
// equal to, 'VecElems'.

// For VecElems = 16
// Invec[0] - |0|          Vec[0] - |0|
// Invec[1] - |1|    =>    Vec[1] - |1|
// Invec[2] - |2|          Vec[2] - |2|

// For VecElems = 32
// Invec[0] - |0|1|        Vec[0] - |0|3|
// Invec[1] - |2|3|   =>   Vec[1] - |1|4|
// Invec[2] - |4|5|        Vec[2] - |2|5|

// For VecElems = 64
// Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
// Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
// Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|

static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> Builder) {
  if (VecElems == 16) {
    for (int i = 0; i < 3; i++)
      Vec[i] = InVec[i];
    return;
  }

  for (unsigned j = 0; j < VecElems / 32; j++)
    for (int i = 0; i < 3; i++)
      Vec[i + j * 3] = Builder.CreateShuffleVector(
          InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32));

  if (VecElems == 32)
    return;

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
}

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[2];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;
  SmallVector<uint32_t, 3> GroupSize;
  Value *Vec[6], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  concatSubVector(Vec, InVec, VecElems, Builder);
  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(
        Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  Value *TempVec = Builder.CreateShuffleVector(
      Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(
      Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}

// group2Shuffle reorders the shuffle stride back into contiguous order.
// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13}
// => MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
                          SmallVectorImpl<uint32_t> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the index of the different groups.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }
  // According to the index, compute the convert mask.
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}
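
// Tracing a small case (one 128-bit lane, VF = 8, Mask = GroupSize =
// {3, 3, 2}): the first loop records the group starts IndexGroup = {0, 3, 6},
// and the second loop emits {0, 3, 6, 1, 4, 7, 2, 5}, scattering each group
// back into contiguous order.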

void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 3> GroupSize;
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[3];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;

  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  Vec[0] = Builder.CreateShuffleVector(
      InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(
      InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
  Vec[2] = InVec[2];

  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

  unsigned NumOfElm = VT.getVectorNumElements();
  group2Shuffle(VT, GroupSize, VPShuf);
  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  assert(Matrix.size() == 4 && "Invalid matrix size");
  TransposedMatrix.resize(4);

  // dst = src1[0,1],src2[0,1]
  uint32_t IntMask1[] = {0, 1, 4, 5};
  ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[2,3],src2[2,3]
  uint32_t IntMask2[] = {2, 3, 6, 7};
  Mask = makeArrayRef(IntMask2, 4);
  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[0],src2[0],src1[2],src2[2]
  uint32_t IntMask3[] = {0, 4, 2, 6};
  Mask = makeArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1],src2[1],src1[3],src2[3]
  uint32_t IntMask4[] = {1, 5, 3, 7};
  Mask = makeArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}
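
// In IR terms this emits eight shufflevector instructions; an illustrative
// sketch for four <4 x i64> inputs %m0..%m3 (value names are hypothetical):
//   %i1 = shufflevector <4 x i64> %m0, <4 x i64> %m2,
//                       <4 x i32> <i32 0, i32 1, i32 4, i32 5>
//   %i2 = shufflevector <4 x i64> %m1, <4 x i64> %m3,
//                       <4 x i32> <i32 0, i32 1, i32 4, i32 5>
//   %t0 = shufflevector <4 x i64> %i1, <4 x i64> %i2,
//                       <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// which realizes the first row of the 4x4 transpose documented above.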

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  VectorType *ShuffleTy = Shuffles[0]->getType();

  if (isa<LoadInst>(Inst)) {
    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    Type *ShuffleEltTy = Inst->getType();
    unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.

    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
      transpose_4x4(DecomposedVectors, TransposedVectors);
      break;
    case 8:
    case 16:
    case 32:
    case 64:
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);
      break;
    }

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
  unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //      vectors.
  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
  case 64:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
                             SI->getAlignment());

  return true;
}

// Lower interleaved load(s) into target specific instructions/
// intrinsics. The lowering sequence varies depending on the vector-types,
// factor, number of shuffles and ISA.
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  // Create an interleaved access group.
  IRBuilder<> Builder(LI);
  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  // Holds the indices of SVI that correspond to the starting index of each
  // interleaved shuffle.
  SmallVector<unsigned, 4> Indices;
  auto Mask = SVI->getShuffleMask();
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);

  // Create an interleaved access group.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}