X86TargetTransformInfo.cpp
1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// \file
10 /// This file implements a TargetTransformInfo analysis pass specific to the
11 /// X86 target machine. It uses the target's detailed information to provide
12 /// more precise answers to certain TTI queries, while letting the target
13 /// independent and default TTI implementations handle the rest.
14 ///
15 //===----------------------------------------------------------------------===//
16 /// A note about the Cost Model numbers used below:
17 /// the numbers correspond to some "generic" X86 CPU rather than to a
18 /// concrete CPU model. Usually the numbers correspond to the CPU where the
19 /// feature first appeared. For example, if we check Subtarget.hasSSE42() in
20 /// the lookups below, the cost is based on Nehalem, as that was the first CPU
21 /// to support that feature level and thus most likely has the worst-case cost.
22 /// Some examples of other technologies/CPUs:
23 /// SSE 3 - Pentium4 / Athlon64
24 /// SSE 4.1 - Penryn
25 /// SSE 4.2 - Nehalem
26 /// AVX - Sandy Bridge
27 /// AVX2 - Haswell
28 /// AVX-512 - Xeon Phi / Skylake
29 /// And some examples of instruction target dependent costs (latency)
30 ///                     divss    sqrtss    rsqrtss
31 ///   AMD K7            11-16    19        3
32 ///   Piledriver        9-24     13-15     5
33 ///   Jaguar            14       16        2
34 ///   Pentium II,III    18       30        2
35 ///   Nehalem           7-14     7-18      3
36 ///   Haswell           10-13    11        5
37 /// TODO: Develop and implement the target dependent cost model and
38 /// specialize cost numbers for different Cost Model Targets such as throughput,
39 /// code size, latency and uop count.
40 //===----------------------------------------------------------------------===//
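//
// A minimal sketch of how a client pass typically consumes these numbers
// (illustrative only; the names TM, F, Ctx and V4F32 are hypothetical
// stand-ins for whatever the caller already has in hand):
//
//   TargetTransformInfo TTI = TM.getTargetTransformInfo(F);
//   Type *V4F32 = VectorType::get(Type::getFloatTy(Ctx), 4); // <4 x float>
//   int Cost = TTI.getArithmeticInstrCost(Instruction::FDiv, V4F32);
//   // On an SSE4.2 subtarget the SSE42 table below yields 14 (divps,
//   // Nehalem), scaled by the legalization factor LT.first.
//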
41 
42 #include "X86TargetTransformInfo.h"
43 #include "llvm/Analysis/TargetTransformInfo.h"
44 #include "llvm/CodeGen/BasicTTIImpl.h"
45 #include "llvm/CodeGen/CostTable.h"
46 #include "llvm/CodeGen/TargetLowering.h"
47 #include "llvm/IR/IntrinsicInst.h"
48 #include "llvm/Support/Debug.h"
49 
50 using namespace llvm;
51 
52 #define DEBUG_TYPE "x86tti"
53 
54 //===----------------------------------------------------------------------===//
55 //
56 // X86 cost model.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 TargetTransformInfo::PopcntSupportKind
61 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
62  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
63  // TODO: Currently the __builtin_popcount() implementation using SSE3
64  // instructions is inefficient. Once the problem is fixed, we should
65  // call ST->hasSSE3() instead of ST->hasPOPCNT().
66  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
67 }
68 
69 llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
70   TargetTransformInfo::CacheLevel Level) const {
71  switch (Level) {
72  case TargetTransformInfo::CacheLevel::L1D:
73  // - Penryn
74  // - Nehalem
75  // - Westmere
76  // - Sandy Bridge
77  // - Ivy Bridge
78  // - Haswell
79  // - Broadwell
80  // - Skylake
81  // - Kabylake
82  return 32 * 1024; // 32 KByte
83  case TargetTransformInfo::CacheLevel::L2D:
84  //   - Penryn
85  // - Nehalem
86  // - Westmere
87  // - Sandy Bridge
88  // - Ivy Bridge
89  // - Haswell
90  // - Broadwell
91  // - Skylake
92  // - Kabylake
93  return 256 * 1024; // 256 KByte
94  }
95 
96  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
97 }
98 
99 llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
100   TargetTransformInfo::CacheLevel Level) const {
101  //   - Penryn
102  // - Nehalem
103  // - Westmere
104  // - Sandy Bridge
105  // - Ivy Bridge
106  // - Haswell
107  // - Broadwell
108  // - Skylake
109  // - Kabylake
110  switch (Level) {
111  case TargetTransformInfo::CacheLevel::L1D:
112  LLVM_FALLTHROUGH;
113  case TargetTransformInfo::CacheLevel::L2D:
114  return 8;
115  }
116 
117  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
118 }
119 
120 unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
121  if (Vector && !ST->hasSSE1())
122  return 0;
123 
124  if (ST->is64Bit()) {
125  if (Vector && ST->hasAVX512())
126  return 32;
127  return 16;
128  }
129  return 8;
130 }
131 
132 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
133  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
134  if (Vector) {
135  if (ST->hasAVX512() && PreferVectorWidth >= 512)
136  return 512;
137  if (ST->hasAVX() && PreferVectorWidth >= 256)
138  return 256;
139  if (ST->hasSSE1() && PreferVectorWidth >= 128)
140  return 128;
141  return 0;
142  }
143 
144  if (ST->is64Bit())
145  return 64;
146 
147  return 32;
148 }
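// For example (derived from the two functions above, assuming the default
// preferred vector width for the subtarget):
//   64-bit + AVX-512 : 32 vector registers, 512-bit wide
//   64-bit + AVX2    : 16 vector registers, 256-bit wide
//   32-bit + SSE2    :  8 vector registers, 128-bit wide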
149 
150 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
151  return getRegisterBitWidth(true);
152 }
153 
154 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
155  // If the loop will not be vectorized, don't interleave the loop.
156  // Let the regular unroller handle it instead, which saves the overflow
157  // check and memory check costs.
158  if (VF == 1)
159  return 1;
160 
161  if (ST->isAtom())
162  return 1;
163 
164  // Sandybridge and Haswell have multiple execution ports and pipelined
165  // vector units.
166  if (ST->hasAVX())
167  return 4;
168 
169  return 2;
170 }
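// Illustration: with AVX enabled and VF = 4, the loop vectorizer may emit up
// to 4 interleaved copies of the vector loop body to hide latency; on Atom,
// or when VF == 1, interleaving is effectively disabled by returning 1.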
171 
172 int X86TTIImpl::getArithmeticInstrCost(
173  unsigned Opcode, Type *Ty,
174  TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
175  TTI::OperandValueProperties Opd1PropInfo,
176  TTI::OperandValueProperties Opd2PropInfo,
177  ArrayRef<const Value *> Args) {
178  // Legalize the type.
179  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
180 
181  int ISD = TLI->InstructionOpcodeToISD(Opcode);
182  assert(ISD && "Invalid opcode");
183 
184  static const CostTblEntry GLMCostTable[] = {
185  { ISD::FDIV, MVT::f32, 18 }, // divss
186  { ISD::FDIV, MVT::v4f32, 35 }, // divps
187  { ISD::FDIV, MVT::f64, 33 }, // divsd
188  { ISD::FDIV, MVT::v2f64, 65 }, // divpd
189  };
190 
191  if (ST->isGLM())
192  if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
193  LT.second))
194  return LT.first * Entry->Cost;
195 
196  static const CostTblEntry SLMCostTable[] = {
197  { ISD::MUL, MVT::v4i32, 11 }, // pmulld
198  { ISD::MUL, MVT::v8i16, 2 }, // pmullw
199  { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
200  { ISD::FMUL, MVT::f64, 2 }, // mulsd
201  { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
202  { ISD::FMUL, MVT::v4f32, 2 }, // mulps
203  { ISD::FDIV, MVT::f32, 17 }, // divss
204  { ISD::FDIV, MVT::v4f32, 39 }, // divps
205  { ISD::FDIV, MVT::f64, 32 }, // divsd
206  { ISD::FDIV, MVT::v2f64, 69 }, // divpd
207  { ISD::FADD, MVT::v2f64, 2 }, // addpd
208  { ISD::FSUB, MVT::v2f64, 2 }, // subpd
209  // v2i64/v4i64 mul is custom lowered as a series of long
210  // multiplies(3), shifts(3) and adds(2);
211  // slm muldq version throughput is 2 and addq throughput 4,
212  // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
213  // 2X4 (addq throughput) = 17
214  { ISD::MUL, MVT::v2i64, 17 },
215  // slm addq\subq throughput is 4
216  { ISD::ADD, MVT::v2i64, 4 },
217  { ISD::SUB, MVT::v2i64, 4 },
218  };
219 
220  if (ST->isSLM()) {
221  if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
222  // Check if the operands can be shrunk into a smaller datatype.
223  bool Op1Signed = false;
224  unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
225  bool Op2Signed = false;
226  unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
227 
228  bool signedMode = Op1Signed | Op2Signed;
229  unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
230 
231  if (OpMinSize <= 7)
232  return LT.first * 3; // pmullw/sext
233  if (!signedMode && OpMinSize <= 8)
234  return LT.first * 3; // pmullw/zext
235  if (OpMinSize <= 15)
236  return LT.first * 5; // pmullw/pmulhw/pshuf
237  if (!signedMode && OpMinSize <= 16)
238  return LT.first * 5; // pmullw/pmulhw/pshuf
239  }
240 
241  if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
242  LT.second)) {
243  return LT.first * Entry->Cost;
244  }
245  }
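  // Worked example for the SLM operand-shrinking heuristic above (hypothetical
  // IR): for `mul <4 x i32> %a, %b` where minRequiredElementSize shows both
  // operands fit in 7 signed (or 8 unsigned) bits, the cost drops to 3
  // (pmullw plus extends); if they fit in 15 signed (or 16 unsigned) bits it
  // is 5 (pmullw/pmulhw/pshuf), instead of the 11 charged for a full 32-bit
  // pmulld in the SLM table above.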
246 
247  if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
248  ISD == ISD::UREM) &&
249  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
250  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
251  Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
252  if (ISD == ISD::SDIV || ISD == ISD::SREM) {
253  // On X86, vector signed division by constants power-of-two are
254  // normally expanded to the sequence SRA + SRL + ADD + SRA.
255  // The OperandValue properties may not be the same as that of the previous
256  // operation; conservatively assume OP_None.
257  int Cost =
258  2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
259  TargetTransformInfo::OP_None,
260  TargetTransformInfo::OP_None);
261  Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
262  TargetTransformInfo::OP_None,
263  TargetTransformInfo::OP_None);
264  Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
265  TargetTransformInfo::OP_None,
266  TargetTransformInfo::OP_None);
267 
268  if (ISD == ISD::SREM) {
269  // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
270  Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
271  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
272  }
273 
274  return Cost;
275  }
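  // Worked example: a vector SDIV by a uniform power-of-two constant is
  // charged above as 2*AShr + LShr + Add; with the SSE2 uniform-shift costs
  // below (1 each for v4i32) that is roughly 4 per legalized vector, times
  // LT.first if the type had to be split.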
276 
277  // Vector unsigned division/remainder will be simplified to shifts/masks.
278  if (ISD == ISD::UDIV)
279  return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
280  TargetTransformInfo::OP_None,
281  TargetTransformInfo::OP_None);
282 
283  if (ISD == ISD::UREM)
284  return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
285  TargetTransformInfo::OP_None,
286  TargetTransformInfo::OP_None);
287  }
288 
289  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
290  { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
291  { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
292  { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
293  };
294 
295  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
296  ST->hasBWI()) {
297  if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
298  LT.second))
299  return LT.first * Entry->Cost;
300  }
301 
302  static const CostTblEntry AVX512UniformConstCostTable[] = {
303  { ISD::SRA, MVT::v2i64, 1 },
304  { ISD::SRA, MVT::v4i64, 1 },
305  { ISD::SRA, MVT::v8i64, 1 },
306  };
307 
308  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
309  ST->hasAVX512()) {
310  if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
311  LT.second))
312  return LT.first * Entry->Cost;
313  }
314 
315  static const CostTblEntry AVX2UniformConstCostTable[] = {
316  { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
317  { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
318  { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
319 
320  { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
321  };
322 
323  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
324  ST->hasAVX2()) {
325  if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
326  LT.second))
327  return LT.first * Entry->Cost;
328  }
329 
330  static const CostTblEntry SSE2UniformConstCostTable[] = {
331  { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
332  { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
333  { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
334 
335  { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
336  { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
337  { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
338  };
339 
340  // XOP has faster vXi8 shifts.
341  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
342  ST->hasSSE2() && !ST->hasXOP()) {
343  if (const auto *Entry =
344  CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
345  return LT.first * Entry->Cost;
346  }
347 
348  static const CostTblEntry AVX512BWConstCostTable[] = {
349  { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
350  { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
351  { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
352  { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
353  { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
354  { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
355  { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
356  { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
357  };
358 
359  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
360  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
361  ST->hasBWI()) {
362  if (const auto *Entry =
363  CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
364  return LT.first * Entry->Cost;
365  }
366 
367  static const CostTblEntry AVX512ConstCostTable[] = {
368  { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
369  { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
370  { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
371  { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
372  };
373 
374  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
375  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
376  ST->hasAVX512()) {
377  if (const auto *Entry =
378  CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
379  return LT.first * Entry->Cost;
380  }
381 
382  static const CostTblEntry AVX2ConstCostTable[] = {
383  { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
384  { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
385  { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
386  { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
387  { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
388  { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
389  { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
390  { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
391  { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
392  { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
393  { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
394  { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
395  };
396 
397  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
398  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
399  ST->hasAVX2()) {
400  if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
401  return LT.first * Entry->Cost;
402  }
403 
404  static const CostTblEntry SSE2ConstCostTable[] = {
405  { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
406  { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
407  { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
408  { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
409  { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
410  { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
411  { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
412  { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
413  { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
414  { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
415  { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
416  { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
417  { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
418  { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
419  { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
420  { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
421  { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
422  { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
423  { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
424  { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
425  { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
426  { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
427  { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
428  { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
429  };
430 
431  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
432  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
433  ST->hasSSE2()) {
434  // pmuldq sequence.
435  if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
436  return LT.first * 32;
437  if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
438  return LT.first * 38;
439  if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
440  return LT.first * 15;
441  if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
442  return LT.first * 20;
443 
444  if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
445  return LT.first * Entry->Cost;
446  }
447 
448  static const CostTblEntry AVX2UniformCostTable[] = {
449  // Uniform splats are cheaper for the following instructions.
450  { ISD::SHL, MVT::v16i16, 1 }, // psllw.
451  { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
452  { ISD::SRA, MVT::v16i16, 1 }, // psraw.
453  };
454 
455  if (ST->hasAVX2() &&
456  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
457  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
458  if (const auto *Entry =
459  CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
460  return LT.first * Entry->Cost;
461  }
462 
463  static const CostTblEntry SSE2UniformCostTable[] = {
464  // Uniform splats are cheaper for the following instructions.
465  { ISD::SHL, MVT::v8i16, 1 }, // psllw.
466  { ISD::SHL, MVT::v4i32, 1 }, // pslld
467  { ISD::SHL, MVT::v2i64, 1 }, // psllq.
468 
469  { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
470  { ISD::SRL, MVT::v4i32, 1 }, // psrld.
471  { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
472 
473  { ISD::SRA, MVT::v8i16, 1 }, // psraw.
474  { ISD::SRA, MVT::v4i32, 1 }, // psrad.
475  };
476 
477  if (ST->hasSSE2() &&
478  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
479  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
480  if (const auto *Entry =
481  CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
482  return LT.first * Entry->Cost;
483  }
484 
485  static const CostTblEntry AVX512DQCostTable[] = {
486  { ISD::MUL, MVT::v2i64, 1 },
487  { ISD::MUL, MVT::v4i64, 1 },
488  { ISD::MUL, MVT::v8i64, 1 }
489  };
490 
491  // Look for AVX512DQ lowering tricks for custom cases.
492  if (ST->hasDQI())
493  if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
494  return LT.first * Entry->Cost;
495 
496  static const CostTblEntry AVX512BWCostTable[] = {
497  { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
498  { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
499  { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
500 
501  { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
502  { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
503  { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
504 
505  { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
506  { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
507  { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
508 
509  { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
510  { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
511  { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
512 
513  { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
514  { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
515  { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
516  };
517 
518  // Look for AVX512BW lowering tricks for custom cases.
519  if (ST->hasBWI())
520  if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
521  return LT.first * Entry->Cost;
522 
523  static const CostTblEntry AVX512CostTable[] = {
524  { ISD::SHL, MVT::v16i32, 1 },
525  { ISD::SRL, MVT::v16i32, 1 },
526  { ISD::SRA, MVT::v16i32, 1 },
527 
528  { ISD::SHL, MVT::v8i64, 1 },
529  { ISD::SRL, MVT::v8i64, 1 },
530 
531  { ISD::SRA, MVT::v2i64, 1 },
532  { ISD::SRA, MVT::v4i64, 1 },
533  { ISD::SRA, MVT::v8i64, 1 },
534 
535  { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
536  { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
537  { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
538  { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
539  { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
540  { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
541 
542  { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
543  { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
544  { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
545 
546  { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
547  { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
548  { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
549  };
550 
551  if (ST->hasAVX512())
552  if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
553  return LT.first * Entry->Cost;
554 
555  static const CostTblEntry AVX2ShiftCostTable[] = {
556  // Shifts on v4i64/v8i32 are legal on AVX2 even though we declare them as
557  // custom, so that we can detect the cases where the shift amount is a scalar.
558  { ISD::SHL, MVT::v4i32, 1 },
559  { ISD::SRL, MVT::v4i32, 1 },
560  { ISD::SRA, MVT::v4i32, 1 },
561  { ISD::SHL, MVT::v8i32, 1 },
562  { ISD::SRL, MVT::v8i32, 1 },
563  { ISD::SRA, MVT::v8i32, 1 },
564  { ISD::SHL, MVT::v2i64, 1 },
565  { ISD::SRL, MVT::v2i64, 1 },
566  { ISD::SHL, MVT::v4i64, 1 },
567  { ISD::SRL, MVT::v4i64, 1 },
568  };
569 
570  // Look for AVX2 lowering tricks.
571  if (ST->hasAVX2()) {
572  if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
573  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
574  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
575  // On AVX2, a packed v16i16 shift left by a constant build_vector
576  // is lowered into a vector multiply (vpmullw).
577  return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
578  TargetTransformInfo::OP_None,
579  TargetTransformInfo::OP_None);
580 
581  if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
582  return LT.first * Entry->Cost;
583  }
584 
585  static const CostTblEntry XOPShiftCostTable[] = {
586  // 128bit shifts take 1cy, but right shifts require negation beforehand.
587  { ISD::SHL, MVT::v16i8, 1 },
588  { ISD::SRL, MVT::v16i8, 2 },
589  { ISD::SRA, MVT::v16i8, 2 },
590  { ISD::SHL, MVT::v8i16, 1 },
591  { ISD::SRL, MVT::v8i16, 2 },
592  { ISD::SRA, MVT::v8i16, 2 },
593  { ISD::SHL, MVT::v4i32, 1 },
594  { ISD::SRL, MVT::v4i32, 2 },
595  { ISD::SRA, MVT::v4i32, 2 },
596  { ISD::SHL, MVT::v2i64, 1 },
597  { ISD::SRL, MVT::v2i64, 2 },
598  { ISD::SRA, MVT::v2i64, 2 },
599  // 256bit shifts require splitting if AVX2 didn't catch them above.
600  { ISD::SHL, MVT::v32i8, 2+2 },
601  { ISD::SRL, MVT::v32i8, 4+2 },
602  { ISD::SRA, MVT::v32i8, 4+2 },
603  { ISD::SHL, MVT::v16i16, 2+2 },
604  { ISD::SRL, MVT::v16i16, 4+2 },
605  { ISD::SRA, MVT::v16i16, 4+2 },
606  { ISD::SHL, MVT::v8i32, 2+2 },
607  { ISD::SRL, MVT::v8i32, 4+2 },
608  { ISD::SRA, MVT::v8i32, 4+2 },
609  { ISD::SHL, MVT::v4i64, 2+2 },
610  { ISD::SRL, MVT::v4i64, 4+2 },
611  { ISD::SRA, MVT::v4i64, 4+2 },
612  };
613 
614  // Look for XOP lowering tricks.
615  if (ST->hasXOP()) {
616  // If the right shift is constant then we'll fold the negation so
617  // it's as cheap as a left shift.
618  int ShiftISD = ISD;
619  if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
620  (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
621  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
622  ShiftISD = ISD::SHL;
623  if (const auto *Entry =
624  CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
625  return LT.first * Entry->Cost;
626  }
627 
628  static const CostTblEntry SSE2UniformShiftCostTable[] = {
629  // Uniform splats are cheaper for the following instructions.
630  { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
631  { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
632  { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
633 
634  { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
635  { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
636  { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
637 
638  { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
639  { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
640  { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
641  { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
642  };
643 
644  if (ST->hasSSE2() &&
645  ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
646  (Op2Info == TargetTransformInfo::OK_UniformValue))) {
647 
648  // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
649  if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
650  return LT.first * 4; // 2*psrad + shuffle.
651 
652  if (const auto *Entry =
653  CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
654  return LT.first * Entry->Cost;
655  }
656 
657  if (ISD == ISD::SHL &&
658  Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
659  MVT VT = LT.second;
660  // A vector shift left by a non-uniform constant can be lowered
661  // into a vector multiply.
662  if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
663  ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
664  ISD = ISD::MUL;
665  }
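  // Illustration of the conversion above (hypothetical IR):
  // `shl <8 x i16> %x, <i16 1, i16 2, ...>` with a non-uniform constant
  // amount is costed as a multiply, because the backend lowers it to pmullw
  // with the expanded multiplier <i16 2, i16 4, ...> rather than to a
  // per-lane shift sequence.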
666 
667  static const CostTblEntry AVX2CostTable[] = {
668  { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
669  { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
670 
671  { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
672  { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
673 
674  { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
675  { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
676  { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
677  { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
678 
679  { ISD::SUB, MVT::v32i8, 1 }, // psubb
680  { ISD::ADD, MVT::v32i8, 1 }, // paddb
681  { ISD::SUB, MVT::v16i16, 1 }, // psubw
682  { ISD::ADD, MVT::v16i16, 1 }, // paddw
683  { ISD::SUB, MVT::v8i32, 1 }, // psubd
684  { ISD::ADD, MVT::v8i32, 1 }, // paddd
685  { ISD::SUB, MVT::v4i64, 1 }, // psubq
686  { ISD::ADD, MVT::v4i64, 1 }, // paddq
687 
688  { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
689  { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
690  { ISD::MUL, MVT::v16i16, 1 }, // pmullw
691  { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
692  { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
693 
694  { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
695  { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
696  { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
697  { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
698  { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
699  { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
700 
701  { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
702  { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
703  { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
704  { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
705  { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
706  { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
707  };
708 
709  // Look for AVX2 lowering tricks for custom cases.
710  if (ST->hasAVX2())
711  if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
712  return LT.first * Entry->Cost;
713 
714  static const CostTblEntry AVX1CostTable[] = {
715  // We don't have to scalarize unsupported ops. We can issue two half-sized
716  // operations and we only need to extract the upper YMM half.
717  // Two ops + 1 extract + 1 insert = 4.
718  { ISD::MUL, MVT::v16i16, 4 },
719  { ISD::MUL, MVT::v8i32, 4 },
720  { ISD::SUB, MVT::v32i8, 4 },
721  { ISD::ADD, MVT::v32i8, 4 },
722  { ISD::SUB, MVT::v16i16, 4 },
723  { ISD::ADD, MVT::v16i16, 4 },
724  { ISD::SUB, MVT::v8i32, 4 },
725  { ISD::ADD, MVT::v8i32, 4 },
726  { ISD::SUB, MVT::v4i64, 4 },
727  { ISD::ADD, MVT::v4i64, 4 },
728 
729  // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
730  // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
731  // Because we believe v4i64 to be a legal type, we must also include the
732  // extract+insert in the cost table. Therefore, the cost here is 18
733  // instead of 8.
734  { ISD::MUL, MVT::v4i64, 18 },
735 
736  { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
737 
738  { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
739  { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
740  { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
741  { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
742  { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
743  { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
744  };
745 
746  if (ST->hasAVX())
747  if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
748  return LT.first * Entry->Cost;
749 
750  static const CostTblEntry SSE42CostTable[] = {
751  { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
752  { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
753  { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
754  { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
755 
756  { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
757  { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
758  { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
759  { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
760 
761  { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
762  { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
763  { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
764  { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
765 
766  { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
767  { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
768  { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
769  { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
770  };
771 
772  if (ST->hasSSE42())
773  if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
774  return LT.first * Entry->Cost;
775 
776  static const CostTblEntry SSE41CostTable[] = {
777  { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
778  { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
779  { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
780  { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
781  { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
782  { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
783 
784  { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
785  { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
786  { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
787  { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
788  { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
789  { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
790 
791  { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
792  { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
793  { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
794  { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
795  { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
796  { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
797 
798  { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
799  };
800 
801  if (ST->hasSSE41())
802  if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
803  return LT.first * Entry->Cost;
804 
805  static const CostTblEntry SSE2CostTable[] = {
806  // We don't correctly identify costs of casts because they are marked as
807  // custom.
808  { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
809  { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
810  { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
811  { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
812  { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
813 
814  { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
815  { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
816  { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
817  { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
818  { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
819 
820  { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
821  { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
822  { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
823  { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
824  { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
825 
826  { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
827  { ISD::MUL, MVT::v8i16, 1 }, // pmullw
828  { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
829  { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
830 
831  { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
832  { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
833  { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
834  { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
835 
836  { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
837  { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
838 
839  { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
840  { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
841  };
842 
843  if (ST->hasSSE2())
844  if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
845  return LT.first * Entry->Cost;
846 
847  static const CostTblEntry SSE1CostTable[] = {
848  { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
849  { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
850 
851  { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
852  { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
853 
854  { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
855  { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
856 
857  { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
858  { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
859  { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
860 
861  { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
862  { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
863  { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
864  };
865 
866  if (ST->hasSSE1())
867  if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
868  return LT.first * Entry->Cost;
869 
870  // It is not a good idea to vectorize division. We have to scalarize it and
871  // in the process we will often end up having to spill regular
872  // registers. The overhead of division is going to dominate most kernels
873  // anyway, so try hard to prevent vectorization of division - it is
874  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
875  // to hide "20 cycles" for each lane.
876  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
877  ISD == ISD::UDIV || ISD == ISD::UREM)) {
878  int ScalarCost = getArithmeticInstrCost(
879  Opcode, Ty->getScalarType(), Op1Info, Op2Info,
880  TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
881  return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
882  }
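  // Worked example: an unsupported `sdiv <4 x i32>` is charged
  // 20 * LT.first * 4 * ScalarCost, i.e. at least 80x the scalar division
  // cost, which effectively stops the vectorizers from picking it.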
883 
884  // Fallback to the default implementation.
885  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
886 }
887 
888 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
889  Type *SubTp) {
890  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
891  // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
892  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
893 
894  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
895  if (Kind == TTI::SK_Transpose)
896  Kind = TTI::SK_PermuteTwoSrc;
897 
898  // For Broadcasts we are splatting the first element from the first input
899  // register, so we only need to reference that input; all of the output
900  // registers are the same.
901  if (Kind == TTI::SK_Broadcast)
902  LT.first = 1;
903 
904  // Subvector extractions are free if they start at the beginning of a
905  // vector and cheap if the subvectors are aligned.
906  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
907  int NumElts = LT.second.getVectorNumElements();
908  if ((Index % NumElts) == 0)
909  return 0;
910  std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
911  if (SubLT.second.isVector()) {
912  int NumSubElts = SubLT.second.getVectorNumElements();
913  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
914  return SubLT.first;
915  }
916  }
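  // Example: with AVX, extracting a <4 x float> subvector from a <8 x float>
  // source costs 0 at Index 0 (the extraction starts at the beginning of the
  // register) and SubLT.first (1) at Index 4, since the subvector still lines
  // up with a legal 128-bit slice; misaligned indices fall through below.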
917 
918  // We are going to permute multiple sources and the result will be in multiple
919  // destinations. We provide an accurate cost only for splits where the element
920  // type remains the same.
921  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
922  MVT LegalVT = LT.second;
923  if (LegalVT.isVector() &&
924  LegalVT.getVectorElementType().getSizeInBits() ==
925  Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
926  LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
927 
928  unsigned VecTySize = DL.getTypeStoreSize(Tp);
929  unsigned LegalVTSize = LegalVT.getStoreSize();
930  // Number of source vectors after legalization:
931  unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
932  // Number of destination vectors after legalization:
933  unsigned NumOfDests = LT.first;
934 
935  Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
936  LegalVT.getVectorNumElements());
937 
938  unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
939  return NumOfShuffles *
940  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
941  }
942 
943  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
944  }
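  // Worked example for the split case above: a single-source permute of
  // <32 x i16> on AVX2 legalizes to LegalVT = v16i16 with LT.first = 2, so
  // NumOfSrcs = 2, NumOfDests = 2 and NumOfShuffles = (2 - 1) * 2 = 2; each
  // piece is then charged as a two-source v16i16 shuffle (7 in the AVX2 table
  // below), giving 14 in total.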
945 
946  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
947  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
948  // We assume that source and destination have the same vector type.
949  int NumOfDests = LT.first;
950  int NumOfShufflesPerDest = LT.first * 2 - 1;
951  LT.first = NumOfDests * NumOfShufflesPerDest;
952  }
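  // Worked example: a two-source permute whose type legalizes with
  // LT.first = 2 is modelled as NumOfDests = 2 destinations, each needing
  // 2 * 2 - 1 = 3 shuffles, so LT.first becomes 6 before the per-shuffle cost
  // from the tables below is applied.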
953 
954  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
955  {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
956  {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
957 
958  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
959  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
960 
961  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
962  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
963  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1} // vpermt2b
964  };
965 
966  if (ST->hasVBMI())
967  if (const auto *Entry =
968  CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
969  return LT.first * Entry->Cost;
970 
971  static const CostTblEntry AVX512BWShuffleTbl[] = {
972  {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
973  {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
974 
975  {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
976  {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
977  {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
978 
979  {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
980  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
981  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // vpermw
982  {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
983  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3}, // vpermw + zext/trunc
984 
985  {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
986  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
987  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpermt2w
988  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3}, // zext + vpermt2w + trunc
989  {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
990  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3} // zext + vpermt2w + trunc
991  };
992 
993  if (ST->hasBWI())
994  if (const auto *Entry =
995  CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
996  return LT.first * Entry->Cost;
997 
998  static const CostTblEntry AVX512ShuffleTbl[] = {
999  {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
1000  {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
1001  {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
1002  {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
1003 
1004  {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
1005  {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
1006  {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
1007  {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
1008 
1009  {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
1010  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1011  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
1012  {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
1013  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1014  {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
1015  {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
1016  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1017  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
1018  {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
1019  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1020  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
1021  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1022 
1023  {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
1024  {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
1025  {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
1026  {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
1027  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
1028  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
1029  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
1030  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
1031  {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
1032  {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
1033  {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
1034  {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d
1035  };
1036 
1037  if (ST->hasAVX512())
1038  if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1039  return LT.first * Entry->Cost;
1040 
1041  static const CostTblEntry AVX2ShuffleTbl[] = {
1042  {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
1043  {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
1044  {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
1045  {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
1046  {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
1047  {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
1048 
1049  {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
1050  {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
1051  {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
1052  {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
1053  {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
1054  {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
1055 
1056  {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
1057  {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
1058 
1059  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
1060  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
1061  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
1062  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
1063  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
1064  // + vpblendvb
1065  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
1066  // + vpblendvb
1067 
1068  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
1069  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
1070  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
1071  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
1072  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
1073  // + vpblendvb
1074  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
1075  // + vpblendvb
1076  };
1077 
1078  if (ST->hasAVX2())
1079  if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
1080  return LT.first * Entry->Cost;
1081 
1082  static const CostTblEntry XOPShuffleTbl[] = {
1083  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
1084  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
1085  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
1086  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
1087  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
1088  // + vinsertf128
1089  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
1090  // + vinsertf128
1091 
1092  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
1093  // + vinsertf128
1094  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
1095  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
1096  // + vinsertf128
1097  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
1098  };
1099 
1100  if (ST->hasXOP())
1101  if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
1102  return LT.first * Entry->Cost;
1103 
1104  static const CostTblEntry AVX1ShuffleTbl[] = {
1105  {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1106  {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1107  {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1108  {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1109  {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
1110  {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
1111 
1112  {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
1113  {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
1114  {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
1115  {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
1116  {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
1117  // + vinsertf128
1118  {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
1119  // + vinsertf128
1120 
1121  {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
1122  {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
1123  {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
1124  {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
1125  {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
1126  {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
1127 
1128  {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
1129  {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
1130  {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1131  {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1132  {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
1133  // + 2*por + vinsertf128
1134  {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
1135  // + 2*por + vinsertf128
1136 
1137  {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
1138  {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
1139  {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
1140  {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
1141  {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
1142  // + 4*por + vinsertf128
1143  {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
1144  // + 4*por + vinsertf128
1145  };
1146 
1147  if (ST->hasAVX())
1148  if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
1149  return LT.first * Entry->Cost;
1150 
1151  static const CostTblEntry SSE41ShuffleTbl[] = {
1152  {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
1153  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1154  {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
1155  {TTI::SK_Select, MVT::v4f32, 1}, // blendps
1156  {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
1157  {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
1158  };
1159 
1160  if (ST->hasSSE41())
1161  if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
1162  return LT.first * Entry->Cost;
1163 
1164  static const CostTblEntry SSSE3ShuffleTbl[] = {
1165  {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
1166  {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
1167 
1168  {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
1169  {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
1170 
1171  {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
1172  {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
1173 
1174  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
1175  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
1176 
1177  {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
1178  {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
1179  };
1180 
1181  if (ST->hasSSSE3())
1182  if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
1183  return LT.first * Entry->Cost;
1184 
1185  static const CostTblEntry SSE2ShuffleTbl[] = {
1186  {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
1187  {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
1188  {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
1189  {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
1190  {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
1191 
1192  {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
1193  {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
1194  {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
1195  {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
1196  {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
1197  // + 2*pshufd + 2*unpck + packus
1198 
1199  {TTI::SK_Select, MVT::v2i64, 1}, // movsd
1200  {TTI::SK_Select, MVT::v2f64, 1}, // movsd
1201  {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
1202  {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
1203  {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
1204 
1205  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
1206  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
1207  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
1208  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
1209  // + pshufd/unpck
1210  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
1211  // + 2*pshufd + 2*unpck + 2*packus
1212 
1213  { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
1214  { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
1215  { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
1216  { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
1217  { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
1218  };
1219 
1220  if (ST->hasSSE2())
1221  if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
1222  return LT.first * Entry->Cost;
1223 
1224  static const CostTblEntry SSE1ShuffleTbl[] = {
1225  { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
1226  { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
1227  { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
1228  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
1229  { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
1230  };
1231 
1232  if (ST->hasSSE1())
1233  if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
1234  return LT.first * Entry->Cost;
1235 
1236  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1237 }
1238 
1239 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1240  const Instruction *I) {
1241  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1242  assert(ISD && "Invalid opcode");
1243 
1244  // FIXME: Need a better design of the cost table to handle non-simple types
1245  // and the potentially massive number of combinations (elem_num x src_type x dst_type).
1246 
1247  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
1250 
1251  // Mask sign extend has an instruction.
1258 
1259  // Mask zero extend is a load + broadcast.
1266  };
1267 
1268  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
1275 
1282 
1289 
1296  };
1297 
1298  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
1299  // 256-bit wide vectors.
1300 
1301  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
1305 
1310 
1311  // v16i1 -> v16i32 - load + broadcast
1322 
1331 
1356 
1358 
1368  };
1369 
1370  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
1387 
1394 
1397 
1399  };
1400 
1401  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
1418 
1426 
1439 
1455  // The generic code to compute the scalar overhead is currently broken.
1456  // Workaround this limitation by estimating the scalarization overhead
1457  // here. We have roughly 10 instructions per scalar element.
1458  // Multiply that by the vector width.
1459  // FIXME: remove that when PR19268 is fixed.
1462 
1465  // This node is expanded into scalarized operations but BasicTTI is overly
1466  // optimistic estimating its cost. It computes 3 per element (one
1467  // vector-extract, one scalar conversion and one vector-insert). The
1468  // problem is that the inserts form a read-modify-write chain so latency
1469  // should be factored in too. Inflating the cost per element by 1.
1472 
1475  };
1476 
1477  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
1484 
1503 
1511 
1513  };
1514 
1515  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
1516  // These are somewhat magic numbers justified by looking at the output of
1517  // Intel's IACA, running some kernels and making sure when we take
1518  // legalization into account the throughput will be overestimated.
1520  { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1527 
1528  { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
1536 
1538 
1540 
1565 
1575  };
1576 
1577  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
1578  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
1579 
1580  if (ST->hasSSE2() && !ST->hasAVX()) {
1581  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1582  LTDest.second, LTSrc.second))
1583  return LTSrc.first * Entry->Cost;
1584  }
1585 
1586  EVT SrcTy = TLI->getValueType(DL, Src);
1587  EVT DstTy = TLI->getValueType(DL, Dst);
1588 
1589  // The function getSimpleVT only handles simple value types.
1590  if (!SrcTy.isSimple() || !DstTy.isSimple())
1591  return BaseT::getCastInstrCost(Opcode, Dst, Src);
1592 
1593  MVT SimpleSrcTy = SrcTy.getSimpleVT();
1594  MVT SimpleDstTy = DstTy.getSimpleVT();
1595 
1596  // Make sure that neither type is going to be split before using the
1597  // AVX512 tables. This handles -mprefer-vector-width=256
1598  // with -min-legal-vector-width<=256
1599  if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector &&
1600  TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) {
1601  if (ST->hasBWI())
1602  if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
1603  SimpleDstTy, SimpleSrcTy))
1604  return Entry->Cost;
1605 
1606  if (ST->hasDQI())
1607  if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
1608  SimpleDstTy, SimpleSrcTy))
1609  return Entry->Cost;
1610 
1611  if (ST->hasAVX512())
1612  if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
1613  SimpleDstTy, SimpleSrcTy))
1614  return Entry->Cost;
1615  }
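  // Example of what the guard above filters out: with -mprefer-vector-width=256
  // (and -min-legal-vector-width<=256) a v16i32 <-> v16f32 conversion is split
  // into two 256-bit halves, so its cost is better modelled by the AVX2/AVX
  // tables below than by the 512-bit AVX512 entries.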
1616 
1617  if (ST->hasAVX2()) {
1618  if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
1619  SimpleDstTy, SimpleSrcTy))
1620  return Entry->Cost;
1621  }
1622 
1623  if (ST->hasAVX()) {
1624  if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
1625  SimpleDstTy, SimpleSrcTy))
1626  return Entry->Cost;
1627  }
1628 
1629  if (ST->hasSSE41()) {
1630  if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
1631  SimpleDstTy, SimpleSrcTy))
1632  return Entry->Cost;
1633  }
1634 
1635  if (ST->hasSSE2()) {
1636  if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
1637  SimpleDstTy, SimpleSrcTy))
1638  return Entry->Cost;
1639  }
1640 
1641  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
1642 }
1643 
1644 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1645  const Instruction *I) {
1646  // Legalize the type.
1647  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1648 
1649  MVT MTy = LT.second;
1650 
1651  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1652  assert(ISD && "Invalid opcode");
1653 
1654  static const CostTblEntry SSE2CostTbl[] = {
1655  { ISD::SETCC, MVT::v2i64, 8 },
1656  { ISD::SETCC, MVT::v4i32, 1 },
1657  { ISD::SETCC, MVT::v8i16, 1 },
1658  { ISD::SETCC, MVT::v16i8, 1 },
1659  };
1660 
1661  static const CostTblEntry SSE42CostTbl[] = {
1662  { ISD::SETCC, MVT::v2f64, 1 },
1663  { ISD::SETCC, MVT::v4f32, 1 },
1664  { ISD::SETCC, MVT::v2i64, 1 },
1665  };
1666 
1667  static const CostTblEntry AVX1CostTbl[] = {
1668  { ISD::SETCC, MVT::v4f64, 1 },
1669  { ISD::SETCC, MVT::v8f32, 1 },
1670  // AVX1 does not support 8-wide integer compare.
1671  { ISD::SETCC, MVT::v4i64, 4 },
1672  { ISD::SETCC, MVT::v8i32, 4 },
1673  { ISD::SETCC, MVT::v16i16, 4 },
1674  { ISD::SETCC, MVT::v32i8, 4 },
1675  };
1676 
1677  static const CostTblEntry AVX2CostTbl[] = {
1678  { ISD::SETCC, MVT::v4i64, 1 },
1679  { ISD::SETCC, MVT::v8i32, 1 },
1680  { ISD::SETCC, MVT::v16i16, 1 },
1681  { ISD::SETCC, MVT::v32i8, 1 },
1682  };
1683 
1684  static const CostTblEntry AVX512CostTbl[] = {
1685  { ISD::SETCC, MVT::v8i64, 1 },
1686  { ISD::SETCC, MVT::v16i32, 1 },
1687  { ISD::SETCC, MVT::v8f64, 1 },
1688  { ISD::SETCC, MVT::v16f32, 1 },
1689  };
1690 
1691  static const CostTblEntry AVX512BWCostTbl[] = {
1692  { ISD::SETCC, MVT::v32i16, 1 },
1693  { ISD::SETCC, MVT::v64i8, 1 },
1694  };
1695 
1696  if (ST->hasBWI())
1697  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
1698  return LT.first * Entry->Cost;
1699 
1700  if (ST->hasAVX512())
1701  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
1702  return LT.first * Entry->Cost;
1703 
1704  if (ST->hasAVX2())
1705  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
1706  return LT.first * Entry->Cost;
1707 
1708  if (ST->hasAVX())
1709  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
1710  return LT.first * Entry->Cost;
1711 
1712  if (ST->hasSSE42())
1713  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
1714  return LT.first * Entry->Cost;
1715 
1716  if (ST->hasSSE2())
1717  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
1718  return LT.first * Entry->Cost;
1719 
1720  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1721 }
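// Worked example (illustrative only, not from the source): on a 64-bit,
// SSE2-only target an integer compare of <8 x i64> legalizes to four v2i64
// operations (LT.first == 4), so the SSE2 table entry of 8 yields a reported
// cost of 4 * 8 = 32.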
1722 
1723 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
1724 
1725 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
1726  ArrayRef<Type *> Tys, FastMathFlags FMF,
1727  unsigned ScalarizationCostPassed) {
1728  // Costs should match the codegen from:
1729  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
1730  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
1731  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
1732  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
1733  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
1734  static const CostTblEntry AVX512CDCostTbl[] = {
1735  { ISD::CTLZ, MVT::v8i64, 1 },
1736  { ISD::CTLZ, MVT::v16i32, 1 },
1737  { ISD::CTLZ, MVT::v32i16, 8 },
1738  { ISD::CTLZ, MVT::v64i8, 20 },
1739  { ISD::CTLZ, MVT::v4i64, 1 },
1740  { ISD::CTLZ, MVT::v8i32, 1 },
1741  { ISD::CTLZ, MVT::v16i16, 4 },
1742  { ISD::CTLZ, MVT::v32i8, 10 },
1743  { ISD::CTLZ, MVT::v2i64, 1 },
1744  { ISD::CTLZ, MVT::v4i32, 1 },
1745  { ISD::CTLZ, MVT::v8i16, 4 },
1746  { ISD::CTLZ, MVT::v16i8, 4 },
1747  };
1748  static const CostTblEntry AVX512BWCostTbl[] = {
1749  { ISD::BITREVERSE, MVT::v8i64, 5 },
1750  { ISD::BITREVERSE, MVT::v16i32, 5 },
1751  { ISD::BITREVERSE, MVT::v32i16, 5 },
1752  { ISD::BITREVERSE, MVT::v64i8, 5 },
1753  { ISD::CTLZ, MVT::v8i64, 23 },
1754  { ISD::CTLZ, MVT::v16i32, 22 },
1755  { ISD::CTLZ, MVT::v32i16, 18 },
1756  { ISD::CTLZ, MVT::v64i8, 17 },
1757  { ISD::CTPOP, MVT::v8i64, 7 },
1758  { ISD::CTPOP, MVT::v16i32, 11 },
1759  { ISD::CTPOP, MVT::v32i16, 9 },
1760  { ISD::CTPOP, MVT::v64i8, 6 },
1761  { ISD::CTTZ, MVT::v8i64, 10 },
1762  { ISD::CTTZ, MVT::v16i32, 14 },
1763  { ISD::CTTZ, MVT::v32i16, 12 },
1764  { ISD::CTTZ, MVT::v64i8, 9 },
1765  { ISD::SADDSAT, MVT::v32i16, 1 },
1766  { ISD::SADDSAT, MVT::v64i8, 1 },
1767  { ISD::SSUBSAT, MVT::v32i16, 1 },
1768  { ISD::SSUBSAT, MVT::v64i8, 1 },
1769  { ISD::UADDSAT, MVT::v32i16, 1 },
1770  { ISD::UADDSAT, MVT::v64i8, 1 },
1771  { ISD::USUBSAT, MVT::v32i16, 1 },
1772  { ISD::USUBSAT, MVT::v64i8, 1 },
1773  };
1774  static const CostTblEntry AVX512CostTbl[] = {
1775  { ISD::BITREVERSE, MVT::v8i64, 36 },
1776  { ISD::BITREVERSE, MVT::v16i32, 24 },
1777  { ISD::CTLZ, MVT::v8i64, 29 },
1778  { ISD::CTLZ, MVT::v16i32, 35 },
1779  { ISD::CTPOP, MVT::v8i64, 16 },
1780  { ISD::CTPOP, MVT::v16i32, 24 },
1781  { ISD::CTTZ, MVT::v8i64, 20 },
1782  { ISD::CTTZ, MVT::v16i32, 28 },
1783  { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
1784  { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
1785  { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
1786  { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
1787  };
1788  static const CostTblEntry XOPCostTbl[] = {
1789  { ISD::BITREVERSE, MVT::v4i64, 4 },
1790  { ISD::BITREVERSE, MVT::v8i32, 4 },
1791  { ISD::BITREVERSE, MVT::v16i16, 4 },
1792  { ISD::BITREVERSE, MVT::v32i8, 4 },
1793  { ISD::BITREVERSE, MVT::v2i64, 1 },
1794  { ISD::BITREVERSE, MVT::v4i32, 1 },
1795  { ISD::BITREVERSE, MVT::v8i16, 1 },
1796  { ISD::BITREVERSE, MVT::v16i8, 1 },
1797  { ISD::BITREVERSE, MVT::i64, 3 },
1798  { ISD::BITREVERSE, MVT::i32, 3 },
1799  { ISD::BITREVERSE, MVT::i16, 3 },
1800  { ISD::BITREVERSE, MVT::i8, 3 }
1801  };
1802  static const CostTblEntry AVX2CostTbl[] = {
1803  { ISD::BITREVERSE, MVT::v4i64, 5 },
1804  { ISD::BITREVERSE, MVT::v8i32, 5 },
1805  { ISD::BITREVERSE, MVT::v16i16, 5 },
1806  { ISD::BITREVERSE, MVT::v32i8, 5 },
1807  { ISD::BSWAP, MVT::v4i64, 1 },
1808  { ISD::BSWAP, MVT::v8i32, 1 },
1809  { ISD::BSWAP, MVT::v16i16, 1 },
1810  { ISD::CTLZ, MVT::v4i64, 23 },
1811  { ISD::CTLZ, MVT::v8i32, 18 },
1812  { ISD::CTLZ, MVT::v16i16, 14 },
1813  { ISD::CTLZ, MVT::v32i8, 9 },
1814  { ISD::CTPOP, MVT::v4i64, 7 },
1815  { ISD::CTPOP, MVT::v8i32, 11 },
1816  { ISD::CTPOP, MVT::v16i16, 9 },
1817  { ISD::CTPOP, MVT::v32i8, 6 },
1818  { ISD::CTTZ, MVT::v4i64, 10 },
1819  { ISD::CTTZ, MVT::v8i32, 14 },
1820  { ISD::CTTZ, MVT::v16i16, 12 },
1821  { ISD::CTTZ, MVT::v32i8, 9 },
1822  { ISD::SADDSAT, MVT::v16i16, 1 },
1823  { ISD::SADDSAT, MVT::v32i8, 1 },
1824  { ISD::SSUBSAT, MVT::v16i16, 1 },
1825  { ISD::SSUBSAT, MVT::v32i8, 1 },
1826  { ISD::UADDSAT, MVT::v16i16, 1 },
1827  { ISD::UADDSAT, MVT::v32i8, 1 },
1828  { ISD::USUBSAT, MVT::v16i16, 1 },
1829  { ISD::USUBSAT, MVT::v32i8, 1 },
1830  { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
1831  { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
1832  { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
1833  { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
1834  { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
1835  { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
1836  { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
1837  };
1838  static const CostTblEntry AVX1CostTbl[] = {
1839  { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
1840  { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
1841  { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
1842  { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
1843  { ISD::BSWAP, MVT::v4i64, 4 },
1844  { ISD::BSWAP, MVT::v8i32, 4 },
1845  { ISD::BSWAP, MVT::v16i16, 4 },
1846  { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
1847  { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
1848  { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
1849  { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1850  { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
1851  { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
1852  { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
1853  { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
1854  { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
1855  { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
1856  { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
1857  { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
1858  { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1859  { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1860  { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1861  { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1862  { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1863  { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1864  { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
1865  { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
1866  { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
1867  { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
1868  { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
1869  { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
1870  { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
1871  { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
1872  { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
1873  };
1874  static const CostTblEntry GLMCostTbl[] = {
1875  { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
1876  { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
1877  { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
1878  { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
1879  };
1880  static const CostTblEntry SLMCostTbl[] = {
1881  { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
1882  { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
1883  { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
1884  { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
1885  };
1886  static const CostTblEntry SSE42CostTbl[] = {
1887  { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
1888  { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
1889  { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
1890  };
1891  static const CostTblEntry SSSE3CostTbl[] = {
1892  { ISD::BITREVERSE, MVT::v2i64, 5 },
1893  { ISD::BITREVERSE, MVT::v4i32, 5 },
1894  { ISD::BITREVERSE, MVT::v8i16, 5 },
1895  { ISD::BITREVERSE, MVT::v16i8, 5 },
1896  { ISD::BSWAP, MVT::v2i64, 1 },
1897  { ISD::BSWAP, MVT::v4i32, 1 },
1898  { ISD::BSWAP, MVT::v8i16, 1 },
1899  { ISD::CTLZ, MVT::v2i64, 23 },
1900  { ISD::CTLZ, MVT::v4i32, 18 },
1901  { ISD::CTLZ, MVT::v8i16, 14 },
1902  { ISD::CTLZ, MVT::v16i8, 9 },
1903  { ISD::CTPOP, MVT::v2i64, 7 },
1904  { ISD::CTPOP, MVT::v4i32, 11 },
1905  { ISD::CTPOP, MVT::v8i16, 9 },
1906  { ISD::CTPOP, MVT::v16i8, 6 },
1907  { ISD::CTTZ, MVT::v2i64, 10 },
1908  { ISD::CTTZ, MVT::v4i32, 14 },
1909  { ISD::CTTZ, MVT::v8i16, 12 },
1910  { ISD::CTTZ, MVT::v16i8, 9 }
1911  };
1912  static const CostTblEntry SSE2CostTbl[] = {
1913  { ISD::BITREVERSE, MVT::v2i64, 29 },
1914  { ISD::BITREVERSE, MVT::v4i32, 27 },
1915  { ISD::BITREVERSE, MVT::v8i16, 27 },
1916  { ISD::BITREVERSE, MVT::v16i8, 20 },
1917  { ISD::BSWAP, MVT::v2i64, 7 },
1918  { ISD::BSWAP, MVT::v4i32, 7 },
1919  { ISD::BSWAP, MVT::v8i16, 7 },
1920  { ISD::CTLZ, MVT::v2i64, 25 },
1921  { ISD::CTLZ, MVT::v4i32, 26 },
1922  { ISD::CTLZ, MVT::v8i16, 20 },
1923  { ISD::CTLZ, MVT::v16i8, 17 },
1924  { ISD::CTPOP, MVT::v2i64, 12 },
1925  { ISD::CTPOP, MVT::v4i32, 15 },
1926  { ISD::CTPOP, MVT::v8i16, 13 },
1927  { ISD::CTPOP, MVT::v16i8, 10 },
1928  { ISD::CTTZ, MVT::v2i64, 14 },
1929  { ISD::CTTZ, MVT::v4i32, 18 },
1930  { ISD::CTTZ, MVT::v8i16, 16 },
1931  { ISD::CTTZ, MVT::v16i8, 13 },
1932  { ISD::SADDSAT, MVT::v8i16, 1 },
1933  { ISD::SADDSAT, MVT::v16i8, 1 },
1934  { ISD::SSUBSAT, MVT::v8i16, 1 },
1935  { ISD::SSUBSAT, MVT::v16i8, 1 },
1936  { ISD::UADDSAT, MVT::v8i16, 1 },
1937  { ISD::UADDSAT, MVT::v16i8, 1 },
1938  { ISD::USUBSAT, MVT::v8i16, 1 },
1939  { ISD::USUBSAT, MVT::v16i8, 1 },
1940  { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
1941  { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
1942  };
1943  static const CostTblEntry SSE1CostTbl[] = {
1944  { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
1945  { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
1946  };
1947  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
1948  { ISD::BITREVERSE, MVT::i64, 14 }
1949  };
1950  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1951  { ISD::BITREVERSE, MVT::i32, 14 },
1952  { ISD::BITREVERSE, MVT::i16, 14 },
1953  { ISD::BITREVERSE, MVT::i8, 11 }
1954  };
1955 
1956  unsigned ISD = ISD::DELETED_NODE;
1957  switch (IID) {
1958  default:
1959  break;
1960  case Intrinsic::bitreverse:
1961  ISD = ISD::BITREVERSE;
1962  break;
1963  case Intrinsic::bswap:
1964  ISD = ISD::BSWAP;
1965  break;
1966  case Intrinsic::ctlz:
1967  ISD = ISD::CTLZ;
1968  break;
1969  case Intrinsic::ctpop:
1970  ISD = ISD::CTPOP;
1971  break;
1972  case Intrinsic::cttz:
1973  ISD = ISD::CTTZ;
1974  break;
1975  case Intrinsic::sadd_sat:
1976  ISD = ISD::SADDSAT;
1977  break;
1978  case Intrinsic::ssub_sat:
1979  ISD = ISD::SSUBSAT;
1980  break;
1981  case Intrinsic::uadd_sat:
1982  ISD = ISD::UADDSAT;
1983  break;
1984  case Intrinsic::usub_sat:
1985  ISD = ISD::USUBSAT;
1986  break;
1987  case Intrinsic::sqrt:
1988  ISD = ISD::FSQRT;
1989  break;
1990  }
1991 
1992  if (ISD != ISD::DELETED_NODE) {
1993  // Legalize the type.
1994  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
1995  MVT MTy = LT.second;
1996 
1997  // Attempt to lookup cost.
1998  if (ST->isGLM())
1999  if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
2000  return LT.first * Entry->Cost;
2001 
2002  if (ST->isSLM())
2003  if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2004  return LT.first * Entry->Cost;
2005 
2006  if (ST->hasCDI())
2007  if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
2008  return LT.first * Entry->Cost;
2009 
2010  if (ST->hasBWI())
2011  if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2012  return LT.first * Entry->Cost;
2013 
2014  if (ST->hasAVX512())
2015  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2016  return LT.first * Entry->Cost;
2017 
2018  if (ST->hasXOP())
2019  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2020  return LT.first * Entry->Cost;
2021 
2022  if (ST->hasAVX2())
2023  if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2024  return LT.first * Entry->Cost;
2025 
2026  if (ST->hasAVX())
2027  if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2028  return LT.first * Entry->Cost;
2029 
2030  if (ST->hasSSE42())
2031  if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2032  return LT.first * Entry->Cost;
2033 
2034  if (ST->hasSSSE3())
2035  if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
2036  return LT.first * Entry->Cost;
2037 
2038  if (ST->hasSSE2())
2039  if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2040  return LT.first * Entry->Cost;
2041 
2042  if (ST->hasSSE1())
2043  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2044  return LT.first * Entry->Cost;
2045 
2046  if (ST->is64Bit())
2047  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2048  return LT.first * Entry->Cost;
2049 
2050  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2051  return LT.first * Entry->Cost;
2052  }
2053 
2054  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
2055 }
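// Worked example (illustrative only): a ctpop of <8 x i32> on an AVX2 CPU
// such as Haswell stays a single legal v8i32 (LT.first == 1) and hits the
// AVX2 table entry of 11, so the reported cost is 11; on an AVX1-only CPU
// the same intrinsic falls through to the AVX1 table entry of 24.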
2056 
2057 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
2058  ArrayRef<Value *> Args, FastMathFlags FMF,
2059  unsigned VF) {
2060  static const CostTblEntry AVX512CostTbl[] = {
2061  { ISD::ROTL, MVT::v8i64, 1 },
2062  { ISD::ROTL, MVT::v4i64, 1 },
2063  { ISD::ROTL, MVT::v2i64, 1 },
2064  { ISD::ROTL, MVT::v16i32, 1 },
2065  { ISD::ROTL, MVT::v8i32, 1 },
2066  { ISD::ROTL, MVT::v4i32, 1 },
2067  { ISD::ROTR, MVT::v8i64, 1 },
2068  { ISD::ROTR, MVT::v4i64, 1 },
2069  { ISD::ROTR, MVT::v2i64, 1 },
2070  { ISD::ROTR, MVT::v16i32, 1 },
2071  { ISD::ROTR, MVT::v8i32, 1 },
2072  { ISD::ROTR, MVT::v4i32, 1 }
2073  };
2074  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
2075  static const CostTblEntry XOPCostTbl[] = {
2076  { ISD::ROTL, MVT::v4i64, 4 },
2077  { ISD::ROTL, MVT::v8i32, 4 },
2078  { ISD::ROTL, MVT::v16i16, 4 },
2079  { ISD::ROTL, MVT::v32i8, 4 },
2080  { ISD::ROTL, MVT::v2i64, 1 },
2081  { ISD::ROTL, MVT::v4i32, 1 },
2082  { ISD::ROTL, MVT::v8i16, 1 },
2083  { ISD::ROTL, MVT::v16i8, 1 },
2084  { ISD::ROTR, MVT::v4i64, 6 },
2085  { ISD::ROTR, MVT::v8i32, 6 },
2086  { ISD::ROTR, MVT::v16i16, 6 },
2087  { ISD::ROTR, MVT::v32i8, 6 },
2088  { ISD::ROTR, MVT::v2i64, 2 },
2089  { ISD::ROTR, MVT::v4i32, 2 },
2090  { ISD::ROTR, MVT::v8i16, 2 },
2091  { ISD::ROTR, MVT::v16i8, 2 }
2092  };
2093  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
2094  { ISD::ROTL, MVT::i64, 1 },
2095  { ISD::ROTR, MVT::i64, 1 },
2096  { ISD::FSHL, MVT::i64, 4 }
2097  };
2098  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
2099  { ISD::ROTL, MVT::i32, 1 },
2100  { ISD::ROTL, MVT::i16, 1 },
2101  { ISD::ROTL, MVT::i8, 1 },
2102  { ISD::ROTR, MVT::i32, 1 },
2103  { ISD::ROTR, MVT::i16, 1 },
2104  { ISD::ROTR, MVT::i8, 1 },
2105  { ISD::FSHL, MVT::i32, 4 },
2106  { ISD::FSHL, MVT::i16, 4 },
2107  { ISD::FSHL, MVT::i8, 4 }
2108  };
2109 
2110  unsigned ISD = ISD::DELETED_NODE;
2111  switch (IID) {
2112  default:
2113  break;
2114  case Intrinsic::fshl:
2115  ISD = ISD::FSHL;
2116  if (Args[0] == Args[1])
2117  ISD = ISD::ROTL;
2118  break;
2119  case Intrinsic::fshr:
2120  // FSHR has the same costs, so don't duplicate the entries.
2121  ISD = ISD::FSHL;
2122  if (Args[0] == Args[1])
2123  ISD = ISD::ROTR;
2124  break;
2125  }
2126 
2127  if (ISD != ISD::DELETED_NODE) {
2128  // Legalize the type.
2129  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
2130  MVT MTy = LT.second;
2131 
2132  // Attempt to lookup cost.
2133  if (ST->hasAVX512())
2134  if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2135  return LT.first * Entry->Cost;
2136 
2137  if (ST->hasXOP())
2138  if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
2139  return LT.first * Entry->Cost;
2140 
2141  if (ST->is64Bit())
2142  if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
2143  return LT.first * Entry->Cost;
2144 
2145  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
2146  return LT.first * Entry->Cost;
2147  }
2148 
2149  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
2150 }
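// Worked example (illustrative only): for llvm.fshl.i32, if both value
// operands are the same SSA value the funnel shift is really a rotate and is
// costed as ROTL (cost 1 in the X86 table); a genuine i32 funnel shift with
// distinct operands uses the FSHL entry and costs 4.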
2151 
2152 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
2153  assert(Val->isVectorTy() && "This must be a vector type");
2154 
2155  Type *ScalarType = Val->getScalarType();
2156 
2157  if (Index != -1U) {
2158  // Legalize the type.
2159  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
2160 
2161  // This type is legalized to a scalar type.
2162  if (!LT.second.isVector())
2163  return 0;
2164 
2165  // The type may be split. Normalize the index to the new type.
2166  unsigned Width = LT.second.getVectorNumElements();
2167  Index = Index % Width;
2168 
2169  // Floating point scalars are already located in index #0.
2170  if (ScalarType->isFloatingPointTy() && Index == 0)
2171  return 0;
2172  }
2173 
2174  // Add to the base cost if we know that the extracted element of a vector is
2175  // destined to be moved to and used in the integer register file.
2176  int RegisterFileMoveCost = 0;
2177  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
2178  RegisterFileMoveCost = 1;
2179 
2180  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
2181 }
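// Illustrative consequences of the logic above (not from the source):
// extracting lane 0 of a <4 x float> is reported as free, since the scalar
// already lives in element 0 of the register, while extracting any lane of a
// vector of pointers pays an extra register-file move cost of 1 on top of
// the base extract cost.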
2182 
2183 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
2184  unsigned AddressSpace, const Instruction *I) {
2185  // Handle non-power-of-two vectors such as <3 x float>
2186  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
2187  unsigned NumElem = VTy->getVectorNumElements();
2188 
2189  // Handle a few common cases:
2190  // <3 x float>
2191  if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
2192  // Cost = 64 bit store + extract + 32 bit store.
2193  return 3;
2194 
2195  // <3 x double>
2196  if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
2197  // Cost = 128 bit store + unpack + 64 bit store.
2198  return 3;
2199 
2200  // Assume that all other non-power-of-two numbers are scalarized.
2201  if (!isPowerOf2_32(NumElem)) {
2202  int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
2203  AddressSpace);
2204  int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
2205  Opcode == Instruction::Store);
2206  return NumElem * Cost + SplitCost;
2207  }
2208  }
2209 
2210  // Legalize the type.
2211  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
2212  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
2213  "Invalid Opcode");
2214 
2215  // Each load/store unit costs 1.
2216  int Cost = LT.first * 1;
2217 
2218  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
2219  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
2220  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
2221  Cost *= 2;
2222 
2223  return Cost;
2224 }
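// Worked example (illustrative only): a store of <3 x float> always returns
// 3 (64-bit store + extract + 32-bit store), and a 32-byte access such as an
// <8 x float> load costs 1 on most AVX targets but is doubled to 2 when the
// subtarget reports slow unaligned 32-byte memory (e.g. Sandy Bridge).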
2225 
2226 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
2227  unsigned Alignment,
2228  unsigned AddressSpace) {
2229  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
2230  if (!SrcVTy)
2231  // For a scalar type, take the regular (unmasked) memory op cost.
2232  return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
2233 
2234  unsigned NumElem = SrcVTy->getVectorNumElements();
2235  VectorType *MaskTy =
2236  VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
2237  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
2238  (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
2239  !isPowerOf2_32(NumElem)) {
2240  // Scalarization
2241  int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
2242  int ScalarCompareCost = getCmpSelInstrCost(
2243  Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
2244  int BranchCost = getCFInstrCost(Instruction::Br);
2245  int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
2246 
2247  int ValueSplitCost = getScalarizationOverhead(
2248  SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
2249  int MemopCost =
2250  NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2251  Alignment, AddressSpace);
2252  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
2253  }
2254 
2255  // Legalize the type.
2256  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2257  auto VT = TLI->getValueType(DL, SrcVTy);
2258  int Cost = 0;
2259  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
2260  LT.second.getVectorNumElements() == NumElem)
2261  // Promotion requires expand/truncate for data and a shuffle for mask.
2262  Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
2263  getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);
2264 
2265  else if (LT.second.getVectorNumElements() > NumElem) {
2266  VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
2267  LT.second.getVectorNumElements());
2268  // Expanding requires filling the mask with zeroes.
2269  Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
2270  }
2271  if (!ST->hasAVX512())
2272  return Cost + LT.first*4; // Each maskmov costs 4
2273 
2274  // AVX-512 masked load/store is cheaper.
2275  return Cost+LT.first;
2276 }
2277 
2278 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
2279  const SCEV *Ptr) {
2280  // Address computations in vectorized code with non-consecutive addresses will
2281  // likely result in more instructions compared to scalar code where the
2282  // computation can more often be merged into the index mode. The resulting
2283  // extra micro-ops can significantly decrease throughput.
2284  unsigned NumVectorInstToHideOverhead = 10;
2285 
2286  // Cost modeling of Strided Access Computation is hidden by the indexing
2287  // modes of X86 regardless of the stride value. We don't believe that there
2288  // is a difference between a constant strided access in general and a
2289  // constant stride value which is less than or equal to 64.
2290  // Even in the case of (loop invariant) stride whose value is not known at
2291  // compile time, the address computation will not incur more than one extra
2292  // ADD instruction.
2293  if (Ty->isVectorTy() && SE) {
2294  if (!BaseT::isStridedAccess(Ptr))
2295  return NumVectorInstToHideOverhead;
2296  if (!BaseT::getConstantStrideStep(SE, Ptr))
2297  return 1;
2298  }
2299 
2300  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
2301 }
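// Illustrative summary (not from the source): a vector address that is not a
// strided access returns the full overhead of 10, a strided access whose step
// is not a compile-time constant costs 1, and a constant-stride access falls
// through to the base implementation.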
2302 
2303 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
2304  bool IsPairwise) {
2305 
2306  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2307 
2308  MVT MTy = LT.second;
2309 
2310  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2311  assert(ISD && "Invalid opcode");
2312 
2313  // We use the Intel Architecture Code Analyzer (IACA) to measure the
2314  // throughput and use it as the cost.
2315 
2316  static const CostTblEntry SSE42CostTblPairWise[] = {
2317  { ISD::FADD, MVT::v2f64, 2 },
2318  { ISD::FADD, MVT::v4f32, 4 },
2319  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2320  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2321  { ISD::ADD, MVT::v8i16, 5 },
2322  };
2323 
2324  static const CostTblEntry AVX1CostTblPairWise[] = {
2325  { ISD::FADD, MVT::v4f32, 4 },
2326  { ISD::FADD, MVT::v4f64, 5 },
2327  { ISD::FADD, MVT::v8f32, 7 },
2328  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2329  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
2330  { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
2331  { ISD::ADD, MVT::v8i16, 5 },
2332  { ISD::ADD, MVT::v8i32, 5 },
2333  };
2334 
2335  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2336  { ISD::FADD, MVT::v2f64, 2 },
2337  { ISD::FADD, MVT::v4f32, 4 },
2338  { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
2339  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
2340  { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
2341  };
2342 
2343  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2344  { ISD::FADD, MVT::v4f32, 3 },
2345  { ISD::FADD, MVT::v4f64, 3 },
2346  { ISD::FADD, MVT::v8f32, 4 },
2347  { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
2348  { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
2349  { ISD::ADD, MVT::v4i64, 3 },
2350  { ISD::ADD, MVT::v8i16, 4 },
2351  { ISD::ADD, MVT::v8i32, 5 },
2352  };
2353 
2354  if (IsPairwise) {
2355  if (ST->hasAVX())
2356  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2357  return LT.first * Entry->Cost;
2358 
2359  if (ST->hasSSE42())
2360  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2361  return LT.first * Entry->Cost;
2362  } else {
2363  if (ST->hasAVX())
2364  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2365  return LT.first * Entry->Cost;
2366 
2367  if (ST->hasSSE42())
2368  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2369  return LT.first * Entry->Cost;
2370  }
2371 
2372  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
2373 }
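// Worked example (illustrative only): an fadd reduction of <8 x float> on an
// AVX1 target costs 7 in the pairwise form but only 4 in the non-pairwise
// (tree) form, taken directly from the tables above with LT.first == 1.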
2374 
2375 int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
2376  bool IsPairwise, bool IsUnsigned) {
2377  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2378 
2379  MVT MTy = LT.second;
2380 
2381  int ISD;
2382  if (ValTy->isIntOrIntVectorTy()) {
2383  ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
2384  } else {
2385  assert(ValTy->isFPOrFPVectorTy() &&
2386  "Expected floating point or integer vector type.");
2387  ISD = ISD::FMINNUM;
2388  }
2389 
2390  // We use the Intel Architecture Code Analyzer (IACA) to measure the
2391  // throughput and use it as the cost.
2392 
2393  static const CostTblEntry SSE42CostTblPairWise[] = {
2394  {ISD::FMINNUM, MVT::v2f64, 3},
2395  {ISD::FMINNUM, MVT::v4f32, 2},
2396  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2397  {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
2398  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2399  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2400  {ISD::SMIN, MVT::v8i16, 2},
2401  {ISD::UMIN, MVT::v8i16, 2},
2402  };
2403 
2404  static const CostTblEntry AVX1CostTblPairWise[] = {
2405  {ISD::FMINNUM, MVT::v4f32, 1},
2406  {ISD::FMINNUM, MVT::v4f64, 1},
2407  {ISD::FMINNUM, MVT::v8f32, 2},
2408  {ISD::SMIN, MVT::v2i64, 3},
2409  {ISD::UMIN, MVT::v2i64, 3},
2410  {ISD::SMIN, MVT::v4i32, 1},
2411  {ISD::UMIN, MVT::v4i32, 1},
2412  {ISD::SMIN, MVT::v8i16, 1},
2413  {ISD::UMIN, MVT::v8i16, 1},
2414  {ISD::SMIN, MVT::v8i32, 3},
2415  {ISD::UMIN, MVT::v8i32, 3},
2416  };
2417 
2418  static const CostTblEntry AVX2CostTblPairWise[] = {
2419  {ISD::SMIN, MVT::v4i64, 2},
2420  {ISD::UMIN, MVT::v4i64, 2},
2421  {ISD::SMIN, MVT::v8i32, 1},
2422  {ISD::UMIN, MVT::v8i32, 1},
2423  {ISD::SMIN, MVT::v16i16, 1},
2424  {ISD::UMIN, MVT::v16i16, 1},
2425  {ISD::SMIN, MVT::v32i8, 2},
2426  {ISD::UMIN, MVT::v32i8, 2},
2427  };
2428 
2429  static const CostTblEntry AVX512CostTblPairWise[] = {
2430  {ISD::FMINNUM, MVT::v8f64, 1},
2431  {ISD::FMINNUM, MVT::v16f32, 2},
2432  {ISD::SMIN, MVT::v8i64, 2},
2433  {ISD::UMIN, MVT::v8i64, 2},
2434  {ISD::SMIN, MVT::v16i32, 1},
2435  {ISD::UMIN, MVT::v16i32, 1},
2436  };
2437 
2438  static const CostTblEntry SSE42CostTblNoPairWise[] = {
2439  {ISD::FMINNUM, MVT::v2f64, 3},
2440  {ISD::FMINNUM, MVT::v4f32, 3},
2441  {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
2442  {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
2443  {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
2444  {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
2445  {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
2446  {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
2447  };
2448 
2449  static const CostTblEntry AVX1CostTblNoPairWise[] = {
2450  {ISD::FMINNUM, MVT::v4f32, 1},
2451  {ISD::FMINNUM, MVT::v4f64, 1},
2452  {ISD::FMINNUM, MVT::v8f32, 1},
2453  {ISD::SMIN, MVT::v2i64, 3},
2454  {ISD::UMIN, MVT::v2i64, 3},
2455  {ISD::SMIN, MVT::v4i32, 1},
2456  {ISD::UMIN, MVT::v4i32, 1},
2457  {ISD::SMIN, MVT::v8i16, 1},
2458  {ISD::UMIN, MVT::v8i16, 1},
2459  {ISD::SMIN, MVT::v8i32, 2},
2460  {ISD::UMIN, MVT::v8i32, 2},
2461  };
2462 
2463  static const CostTblEntry AVX2CostTblNoPairWise[] = {
2464  {ISD::SMIN, MVT::v4i64, 1},
2465  {ISD::UMIN, MVT::v4i64, 1},
2466  {ISD::SMIN, MVT::v8i32, 1},
2467  {ISD::UMIN, MVT::v8i32, 1},
2468  {ISD::SMIN, MVT::v16i16, 1},
2469  {ISD::UMIN, MVT::v16i16, 1},
2470  {ISD::SMIN, MVT::v32i8, 1},
2471  {ISD::UMIN, MVT::v32i8, 1},
2472  };
2473 
2474  static const CostTblEntry AVX512CostTblNoPairWise[] = {
2475  {ISD::FMINNUM, MVT::v8f64, 1},
2476  {ISD::FMINNUM, MVT::v16f32, 2},
2477  {ISD::SMIN, MVT::v8i64, 1},
2478  {ISD::UMIN, MVT::v8i64, 1},
2479  {ISD::SMIN, MVT::v16i32, 1},
2480  {ISD::UMIN, MVT::v16i32, 1},
2481  };
2482 
2483  if (IsPairwise) {
2484  if (ST->hasAVX512())
2485  if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
2486  return LT.first * Entry->Cost;
2487 
2488  if (ST->hasAVX2())
2489  if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
2490  return LT.first * Entry->Cost;
2491 
2492  if (ST->hasAVX())
2493  if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
2494  return LT.first * Entry->Cost;
2495 
2496  if (ST->hasSSE42())
2497  if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
2498  return LT.first * Entry->Cost;
2499  } else {
2500  if (ST->hasAVX512())
2501  if (const auto *Entry =
2502  CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
2503  return LT.first * Entry->Cost;
2504 
2505  if (ST->hasAVX2())
2506  if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
2507  return LT.first * Entry->Cost;
2508 
2509  if (ST->hasAVX())
2510  if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
2511  return LT.first * Entry->Cost;
2512 
2513  if (ST->hasSSE42())
2514  if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
2515  return LT.first * Entry->Cost;
2516  }
2517 
2518  return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
2519 }
2520 
2521 /// Calculate the cost of materializing a 64-bit value. This helper
2522 /// method might only calculate a fraction of a larger immediate. Therefore it
2523 /// is valid to return a cost of ZERO.
2524 int X86TTIImpl::getIntImmCost(int64_t Val) {
2525  if (Val == 0)
2526  return TTI::TCC_Free;
2527 
2528  if (isInt<32>(Val))
2529  return TTI::TCC_Basic;
2530 
2531  return 2 * TTI::TCC_Basic;
2532 }
2533 
2534 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
2535  assert(Ty->isIntegerTy());
2536 
2537  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2538  if (BitSize == 0)
2539  return ~0U;
2540 
2541  // Never hoist constants larger than 128 bits, because this might lead to
2542  // incorrect code generation or assertions in codegen.
2543  // FIXME: Create a cost model for types larger than i128 once the codegen
2544  // issues have been fixed.
2545  if (BitSize > 128)
2546  return TTI::TCC_Free;
2547 
2548  if (Imm == 0)
2549  return TTI::TCC_Free;
2550 
2551  // Sign-extend all constants to a multiple of 64-bit.
2552  APInt ImmVal = Imm;
2553  if (BitSize % 64 != 0)
2554  ImmVal = Imm.sext(alignTo(BitSize, 64));
2555 
2556  // Split the constant into 64-bit chunks and calculate the cost for each
2557  // chunk.
2558  int Cost = 0;
2559  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
2560  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
2561  int64_t Val = Tmp.getSExtValue();
2562  Cost += getIntImmCost(Val);
2563  }
2564  // We need at least one instruction to materialize the constant.
2565  return std::max(1, Cost);
2566 }
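// Worked example (illustrative only): an i128 constant whose two 64-bit
// chunks are 255 and -1 costs 1 + 1 = 2, since each chunk fits a signed
// 32-bit immediate; an all-zero constant returns TCC_Free before the split.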
2567 
2568 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
2569  Type *Ty) {
2570  assert(Ty->isIntegerTy());
2571 
2572  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2573  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2574  // here, so that constant hoisting will ignore this constant.
2575  if (BitSize == 0)
2576  return TTI::TCC_Free;
2577 
2578  unsigned ImmIdx = ~0U;
2579  switch (Opcode) {
2580  default:
2581  return TTI::TCC_Free;
2582  case Instruction::GetElementPtr:
2583  // Always hoist the base address of a GetElementPtr. This prevents the
2584  // creation of new constants for every base constant that gets constant
2585  // folded with the offset.
2586  if (Idx == 0)
2587  return 2 * TTI::TCC_Basic;
2588  return TTI::TCC_Free;
2589  case Instruction::Store:
2590  ImmIdx = 0;
2591  break;
2592  case Instruction::ICmp:
2593  // This is an imperfect hack to prevent constant hoisting of
2594  // compares that might be trying to check if a 64-bit value fits in
2595  // 32-bits. The backend can optimize these cases using a right shift by 32.
2596  // Ideally we would check the compare predicate here. There are also other
2597  // similar immediates the backend can use shifts for.
2598  if (Idx == 1 && Imm.getBitWidth() == 64) {
2599  uint64_t ImmVal = Imm.getZExtValue();
2600  if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
2601  return TTI::TCC_Free;
2602  }
2603  ImmIdx = 1;
2604  break;
2605  case Instruction::And:
2606  // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
2607  // by using a 32-bit operation with implicit zero extension. Detect such
2608  // immediates here as the normal path expects bit 31 to be sign extended.
2609  if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
2610  return TTI::TCC_Free;
2611  ImmIdx = 1;
2612  break;
2613  case Instruction::Add:
2614  case Instruction::Sub:
2615  // For add/sub, we can use the opposite instruction for INT32_MIN.
2616  if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
2617  return TTI::TCC_Free;
2618  ImmIdx = 1;
2619  break;
2620  case Instruction::UDiv:
2621  case Instruction::SDiv:
2622  case Instruction::URem:
2623  case Instruction::SRem:
2624  // Division by constant is typically expanded later into a different
2625  // instruction sequence. This completely changes the constants.
2626  // Report them as "free" to stop ConstantHoist from marking them as opaque.
2627  return TTI::TCC_Free;
2628  case Instruction::Mul:
2629  case Instruction::Or:
2630  case Instruction::Xor:
2631  ImmIdx = 1;
2632  break;
2633  // Always return TCC_Free for the shift value of a shift instruction.
2634  case Instruction::Shl:
2635  case Instruction::LShr:
2636  case Instruction::AShr:
2637  if (Idx == 1)
2638  return TTI::TCC_Free;
2639  break;
2640  case Instruction::Trunc:
2641  case Instruction::ZExt:
2642  case Instruction::SExt:
2643  case Instruction::IntToPtr:
2644  case Instruction::PtrToInt:
2645  case Instruction::BitCast:
2646  case Instruction::PHI:
2647  case Instruction::Call:
2648  case Instruction::Select:
2649  case Instruction::Ret:
2650  case Instruction::Load:
2651  break;
2652  }
2653 
2654  if (Idx == ImmIdx) {
2655  int NumConstants = divideCeil(BitSize, 64);
2656  int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
2657  return (Cost <= NumConstants * TTI::TCC_Basic)
2658  ? static_cast<int>(TTI::TCC_Free)
2659  : Cost;
2660  }
2661 
2662  return X86TTIImpl::getIntImmCost(Imm, Ty);
2663 }
2664 
2665 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
2666  Type *Ty) {
2667  assert(Ty->isIntegerTy());
2668 
2669  unsigned BitSize = Ty->getPrimitiveSizeInBits();
2670  // There is no cost model for constants with a bit size of 0. Return TCC_Free
2671  // here, so that constant hoisting will ignore this constant.
2672  if (BitSize == 0)
2673  return TTI::TCC_Free;
2674 
2675  switch (IID) {
2676  default:
2677  return TTI::TCC_Free;
2678  case Intrinsic::sadd_with_overflow:
2679  case Intrinsic::uadd_with_overflow:
2680  case Intrinsic::ssub_with_overflow:
2681  case Intrinsic::usub_with_overflow:
2682  case Intrinsic::smul_with_overflow:
2683  case Intrinsic::umul_with_overflow:
2684  if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
2685  return TTI::TCC_Free;
2686  break;
2687  case Intrinsic::experimental_stackmap:
2688  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2689  return TTI::TCC_Free;
2690  break;
2691  case Intrinsic::experimental_patchpoint_void:
2692  case Intrinsic::experimental_patchpoint_i64:
2693  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
2694  return TTI::TCC_Free;
2695  break;
2696  }
2697  return X86TTIImpl::getIntImmCost(Imm, Ty);
2698 }
2699 
2700 unsigned X86TTIImpl::getUserCost(const User *U,
2701  ArrayRef<const Value *> Operands) {
2702  if (isa<StoreInst>(U)) {
2703  Value *Ptr = U->getOperand(1);
2704  // Store instruction with index and scale costs 2 Uops.
2705  // Check the preceding GEP to identify non-const indices.
2706  if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
2707  if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
2708  return TTI::TCC_Basic * 2;
2709  }
2710  return TTI::TCC_Basic;
2711  }
2712  return BaseT::getUserCost(U, Operands);
2713 }
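// Worked example (illustrative only): a store whose address comes from a GEP
// with a non-constant index is reported as 2 * TCC_Basic (the store needs an
// index and scale and takes 2 uops), while a store through a GEP with
// all-constant indices, or with no GEP at all, stays at TCC_Basic.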
2714 
2715 // Return an average cost of a Gather / Scatter instruction; may be improved later.
2716 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
2717  unsigned Alignment, unsigned AddressSpace) {
2718 
2719  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
2720  unsigned VF = SrcVTy->getVectorNumElements();
2721 
2722  // Try to reduce index size from 64 bit (default for GEP)
2723  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
2724  // operation will use 16 x 64 indices, which do not fit in a zmm and need
2725  // to be split. Also check that the base pointer is the same for all lanes,
2726  // and that there's at most one variable index.
2727  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
2728  unsigned IndexSize = DL.getPointerSizeInBits();
2729  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2730  if (IndexSize < 64 || !GEP)
2731  return IndexSize;
2732 
2733  unsigned NumOfVarIndices = 0;
2734  Value *Ptrs = GEP->getPointerOperand();
2735  if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
2736  return IndexSize;
2737  for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
2738  if (isa<Constant>(GEP->getOperand(i)))
2739  continue;
2740  Type *IndxTy = GEP->getOperand(i)->getType();
2741  if (IndxTy->isVectorTy())
2742  IndxTy = IndxTy->getVectorElementType();
2743  if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
2744  !isa<SExtInst>(GEP->getOperand(i))) ||
2745  ++NumOfVarIndices > 1)
2746  return IndexSize; // 64
2747  }
2748  return (unsigned)32;
2749  };
2750 
2751 
2752  // Trying to reduce IndexSize to 32 bits for vector 16.
2753  // By default the IndexSize is equal to pointer size.
2754  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
2755  ? getIndexSizeInBits(Ptr, DL)
2756  : DL.getPointerSizeInBits();
2757 
2758  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
2759  IndexSize), VF);
2760  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
2761  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
2762  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
2763  if (SplitFactor > 1) {
2764  // Handle splitting of vector of pointers
2765  Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
2766  return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
2767  AddressSpace);
2768  }
2769 
2770  // The gather / scatter cost is given by Intel architects. It is a rough
2771  // number since we are looking at one instruction at a time.
2772  const int GSOverhead = (Opcode == Instruction::Load)
2773  ? ST->getGatherOverhead()
2774  : ST->getScatterOverhead();
2775  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2776  Alignment, AddressSpace);
2777 }
2778 
2779 /// Return the cost of full scalarization of gather / scatter operation.
2780 ///
2781 /// Opcode - Load or Store instruction.
2782 /// SrcVTy - The type of the data vector that should be gathered or scattered.
2783 /// VariableMask - The mask is non-constant at compile time.
2784 /// Alignment - Alignment for one element.
2785 /// AddressSpace - pointer[s] address space.
2786 ///
2787 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
2788  bool VariableMask, unsigned Alignment,
2789  unsigned AddressSpace) {
2790  unsigned VF = SrcVTy->getVectorNumElements();
2791 
2792  int MaskUnpackCost = 0;
2793  if (VariableMask) {
2794  VectorType *MaskTy =
2795  VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
2796  MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
2797  int ScalarCompareCost =
2798  getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
2799  nullptr);
2800  int BranchCost = getCFInstrCost(Instruction::Br);
2801  MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
2802  }
2803 
2804  // The cost of the scalar loads/stores.
2805  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
2806  Alignment, AddressSpace);
2807 
2808  int InsertExtractCost = 0;
2809  if (Opcode == Instruction::Load)
2810  for (unsigned i = 0; i < VF; ++i)
2811  // Add the cost of inserting each scalar load into the vector
2812  InsertExtractCost +=
2813  getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
2814  else
2815  for (unsigned i = 0; i < VF; ++i)
2816  // Add the cost of extracting each element out of the data vector
2817  InsertExtractCost +=
2818  getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
2819 
2820  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
2821 }
2822 
2823 /// Calculate the cost of Gather / Scatter operation
2824 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
2825  Value *Ptr, bool VariableMask,
2826  unsigned Alignment) {
2827  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
2828  unsigned VF = SrcVTy->getVectorNumElements();
2829  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2830  if (!PtrTy && Ptr->getType()->isVectorTy())
2831  PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
2832  assert(PtrTy && "Unexpected type for Ptr argument");
2833  unsigned AddressSpace = PtrTy->getAddressSpace();
2834 
2835  bool Scalarize = false;
2836  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
2837  (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
2838  Scalarize = true;
2839  // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
2840  // A vector-4 gather/scatter instruction does not exist on KNL.
2841  // We can extend it to 8 elements, but zeroing upper bits of
2842  // the mask vector will add more instructions. Right now we give the scalar
2843  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
2844  // is better in the VariableMask case.
2845  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
2846  Scalarize = true;
2847 
2848  if (Scalarize)
2849  return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
2850  AddressSpace);
2851 
2852  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
2853 }
2854 
2855 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
2856  TargetTransformInfo::LSRCost &C2) {
2857  // X86 specific here are "instruction number 1st priority".
2858  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
2859  C1.NumIVMuls, C1.NumBaseAdds,
2860  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2861  std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
2862  C2.NumIVMuls, C2.NumBaseAdds,
2863  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2864 }
2865 
2866 bool X86TTIImpl::canMacroFuseCmp() {
2867  return ST->hasMacroFusion();
2868 }
2869 
2870 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
2871  // The backend can't handle a single element vector.
2872  if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
2873  return false;
2874  Type *ScalarTy = DataTy->getScalarType();
2875  int DataWidth = isa<PointerType>(ScalarTy) ?
2876  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2877 
2878  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
2879  ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
2880 }
2881 
2882 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
2883  return isLegalMaskedLoad(DataType);
2884 }
2885 
2886 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
2887  // This function is called now in two cases: from the Loop Vectorizer
2888  // and from the Scalarizer.
2889  // When the Loop Vectorizer asks about legality of the feature,
2890  // the vectorization factor is not calculated yet. The Loop Vectorizer
2891  // sends a scalar type and the decision is based on the width of the
2892  // scalar element.
2893  // Later on, the cost model will estimate usage of this intrinsic based on
2894  // the vector type.
2895  // The Scalarizer asks again about legality. It sends a vector type.
2896  // In this case we can reject non-power-of-2 vectors.
2897  // We also reject single element vectors as the type legalizer can't
2898  // scalarize it.
2899  if (isa<VectorType>(DataTy)) {
2900  unsigned NumElts = DataTy->getVectorNumElements();
2901  if (NumElts == 1 || !isPowerOf2_32(NumElts))
2902  return false;
2903  }
2904  Type *ScalarTy = DataTy->getScalarType();
2905  int DataWidth = isa<PointerType>(ScalarTy) ?
2906  DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
2907 
2908  // Some CPUs have better gather performance than others.
2909  // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
2910  // enable gather with a -march.
2911  return (DataWidth == 32 || DataWidth == 64) &&
2912  (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
2913 }
2914 
2915 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
2916  // AVX2 doesn't support scatter.
2917  if (!ST->hasAVX512())
2918  return false;
2919  return isLegalMaskedGather(DataType);
2920 }
2921 
2922 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
2923  EVT VT = TLI->getValueType(DL, DataType);
2924  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
2925 }
2926 
2927 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
2928  return false;
2929 }
2930 
2931 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
2932  const Function *Callee) const {
2933  const TargetMachine &TM = getTLI()->getTargetMachine();
2934 
2935  // Model this as a subset check on the subtarget features.
2936  const FeatureBitset &CallerBits =
2937  TM.getSubtargetImpl(*Caller)->getFeatureBits();
2938  const FeatureBitset &CalleeBits =
2939  TM.getSubtargetImpl(*Callee)->getFeatureBits();
2940 
2941  // FIXME: This is likely too limiting as it will include subtarget features
2942  // that we might not care about for inlining, but it is conservatively
2943  // correct.
2944  return (CallerBits & CalleeBits) == CalleeBits;
2945 }
2946 
2947 const X86TTIImpl::TTI::MemCmpExpansionOptions *
2948 X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
2949  // Only enable vector loads for equality comparison.
2950  // Right now the vector version is not as fast, see #33329.
2951  static const auto ThreeWayOptions = [this]() {
2952  TTI::MemCmpExpansionOptions Options;
2953  if (ST->is64Bit()) {
2954  Options.LoadSizes.push_back(8);
2955  }
2956  Options.LoadSizes.push_back(4);
2957  Options.LoadSizes.push_back(2);
2958  Options.LoadSizes.push_back(1);
2959  return Options;
2960  }();
2961  static const auto EqZeroOptions = [this]() {
2962  TTI::MemCmpExpansionOptions Options;
2963  // TODO: enable AVX512 when the DAG is ready.
2964  // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
2965  if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
2966  if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
2967  if (ST->is64Bit()) {
2968  Options.LoadSizes.push_back(8);
2969  }
2970  Options.LoadSizes.push_back(4);
2971  Options.LoadSizes.push_back(2);
2972  Options.LoadSizes.push_back(1);
2973  // All GPR and vector loads can be unaligned. SIMD compare requires integer
2974  // vectors (SSE2/AVX2).
2975  Options.AllowOverlappingLoads = true;
2976  return Options;
2977  }();
2978  return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
2979 }
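// Illustrative summary (not from the source): for equality-only comparisons
// the expansion may use 32-byte (AVX2), 16-byte (SSE2), 8-byte (64-bit GPR),
// 4-, 2- and 1-byte loads and allows overlapping loads, whereas the
// three-way (ordering) variant is restricted to the GPR sizes 8/4/2/1.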
2980 
2981 bool X86TTIImpl::enableInterleavedAccessVectorization() {
2982  // TODO: We expect this to be beneficial regardless of arch,
2983  // but there are currently some unexplained performance artifacts on Atom.
2984  // As a temporary solution, disable on Atom.
2985  return !(ST->isAtom());
2986 }
2987 
2988 // Get estimation for interleaved load/store operations for AVX2.
2989 // \p Factor is the interleaved-access factor (stride) - number of
2990 // (interleaved) elements in the group.
2991 // \p Indices contains the indices for a strided load: when the
2992 // interleaved load has gaps they indicate which elements are used.
2993 // If Indices is empty (or if the number of indices is equal to the size
2994 // of the interleaved-access as given in \p Factor) the access has no gaps.
2995 //
2996 // As opposed to AVX-512, AVX2 does not have generic shuffles that allow
2997 // computing the cost using a generic formula as a function of generic
2998 // shuffles. We therefore use a lookup table instead, filled according to
2999 // the instruction sequences that codegen currently generates.
3000 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
3001  unsigned Factor,
3002  ArrayRef<unsigned> Indices,
3003  unsigned Alignment,
3004  unsigned AddressSpace,
3005  bool UseMaskForCond,
3006  bool UseMaskForGaps) {
3007 
3008  if (UseMaskForCond || UseMaskForGaps)
3009  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3010  Alignment, AddressSpace,
3011  UseMaskForCond, UseMaskForGaps);
3012 
3013  // We currently support only fully-interleaved groups, with no gaps.
3014  // TODO: Support also strided loads (interleaved-groups with gaps).
3015  if (Indices.size() && Indices.size() != Factor)
3016  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3017  Alignment, AddressSpace);
3018 
3019  // VecTy for interleave memop is <VF*Factor x Elt>.
3020  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3021  // VecTy = <12 x i32>.
3022  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3023 
3024  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
3025  // the VF=2, while v2i128 is an unsupported MVT vector type
3026  // (see MachineValueType.h::getVectorVT()).
3027  if (!LegalVT.isVector())
3028  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3029  Alignment, AddressSpace);
3030 
3031  unsigned VF = VecTy->getVectorNumElements() / Factor;
3032  Type *ScalarTy = VecTy->getVectorElementType();
3033 
3034  // Calculate the number of memory operations (NumOfMemOps), required
3035  // for load/store the VecTy.
3036  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3037  unsigned LegalVTSize = LegalVT.getStoreSize();
3038  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
3039 
3040  // Get the cost of one memory operation.
3041  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3042  LegalVT.getVectorNumElements());
3043  unsigned MemOpCost =
3044  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3045 
3046  VectorType *VT = VectorType::get(ScalarTy, VF);
3047  EVT ETy = TLI->getValueType(DL, VT);
3048  if (!ETy.isSimple())
3049  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3050  Alignment, AddressSpace);
3051 
3052  // TODO: Complete for other data-types and strides.
3053  // Each combination of Stride, ElementTy and VF results in a different
3054  // sequence; The cost tables are therefore accessed with:
3055  // Factor (stride) and VectorType=VFxElemType.
3056  // The Cost accounts only for the shuffle sequence;
3057  // The cost of the loads/stores is accounted for separately.
3058  //
3059  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
3060  { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
3061  { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
3062 
3063  { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
3064  { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
3065  { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
3066  { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
3067  { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
3068  { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32
3069 
3070  { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
3071  { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
3072  { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
3073  { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
3074  { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
3075 
3076  { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32
3077  };
3078 
3079  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
3080  { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
3081  { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
3082 
3083  { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
3084  { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
3085  { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
3086  { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
3087  { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
3088 
3089  { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
3090  { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
3091  { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
3092  { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
3093  { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
3094  };
3095 
3096  if (Opcode == Instruction::Load) {
3097  if (const auto *Entry =
3098  CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
3099  return NumOfMemOps * MemOpCost + Entry->Cost;
3100  } else {
3101  assert(Opcode == Instruction::Store &&
3102  "Expected Store Instruction at this point");
3103  if (const auto *Entry =
3104  CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
3105  return NumOfMemOps * MemOpCost + Entry->Cost;
3106  }
3107 
3108  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3109  Alignment, AddressSpace);
3110 }
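// Worked example (illustrative only; assumes fast unaligned 32-byte access):
// a stride-2 deinterleaving load of <8 x i64> on AVX2 legalizes to two
// 32-byte memory ops (NumOfMemOps == 2, MemOpCost == 1) and hits the
// {2, v4i64, 6} entry, so the reported cost is 2 * 1 + 6 = 8.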
3111 
3112 // Get estimation for interleaved load/store operations and strided load.
3113 // \p Indices contains indices for strided load.
3114 // \p Factor - the factor of interleaving.
3115 // AVX-512 provides 3-src shuffles that significantly reduces the cost.
3116 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
3117  unsigned Factor,
3118  ArrayRef<unsigned> Indices,
3119  unsigned Alignment,
3120  unsigned AddressSpace,
3121  bool UseMaskForCond,
3122  bool UseMaskForGaps) {
3123 
3124  if (UseMaskForCond || UseMaskForGaps)
3125  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3126  Alignment, AddressSpace,
3127  UseMaskForCond, UseMaskForGaps);
3128 
3129  // VecTy for interleave memop is <VF*Factor x Elt>.
3130  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
3131  // VecTy = <12 x i32>.
3132 
3133  // Calculate the number of memory operations (NumOfMemOps), required
3134  // for load/store the VecTy.
3135  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
3136  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
3137  unsigned LegalVTSize = LegalVT.getStoreSize();
3138  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
3139 
3140  // Get the cost of one memory operation.
3141  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
3142  LegalVT.getVectorNumElements());
3143  unsigned MemOpCost =
3144  getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
3145 
3146  unsigned VF = VecTy->getVectorNumElements() / Factor;
3147  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
3148 
3149  if (Opcode == Instruction::Load) {
3150  // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
3151  // contain the cost of the optimized shuffle sequence that the
3152  // X86InterleavedAccess pass will generate.
3153  // The cost of loads and stores are computed separately from the table.
3154 
3155  // X86InterleavedAccess support only the following interleaved-access group.
3156  static const CostTblEntry AVX512InterleavedLoadTbl[] = {
3157  {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
3158  {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
3159  {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
3160  };
3161 
3162  if (const auto *Entry =
3163  CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
3164  return NumOfMemOps * MemOpCost + Entry->Cost;
3165  // If an entry does not exist, fall back to the default implementation.
3166 
3167  // The kind of shuffle depends on the number of loaded values.
3168  // If we load the entire data in one register, we can use a 1-src shuffle.
3169  // Otherwise, we'll merge 2 sources in each operation.
3170  TTI::ShuffleKind ShuffleKind =
3171  (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
3172 
3173  unsigned ShuffleCost =
3174  getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
3175 
3176  unsigned NumOfLoadsInInterleaveGrp =
3177  Indices.size() ? Indices.size() : Factor;
3178  Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
3179  VecTy->getVectorNumElements() / Factor);
3180  unsigned NumOfResults =
3181  getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
3182  NumOfLoadsInInterleaveGrp;
3183 
3184  // About half of the loads may be folded into shuffles when we have only
3185  // one result. If we have more than one result, we do not fold loads at all.
3186  unsigned NumOfUnfoldedLoads =
3187  NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
3188 
3189  // Get the number of shuffle operations per result.
3190  unsigned NumOfShufflesPerResult =
3191  std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
3192 
3193  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
3194  // When we have more than one destination, we need additional instructions
3195  // to preserve the sources.
3196  unsigned NumOfMoves = 0;
3197  if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
3198  NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
3199 
3200  int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
3201  NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
3202 
3203  return Cost;
3204  }
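// Worked example for the fallback path above (editorial sketch, continuing the
// <48 x i32>, Factor = 3 numbers): NumOfMemOps = 3 > 1, so ShuffleKind is
// SK_PermuteTwoSrc. With all three indices requested,
// NumOfLoadsInInterleaveGrp = 3 and ResultTy = <16 x i32> legalizes in a single
// step, so NumOfResults = 3, NumOfUnfoldedLoads = 3, NumOfShufflesPerResult = 2
// and NumOfMoves = 3, giving an estimate of 6 * ShuffleCost + 3 * MemOpCost + 3.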
3205 
3206  // Store.
3207  assert(Opcode == Instruction::Store &&
3208  "Expected Store Instruction at this point");
3209  // X86InterleavedAccess supports only the following interleaved-access groups.
3210  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
3211  {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
3212  {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
3213  {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
3214 
3215  {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
3216  {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
3217  {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
3218  {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
3219  };
3220 
3221  if (const auto *Entry =
3222  CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
3223  return NumOfMemOps * MemOpCost + Entry->Cost;
3224  // If an entry does not exist, fall back to the default implementation.
3225 
3226  // There are no strided stores at the moment, and a store can't be folded
3227  // into a shuffle.
3228  unsigned NumOfSources = Factor; // The number of values to be merged.
3229  unsigned ShuffleCost =
3230  getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
3231  unsigned NumOfShufflesPerStore = NumOfSources - 1;
3232 
3233  // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
3234  // We need additional instructions to preserve the sources.
3235  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
3236  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
3237  NumOfMoves;
3238  return Cost;
3239 }
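// Worked example for the store fallback above (editorial sketch): for a
// Factor = 4 store of <32 x i32> on AVX-512, LegalVT is v16i32, so
// NumOfMemOps = 2, NumOfSources = 4 and NumOfShufflesPerStore = 3; hence
// NumOfMoves = 2 * 3 / 2 = 3 and the estimate is
// 2 * (MemOpCost + 3 * ShuffleCost) + 3.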
3240 
3241 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
3242  unsigned Factor,
3243  ArrayRef<unsigned> Indices,
3244  unsigned Alignment,
3245  unsigned AddressSpace,
3246  bool UseMaskForCond,
3247  bool UseMaskForGaps) {
3248  auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
3249  Type *EltTy = VecTy->getVectorElementType();
3250  if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
3251  EltTy->isIntegerTy(32) || EltTy->isPointerTy())
3252  return true;
3253  if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
3254  return HasBW;
3255  return false;
3256  };
3257  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
3258  return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
3259  Alignment, AddressSpace,
3260  UseMaskForCond, UseMaskForGaps);
3261  if (ST->hasAVX2())
3262  return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
3263  Alignment, AddressSpace,
3264  UseMaskForCond, UseMaskForGaps);
3265 
3266  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3267  Alignment, AddressSpace,
3268  UseMaskForCond, UseMaskForGaps);
3269 }
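The following is an editorial usage sketch (not part of the original file) showing
how a client pass might query the hook above through the TargetTransformInfo
interface; the helper name and the concrete type and factor are illustrative
assumptions, while the getInterleavedMemoryOpCost call matches the TTI signature
used in this file.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Estimate the cost of a stride-3 interleaved load of <48 x i8>
// (Factor = 3, all three members of the group requested).
static int queryInterleavedLoadCost(const TargetTransformInfo &TTI,
                                    LLVMContext &Ctx) {
  Type *VecTy = VectorType::get(Type::getInt8Ty(Ctx), 48);
  unsigned Indices[] = {0, 1, 2};
  return TTI.getInterleavedMemoryOpCost(Instruction::Load, VecTy,
                                        /*Factor=*/3, Indices,
                                        /*Alignment=*/1, /*AddressSpace=*/0);
}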