LLVM  8.0.1
TargetTransformInfo.h
1 //===- TargetTransformInfo.h ------------------------------------*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// \file
10 /// This pass exposes codegen information to IR-level passes. Every
11 /// transformation that uses codegen information is broken into three parts:
12 /// 1. The IR-level analysis pass.
13 /// 2. The IR-level transformation interface which provides the needed
14 /// information.
15 /// 3. Codegen-level implementation which uses target-specific hooks.
16 ///
17 /// This file defines #2, which is the interface that IR-level transformations
18 /// use for querying the codegen.
19 ///
20 //===----------------------------------------------------------------------===//
21 
22 #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
23 #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
24 
25 #include "llvm/ADT/Optional.h"
26 #include "llvm/IR/Operator.h"
27 #include "llvm/IR/PassManager.h"
28 #include "llvm/Pass.h"
29 #include "llvm/Support/AtomicOrdering.h"
30 #include "llvm/Support/DataTypes.h"
31 #include <functional>
32 
33 namespace llvm {
34 
35 namespace Intrinsic {
36 enum ID : unsigned;
37 }
38 
39 class Function;
40 class GlobalValue;
41 class IntrinsicInst;
42 class LoadInst;
43 class Loop;
44 class SCEV;
45 class ScalarEvolution;
46 class StoreInst;
47 class SwitchInst;
48 class Type;
49 class User;
50 class Value;
51 
52 /// Information about a load/store intrinsic defined by the target.
53 struct MemIntrinsicInfo {
54  /// This is the pointer that the intrinsic is loading from or storing to.
55  /// If this is non-null, then analysis/optimization passes can assume that
56  /// this intrinsic is functionally equivalent to a load/store from this
57  /// pointer.
58  Value *PtrVal = nullptr;
59 
60  // Ordering for atomic operations.
61  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
62 
63  // Same Id is set by the target for corresponding load/store intrinsics.
64  unsigned short MatchingId = 0;
65 
66  bool ReadMem = false;
67  bool WriteMem = false;
68  bool IsVolatile = false;
69 
70  bool isUnordered() const {
71  return (Ordering == AtomicOrdering::NotAtomic ||
72  Ordering == AtomicOrdering::Unordered) && !IsVolatile;
73  }
74 };
75 
76 /// This pass provides access to the codegen interfaces that are needed
77 /// for IR-level transformations.
78 class TargetTransformInfo {
79 public:
80  /// Construct a TTI object using a type implementing the \c Concept
81  /// API below.
82  ///
83  /// This is used by targets to construct a TTI wrapping their target-specific
84  /// implementation that encodes appropriate costs for their target.
85  template <typename T> TargetTransformInfo(T Impl);
86 
87  /// Construct a baseline TTI object using a minimal implementation of
88  /// the \c Concept API below.
89  ///
90  /// The TTI implementation will reflect the information in the DataLayout
91  /// provided if non-null.
92  explicit TargetTransformInfo(const DataLayout &DL);
93 
94  // Provide move semantics.
95  TargetTransformInfo(TargetTransformInfo &&Arg);
96  TargetTransformInfo &operator=(TargetTransformInfo &&RHS);
97 
98  // We need to define the destructor out-of-line to define our sub-classes
99  // out-of-line.
100  ~TargetTransformInfo();
101 
102  /// Handle the invalidation of this information.
103  ///
104  /// When used as a result of \c TargetIRAnalysis this method will be called
105  /// when the function this was computed for changes. When it returns false,
106  /// the information is preserved across those changes.
107  bool invalidate(Function &, const PreservedAnalyses &,
108                  FunctionAnalysisManager::Invalidator &) {
109  // FIXME: We should probably in some way ensure that the subtarget
110  // information for a function hasn't changed.
111  return false;
112  }
113 
114  /// \name Generic Target Information
115  /// @{
116 
117  /// The kind of cost model.
118  ///
119  /// There are several different cost models that can be customized by the
120  /// target. The normalization of each cost model may be target specific.
121  enum TargetCostKind {
122  TCK_RecipThroughput, ///< Reciprocal throughput.
123  TCK_Latency, ///< The latency of instruction.
124  TCK_CodeSize ///< Instruction code size.
125  };
126 
127  /// Query the cost of a specified instruction.
128  ///
129  /// Clients should use this interface to query the cost of an existing
130  /// instruction. The instruction must have a valid parent (basic block).
131  ///
132  /// Note, this method does not cache the cost calculation and it
133  /// can be expensive in some cases.
134  int getInstructionCost(const Instruction *I, enum TargetCostKind kind) const {
135  switch (kind){
136  case TCK_RecipThroughput:
137  return getInstructionThroughput(I);
138 
139  case TCK_Latency:
140  return getInstructionLatency(I);
141 
142  case TCK_CodeSize:
143  return getUserCost(I);
144  }
145  llvm_unreachable("Unknown instruction cost kind");
146  }
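  // Illustration only (not part of the original header): a minimal sketch of
  // how an IR-level pass holding a `TargetTransformInfo &TTI` (hypothetical
  // variable) might query the three cost kinds for an instruction `I`:
  //
  //   int Thru = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput);
  //   int Lat  = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_Latency);
  //   int Size = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);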
147 
148  /// Underlying constants for 'cost' values in this interface.
149  ///
150  /// Many APIs in this interface return a cost. This enum defines the
151  /// fundamental values that should be used to interpret (and produce) those
152  /// costs. The costs are returned as an int rather than a member of this
153  /// enumeration because it is expected that the cost of one IR instruction
154  /// may have a multiplicative factor to it or otherwise won't fit directly
155  /// into the enum. Moreover, it is common to sum or average costs which works
156  /// better as simple integral values. Thus this enum only provides constants.
157  /// Also note that the returned costs are signed integers to make it natural
158  /// to add, subtract, and test with zero (a common boundary condition). It is
159  /// not expected that 2^32 is a realistic cost to be modeling at any point.
160  ///
161  /// Note that these costs should usually reflect the intersection of code-size
162  /// cost and execution cost. A free instruction is typically one that folds
163  /// into another instruction. For example, reg-to-reg moves can often be
164  /// skipped by renaming the registers in the CPU, but they still are encoded
165  /// and thus wouldn't be considered 'free' here.
166  enum TargetCostConstants {
167  TCC_Free = 0, ///< Expected to fold away in lowering.
168  TCC_Basic = 1, ///< The cost of a typical 'add' instruction.
169  TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86.
170  };
171 
172  /// Estimate the cost of a specific operation when lowered.
173  ///
174  /// Note that this is designed to work on an arbitrary synthetic opcode, and
175  /// thus work for hypothetical queries before an instruction has even been
176  /// formed. However, this does *not* work for GEPs, and must not be called
177  /// for a GEP instruction. Instead, use the dedicated getGEPCost interface as
178  /// analyzing a GEP's cost requires more information.
179  ///
180  /// Typically only the result type is required, and the operand type can be
181  /// omitted. However, if the opcode is one of the cast instructions, the
182  /// operand type is required.
183  ///
184  /// The returned cost is defined in terms of \c TargetCostConstants, see its
185  /// comments for a detailed explanation of the cost values.
186  int getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy = nullptr) const;
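  // Illustration only (not part of the original header): querying a synthetic
  // opcode before any instruction exists, assuming `TTI` and the `Type *`
  // values Int64Ty/Int32Ty are already available to the caller:
  //
  //   int MulCost   = TTI.getOperationCost(Instruction::Mul, Int64Ty);
  //   int TruncCost = TTI.getOperationCost(Instruction::Trunc, Int32Ty, Int64Ty);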
187 
188  /// Estimate the cost of a GEP operation when lowered.
189  ///
190  /// The contract for this function is the same as \c getOperationCost except
191  /// that it supports an interface that provides extra information specific to
192  /// the GEP operation.
193  int getGEPCost(Type *PointeeType, const Value *Ptr,
194  ArrayRef<const Value *> Operands) const;
195 
196  /// Estimate the cost of an EXT operation when lowered.
197  ///
198  /// The contract for this function is the same as \c getOperationCost except
199  /// that it supports an interface that provides extra information specific to
200  /// the EXT operation.
201  int getExtCost(const Instruction *I, const Value *Src) const;
202 
203  /// Estimate the cost of a function call when lowered.
204  ///
205  /// The contract for this is the same as \c getOperationCost except that it
206  /// supports an interface that provides extra information specific to call
207  /// instructions.
208  ///
209  /// This is the most basic query for estimating call cost: it only knows the
210  /// function type and (potentially) the number of arguments at the call site.
211  /// The latter is only interesting for varargs function types.
212  int getCallCost(FunctionType *FTy, int NumArgs = -1) const;
213 
214  /// Estimate the cost of calling a specific function when lowered.
215  ///
216  /// This overload adds the ability to reason about the particular function
217  /// being called in the event it is a library call with special lowering.
218  int getCallCost(const Function *F, int NumArgs = -1) const;
219 
220  /// Estimate the cost of calling a specific function when lowered.
221  ///
222  /// This overload allows specifying a set of candidate argument values.
223  int getCallCost(const Function *F, ArrayRef<const Value *> Arguments) const;
224 
225  /// \returns A value by which our inlining threshold should be multiplied.
226  /// This is primarily used to bump up the inlining threshold wholesale on
227  /// targets where calls are unusually expensive.
228  ///
229  /// TODO: This is a rather blunt instrument. Perhaps altering the costs of
230  /// individual classes of instructions would be better.
231  unsigned getInliningThresholdMultiplier() const;
232 
233  /// Estimate the cost of an intrinsic when lowered.
234  ///
235  /// Mirrors the \c getCallCost method but uses an intrinsic identifier.
236  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
237  ArrayRef<Type *> ParamTys) const;
238 
239  /// Estimate the cost of an intrinsic when lowered.
240  ///
241  /// Mirrors the \c getCallCost method but uses an intrinsic identifier.
242  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
243                       ArrayRef<const Value *> Arguments) const;
244 
245  /// \return The estimated number of case clusters when lowering \p 'SI'.
246  /// \p JTSize Set a jump table size only when \p SI is suitable for a jump
247  /// table.
248  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
249  unsigned &JTSize) const;
250 
251  /// Estimate the cost of a given IR user when lowered.
252  ///
253  /// This can estimate the cost of either a ConstantExpr or Instruction when
254  /// lowered. It has two primary advantages over the \c getOperationCost and
255  /// \c getGEPCost above, and one significant disadvantage: it can only be
256  /// used when the IR construct has already been formed.
257  ///
258  /// The advantages are that it can inspect the SSA use graph to reason more
259  /// accurately about the cost. For example, all-constant-GEPs can often be
260  /// folded into a load or other instruction, but if they are used in some
261  /// other context they may not be folded. This routine can distinguish such
262  /// cases.
263  ///
264  /// \p Operands is a list of operands which can be a result of transformations
265  /// of the current operands. The number of operands on the list must equal
266  /// the number of the current operands the IR user has. Their order on the
267  /// list must be the same as the order of the current operands the IR user
268  /// has.
269  ///
270  /// The returned cost is defined in terms of \c TargetCostConstants, see its
271  /// comments for a detailed explanation of the cost values.
272  int getUserCost(const User *U, ArrayRef<const Value *> Operands) const;
273 
274  /// This is a helper function which calls the two-argument getUserCost
275  /// with \p Operands which are the current operands U has.
276  int getUserCost(const User *U) const {
277  SmallVector<const Value *, 4> Operands(U->value_op_begin(),
278  U->value_op_end());
279  return getUserCost(U, Operands);
280  }
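  // Illustration only (not part of the original header): a client can also ask
  // what `U` would cost if its operands were replaced, passing a hypothetical
  // `NewOps` list with the same number and order as U's current operands:
  //
  //   SmallVector<const Value *, 4> NewOps = /* transformed operands */;
  //   int Cost = TTI.getUserCost(U, NewOps);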
281 
282  /// Return true if branch divergence exists.
283  ///
284  /// Branch divergence has a significantly negative impact on GPU performance
285  /// when threads in the same wavefront take different paths due to conditional
286  /// branches.
287  bool hasBranchDivergence() const;
288 
289  /// Returns whether V is a source of divergence.
290  ///
291  /// This function provides the target-dependent information for
292  /// the target-independent LegacyDivergenceAnalysis. LegacyDivergenceAnalysis first
293  /// builds the dependency graph, and then runs the reachability algorithm
294  /// starting with the sources of divergence.
295  bool isSourceOfDivergence(const Value *V) const;
296 
297  // Returns true for the target-specific
298  // set of operations which produce a uniform result
299  // even when taking non-uniform arguments.
300  bool isAlwaysUniform(const Value *V) const;
301 
302  /// Returns the address space ID for a target's 'flat' address space. Note
303  /// this is not necessarily the same as addrspace(0), which LLVM sometimes
304  /// refers to as the generic address space. The flat address space is a
305  /// generic address space that can be used to access multiple segments of memory
306  /// with different address spaces. Access of a memory location through a
307  /// pointer with this address space is expected to be legal but slower
308  /// compared to the same memory location accessed through a pointer with a
309  /// different address space.
310  //
311  /// This is for targets with different pointer representations which can
312  /// be converted with the addrspacecast instruction. If a pointer is converted
313  /// to this address space, optimizations should attempt to replace the access
314  /// with the source address space.
315  ///
316  /// \returns ~0u if the target does not have such a flat address space to
317  /// optimize away.
318  unsigned getFlatAddressSpace() const;
319 
320  /// Test whether calls to a function lower to actual program function
321  /// calls.
322  ///
323  /// The idea is to test whether the program is likely to require a 'call'
324  /// instruction or equivalent in order to call the given function.
325  ///
326  /// FIXME: It's not clear that this is a good or useful query API. Clients
327  /// should probably move to simpler cost metrics using the above.
328  /// Alternatively, we could split the cost interface into distinct code-size
329  /// and execution-speed costs. This would allow modelling the core of this
330  /// query more accurately as a call is a single small instruction, but
331  /// incurs significant execution cost.
332  bool isLoweredToCall(const Function *F) const;
333 
334  struct LSRCost {
335  /// TODO: Some of these could be merged. Also, a lexical ordering
336  /// isn't always optimal.
337  unsigned Insns;
338  unsigned NumRegs;
339  unsigned AddRecCost;
340  unsigned NumIVMuls;
341  unsigned NumBaseAdds;
342  unsigned ImmCost;
343  unsigned SetupCost;
344  unsigned ScaleCost;
345  };
346 
347  /// Parameters that control the generic loop unrolling transformation.
348  struct UnrollingPreferences {
349  /// The cost threshold for the unrolled loop. Should be relative to the
350  /// getUserCost values returned by this API, and the expectation is that
351  /// the unrolled loop's instructions when run through that interface should
352  /// not exceed this cost. However, this is only an estimate. Also, specific
353  /// loops may be unrolled even with a cost above this threshold if deemed
354  /// profitable. Set this to UINT_MAX to disable the loop body cost
355  /// restriction.
356  unsigned Threshold;
357  /// If complete unrolling will reduce the cost of the loop, we will boost
358  /// the Threshold by a certain percent to allow more aggressive complete
359  /// unrolling. This value provides the maximum boost percentage that we
360  /// can apply to Threshold (The value should be no less than 100).
361  /// BoostedThreshold = Threshold * min(RolledCost / UnrolledCost,
362  /// MaxPercentThresholdBoost / 100)
363  /// E.g. if complete unrolling reduces the loop execution time by 50%
364  /// then we boost the threshold by the factor of 2x. If unrolling is not
365  /// expected to reduce the running time, then we do not increase the
366  /// threshold.
367  unsigned MaxPercentThresholdBoost;
368  /// The cost threshold for the unrolled loop when optimizing for size (set
369  /// to UINT_MAX to disable).
370  unsigned OptSizeThreshold;
371  /// The cost threshold for the unrolled loop, like Threshold, but used
372  /// for partial/runtime unrolling (set to UINT_MAX to disable).
373  unsigned PartialThreshold;
374  /// The cost threshold for the unrolled loop when optimizing for size, like
375  /// OptSizeThreshold, but used for partial/runtime unrolling (set to
376  /// UINT_MAX to disable).
377  unsigned PartialOptSizeThreshold;
378  /// A forced unrolling factor (the number of concatenated bodies of the
379  /// original loop in the unrolled loop body). When set to 0, the unrolling
380  /// transformation will select an unrolling factor based on the current cost
381  /// threshold and other factors.
382  unsigned Count;
383  /// A forced peeling factor (the number of bodies of the original loop
384  /// that should be peeled off before the loop body). When set to 0, the
385  /// unrolling transformation will select a peeling factor based on profile
386  /// information and other factors.
387  unsigned PeelCount;
388  /// Default unroll count for loops with run-time trip count.
389  unsigned DefaultUnrollRuntimeCount;
390  // Set the maximum unrolling factor. The unrolling factor may be selected
391  // using the appropriate cost threshold, but may not exceed this number
392  // (set to UINT_MAX to disable). This does not apply in cases where the
393  // loop is being fully unrolled.
394  unsigned MaxCount;
395  /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but
396  /// applies even if full unrolling is selected. This allows a target to fall
397  /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount.
398  unsigned FullUnrollMaxCount;
399  // Represents number of instructions optimized when "back edge"
400  // becomes "fall through" in unrolled loop.
401  // For now we count a conditional branch on a backedge and a comparison
402  // feeding it.
403  unsigned BEInsns;
404  /// Allow partial unrolling (unrolling of loops to expand the size of the
405  /// loop body, not only to eliminate small constant-trip-count loops).
406  bool Partial;
407  /// Allow runtime unrolling (unrolling of loops to expand the size of the
408  /// loop body even when the number of loop iterations is not known at
409  /// compile time).
410  bool Runtime;
411  /// Allow generation of a loop remainder (extra iterations after unroll).
412  bool AllowRemainder;
413  /// Allow emitting expensive instructions (such as divisions) when computing
414  /// the trip count of a loop for runtime unrolling.
415  bool AllowExpensiveTripCount;
416  /// Apply loop unroll on any kind of loop
417  /// (mainly to loops that fail runtime unrolling).
418  bool Force;
419  /// Allow using trip count upper bound to unroll loops.
420  bool UpperBound;
421  /// Allow peeling off loop iterations for loops with low dynamic tripcount.
422  bool AllowPeeling;
423  /// Allow unrolling of all the iterations of the runtime loop remainder.
424  bool UnrollRemainder;
425  /// Allow unroll and jam. Used to enable unroll and jam for the target.
426  bool UnrollAndJam;
427  /// Threshold for unroll and jam, for inner loop size. The 'Threshold'
428  /// value above is used during unroll and jam for the outer loop size.
429  /// This value is used in the same manner to limit the size of the inner
430  /// loop.
431  unsigned UnrollAndJamInnerLoopThreshold;
432  };
433 
434  /// Get target-customized preferences for the generic loop unrolling
435  /// transformation. The caller will initialize UP with the current
436  /// target-independent defaults.
437  void getUnrollingPreferences(Loop *L, ScalarEvolution &,
438  UnrollingPreferences &UP) const;
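  // Illustration only (not part of the original header): a hypothetical target
  // implementation (`MyTTIImpl`, with `TTI` as a local alias for
  // TargetTransformInfo) tightening the defaults it is handed:
  //
  //   void MyTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
  //                                           TTI::UnrollingPreferences &UP) {
  //     UP.Partial = true;   // allow partial unrolling on this target
  //     UP.MaxCount = 4;     // but never unroll by more than 4x
  //   }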
439 
440  /// @}
441 
442  /// \name Scalar Target Information
443  /// @{
444 
445  /// Flags indicating the kind of support for population count.
446  ///
447  /// Compared to the SW implementation, HW support is supposed to
448  /// significantly boost the performance when the population is dense, and it
449  /// may or may not degrade performance if the population is sparse. A HW
450  /// support is considered as "Fast" if it can outperform, or is on a par
451  /// with, SW implementation when the population is sparse; otherwise, it is
452  /// considered as "Slow".
453  enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware };
454 
455  /// Return true if the specified immediate is a legal add immediate, that
456  /// is the target has add instructions which can add a register with the
457  /// immediate without having to materialize the immediate into a register.
458  bool isLegalAddImmediate(int64_t Imm) const;
459 
460  /// Return true if the specified immediate is a legal icmp immediate,
461  /// that is the target has icmp instructions which can compare a register
462  /// against the immediate without having to materialize the immediate into a
463  /// register.
464  bool isLegalICmpImmediate(int64_t Imm) const;
465 
466  /// Return true if the addressing mode represented by AM is legal for
467  /// this target, for a load/store of the specified type.
468  /// The type may be VoidTy, in which case only return true if the addressing
469  /// mode is legal for a load/store of any legal type.
470  /// If target returns true in LSRWithInstrQueries(), I may be valid.
471  /// TODO: Handle pre/postinc as well.
472  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
473  bool HasBaseReg, int64_t Scale,
474  unsigned AddrSpace = 0,
475  Instruction *I = nullptr) const;
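  // Illustration only (not part of the original header), using a hypothetical
  // Int64Ty: is "BaseReg + 8*IndexReg + 16" a legal mode for an i64 access?
  //
  //   bool Legal = TTI.isLegalAddressingMode(Int64Ty, /*BaseGV=*/nullptr,
  //                                          /*BaseOffset=*/16,
  //                                          /*HasBaseReg=*/true, /*Scale=*/8);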
476 
477  /// Return true if LSR cost of C1 is lower than C2.
478  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
479  TargetTransformInfo::LSRCost &C2) const;
480 
481  /// Return true if the target can fuse a compare and branch.
482  /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
483  /// calculation for the instructions in a loop.
484  bool canMacroFuseCmp() const;
485 
486  /// \return True if LSR should make efforts to create/preserve post-inc
487  /// addressing mode expressions.
488  bool shouldFavorPostInc() const;
489 
490  /// Return true if the target supports masked load/store.
491  /// AVX2 and AVX-512 targets allow masks for consecutive load and store.
492  bool isLegalMaskedStore(Type *DataType) const;
493  bool isLegalMaskedLoad(Type *DataType) const;
494 
495  /// Return true if the target supports masked gather/scatter.
496  /// AVX-512 fully supports gather and scatter for vectors with 32 and 64
497  /// bits scalar type.
498  bool isLegalMaskedScatter(Type *DataType) const;
499  bool isLegalMaskedGather(Type *DataType) const;
500 
501  /// Return true if the target has a unified operation to calculate division
502  /// and remainder. If so, the additional implicit multiplication and
503  /// subtraction required to calculate a remainder from division are free. This
504  /// can enable more aggressive transformations for division and remainder than
505  /// would typically be allowed using throughput or size cost models.
506  bool hasDivRemOp(Type *DataType, bool IsSigned) const;
507 
508  /// Return true if the given instruction (assumed to be a memory access
509  /// instruction) has a volatile variant. If that's the case then we can avoid
510  /// addrspacecast to generic AS for volatile loads/stores. Default
511  /// implementation returns false, which prevents address space inference for
512  /// volatile loads/stores.
513  bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const;
514 
515  /// Return true if the target doesn't mind addresses in vectors.
516  bool prefersVectorizedAddressing() const;
517 
518  /// Return the cost of the scaling factor used in the addressing
519  /// mode represented by AM for this target, for a load/store
520  /// of the specified type.
521  /// If the AM is supported, the return value must be >= 0.
522  /// If the AM is not supported, it returns a negative value.
523  /// TODO: Handle pre/postinc as well.
524  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
525  bool HasBaseReg, int64_t Scale,
526  unsigned AddrSpace = 0) const;
527 
528  /// Return true if the loop strength reduce pass should make
529  /// Instruction* based TTI queries to isLegalAddressingMode(). This is
530  /// needed on SystemZ, where e.g. a memcpy can only have a 12 bit unsigned
531  /// immediate offset and no index register.
532  bool LSRWithInstrQueries() const;
533 
534  /// Return true if it's free to truncate a value of type Ty1 to type
535  /// Ty2. e.g. On x86 it's free to truncate an i32 value in register EAX to i16
536  /// by referencing its sub-register AX.
537  bool isTruncateFree(Type *Ty1, Type *Ty2) const;
538 
539  /// Return true if it is profitable to hoist an instruction from the
540  /// then/else block to before the if.
541  bool isProfitableToHoist(Instruction *I) const;
542 
543  bool useAA() const;
544 
545  /// Return true if this type is legal.
546  bool isTypeLegal(Type *Ty) const;
547 
548  /// Returns the target's jmp_buf alignment in bytes.
549  unsigned getJumpBufAlignment() const;
550 
551  /// Returns the target's jmp_buf size in bytes.
552  unsigned getJumpBufSize() const;
553 
554  /// Return true if switches should be turned into lookup tables for the
555  /// target.
556  bool shouldBuildLookupTables() const;
557 
558  /// Return true if switches should be turned into lookup tables
559  /// containing this constant value for the target.
560  bool shouldBuildLookupTablesForConstant(Constant *C) const;
561 
562  /// Return true if the input function, which is cold at all call sites,
563  /// should use the coldcc calling convention.
564  bool useColdCCForColdCall(Function &F) const;
565 
566  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
567 
568  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
569  unsigned VF) const;
570 
571  /// If target has efficient vector element load/store instructions, it can
572  /// return true here so that insertion/extraction costs are not added to
573  /// the scalarization cost of a load/store.
574  bool supportsEfficientVectorElementLoadStore() const;
575 
576  /// Don't restrict interleaved unrolling to small loops.
577  bool enableAggressiveInterleaving(bool LoopHasReductions) const;
578 
579  /// If not nullptr, enable inline expansion of memcmp. IsZeroCmp is
580  /// true if this is the expansion of memcmp(p1, p2, s) == 0.
581  struct MemCmpExpansionOptions {
582  // The list of available load sizes (in bytes), sorted in decreasing order.
583  SmallVector<unsigned, 8> LoadSizes;
584  // Set to true to allow overlapping loads. For example, 7-byte compares can
585  // be done with two 4-byte compares instead of 4+2+1-byte compares. This
586  // requires all loads in LoadSizes to be doable in an unaligned way.
587  bool AllowOverlappingLoads = false;
588  };
589  const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsZeroCmp) const;
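  // Illustration only (not part of the original header): how a memcmp-expansion
  // client might consume the returned options (assuming `TTI` is in scope):
  //
  //   if (const auto *Options = TTI.enableMemCmpExpansion(/*IsZeroCmp=*/true))
  //     for (unsigned LoadSize : Options->LoadSizes)
  //       ; // emit compares using loads of LoadSize bytes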
590 
591  /// Enable matching of interleaved access groups.
592  bool enableInterleavedAccessVectorization() const;
593 
594  /// Enable matching of interleaved access groups that contain predicated
595  /// accesses or gaps and are therefore vectorized using masked
596  /// vector loads/stores.
597  bool enableMaskedInterleavedAccessVectorization() const;
598 
599  /// Indicate that it is potentially unsafe to automatically vectorize
600  /// floating-point operations because the semantics of vector and scalar
601  /// floating-point semantics may differ. For example, ARM NEON v7 SIMD math
602  /// floating-point operations may differ. For example, ARM NEON v7 SIMD math
603  /// platform, scalar floating-point math does.
604  /// This applies to floating-point math operations and calls, not memory
605  /// operations, shuffles, or casts.
606  bool isFPVectorizationPotentiallyUnsafe() const;
607 
608  /// Determine if the target supports unaligned memory accesses.
609  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
610  unsigned BitWidth, unsigned AddressSpace = 0,
611  unsigned Alignment = 1,
612  bool *Fast = nullptr) const;
613 
614  /// Return hardware support for population count.
615  PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
616 
617  /// Return true if the hardware has a fast square-root instruction.
618  bool haveFastSqrt(Type *Ty) const;
619 
620  /// Return true if it is faster to check if a floating-point value is NaN
621  /// (or not-NaN) versus a comparison against a constant FP zero value.
622  /// Targets should override this if materializing a 0.0 for comparison is
623  /// generally as cheap as checking for ordered/unordered.
624  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const;
625 
626  /// Return the expected cost of supporting the floating point operation
627  /// of the specified type.
628  int getFPOpCost(Type *Ty) const;
629 
630  /// Return the expected cost of materializing the given integer
631  /// immediate of the specified type.
632  int getIntImmCost(const APInt &Imm, Type *Ty) const;
633 
634  /// Return the expected cost of materialization for the given integer
635  /// immediate of the specified type for a given instruction. The cost can be
636  /// zero if the immediate can be folded into the specified instruction.
637  int getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
638  Type *Ty) const;
639  int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
640  Type *Ty) const;
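  // Illustration only (not part of the original header): a constant-hoisting
  // style client might compare the materialization cost of an immediate `Imm`
  // (an APInt, hypothetical variable) against the basic cost:
  //
  //   if (TTI.getIntImmCost(Imm, Int64Ty) > TargetTransformInfo::TCC_Basic)
  //     ; // candidate for hoisting/rebasing this constant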
641 
642  /// Return the expected cost for the given integer when optimising
643  /// for size. This is different than the other integer immediate cost
644  /// functions in that it is subtarget agnostic. This is useful when you e.g.
645  /// target one ISA such as AArch32 but smaller encodings could be possible
646  /// with another such as Thumb. This return value is used as a penalty when
647  /// the total cost for a constant is calculated (the bigger the cost, the
648  /// more beneficial constant hoisting is).
649  int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
650  Type *Ty) const;
651  /// @}
652 
653  /// \name Vector Target Information
654  /// @{
655 
656  /// The various kinds of shuffle patterns for vector queries.
657  enum ShuffleKind {
658  SK_Broadcast, ///< Broadcast element 0 to all other elements.
659  SK_Reverse, ///< Reverse the order of the vector.
660  SK_Select, ///< Selects elements from the corresponding lane of
661  ///< either source operand. This is equivalent to a
662  ///< vector select with a constant condition operand.
663  SK_Transpose, ///< Transpose two vectors.
664  SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
665  SK_ExtractSubvector, ///< ExtractSubvector. Index indicates start offset.
666  SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one
667  ///< with any shuffle mask.
668  SK_PermuteSingleSrc ///< Shuffle elements of single source vector with any
669  ///< shuffle mask.
670  };
671 
672  /// Additional information about an operand's possible values.
673  enum OperandValueKind {
674  OK_AnyValue, // Operand can have any value.
675  OK_UniformValue, // Operand is uniform (splat of a value).
676  OK_UniformConstantValue, // Operand is uniform constant.
677  OK_NonUniformConstantValue // Operand is a non-uniform constant value.
678  };
679 
680  /// Additional properties of an operand's values.
681  enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
682 
683  /// \return The number of scalar or vector registers that the target has.
684  /// If 'Vectors' is true, it returns the number of vector registers. If it is
685  /// set to false, it returns the number of scalar registers.
686  unsigned getNumberOfRegisters(bool Vector) const;
687 
688  /// \return The width of the largest scalar or vector register type.
689  unsigned getRegisterBitWidth(bool Vector) const;
690 
691  /// \return The width of the smallest vector register type.
692  unsigned getMinVectorRegisterBitWidth() const;
693 
694  /// \return True if the vectorization factor should be chosen to
695  /// make the vector of the smallest element type match the size of a
696  /// vector register. For wider element types, this could result in
697  /// creating vectors that span multiple vector registers.
698  /// If false, the vectorization factor will be chosen based on the
699  /// size of the widest element type.
700  bool shouldMaximizeVectorBandwidth(bool OptSize) const;
701 
702  /// \return The minimum vectorization factor for types of given element
703  /// bit width, or 0 if there is no minimum VF. The returned value only
704  /// applies when shouldMaximizeVectorBandwidth returns true.
705  unsigned getMinimumVF(unsigned ElemWidth) const;
706 
707  /// \return True if it should be considered for address type promotion.
708  /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
709  /// profitable without finding other extensions fed by the same input.
710  bool shouldConsiderAddressTypePromotion(
711  const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const;
712 
713  /// \return The size of a cache line in bytes.
714  unsigned getCacheLineSize() const;
715 
716  /// The possible cache levels
717  enum class CacheLevel {
718  L1D, // The L1 data cache
719  L2D, // The L2 data cache
720 
721  // We currently do not model L3 caches, as their sizes differ widely between
722  // microarchitectures. Also, we currently do not have a use for L3 cache
723  // size modeling yet.
724  };
725 
726  /// \return The size of the cache level in bytes, if available.
727  llvm::Optional<unsigned> getCacheSize(CacheLevel Level) const;
728 
729  /// \return The associativity of the cache level, if available.
730  llvm::Optional<unsigned> getCacheAssociativity(CacheLevel Level) const;
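  // Illustration only (not part of the original header): both cache queries
  // return llvm::Optional, so absent information is distinguishable from zero:
  //
  //   if (Optional<unsigned> L1Size =
  //           TTI.getCacheSize(TargetTransformInfo::CacheLevel::L1D))
  //     ; // *L1Size is the L1D capacity in bytes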
731 
732  /// \return How much before a load we should place the prefetch instruction.
733  /// This is currently measured in number of instructions.
734  unsigned getPrefetchDistance() const;
735 
736  /// \return Some HW prefetchers can handle accesses up to a certain constant
737  /// stride. This is the minimum stride in bytes where it makes sense to start
738  /// adding SW prefetches. The default is 1, i.e. prefetch with any stride.
739  unsigned getMinPrefetchStride() const;
740 
741  /// \return The maximum number of iterations to prefetch ahead. If the
742  /// required number of iterations is more than this number, no prefetching is
743  /// performed.
744  unsigned getMaxPrefetchIterationsAhead() const;
745 
746  /// \return The maximum interleave factor that any transform should try to
747  /// perform for this target. This number depends on the level of parallelism
748  /// and the number of execution units in the CPU.
749  unsigned getMaxInterleaveFactor(unsigned VF) const;
750 
751  /// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
752  static OperandValueKind getOperandInfo(Value *V,
753  OperandValueProperties &OpProps);
754 
755  /// This is an approximation of reciprocal throughput of a math/logic op.
756  /// A higher cost indicates less expected throughput.
757  /// From Agner Fog's guides, reciprocal throughput is "the average number of
758  /// clock cycles per instruction when the instructions are not part of a
759  /// limiting dependency chain."
760  /// Therefore, costs should be scaled to account for multiple execution units
761  /// on the target that can process this type of instruction. For example, if
762  /// there are 5 scalar integer units and 2 vector integer units that can
763  /// calculate an 'add' in a single cycle, this model should indicate that the
764  /// cost of the vector add instruction is 2.5 times the cost of the scalar
765  /// add instruction.
766  /// \p Args is an optional argument which holds the instruction operands
767  /// values so the TTI can analyze those values searching for special
768  /// cases or optimizations based on those values.
769  int getArithmeticInstrCost(
770  unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
771  OperandValueKind Opd2Info = OK_AnyValue,
772  OperandValueProperties Opd1PropInfo = OP_None,
773  OperandValueProperties Opd2PropInfo = OP_None,
774      ArrayRef<const Value *> Args = ArrayRef<const Value *>()) const;
775 
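  // Illustration only (not part of the original header): reciprocal-throughput
  // cost of a vector add whose second operand is a splat constant, using a
  // hypothetical vector type V4I32Ty:
  //
  //   int C = TTI.getArithmeticInstrCost(
  //       Instruction::Add, V4I32Ty, TargetTransformInfo::OK_AnyValue,
  //       TargetTransformInfo::OK_UniformConstantValue);
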
776  /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
777  /// The index and subtype parameters are used by the subvector insertion and
778  /// extraction shuffle kinds to show the insert/extract point and the type of
779  /// the subvector being inserted/extracted.
780  /// NOTE: For subvector extractions Tp represents the source type.
781  int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index = 0,
782  Type *SubTp = nullptr) const;
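  // Illustration only (not part of the original header), with a hypothetical
  // vector type V4I32Ty:
  //
  //   int Broadcast = TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, V4I32Ty);
  //   int Reverse   = TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, V4I32Ty);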
783 
784  /// \return The expected cost of cast instructions, such as bitcast, trunc,
785  /// zext, etc. If there is an existing instruction that holds Opcode, it
786  /// may be passed in the 'I' parameter.
787  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
788  const Instruction *I = nullptr) const;
789 
790  /// \return The expected cost of a sign- or zero-extended vector extract. Use
791  /// -1 to indicate that there is no information about the index value.
792  int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
793  unsigned Index = -1) const;
794 
795  /// \return The expected cost of control-flow related instructions such as
796  /// Phi, Ret, Br.
797  int getCFInstrCost(unsigned Opcode) const;
798 
799  /// \returns The expected cost of compare and select instructions. If there
800  /// is an existing instruction that holds Opcode, it may be passed in the
801  /// 'I' parameter.
802  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
803  Type *CondTy = nullptr, const Instruction *I = nullptr) const;
804 
805  /// \return The expected cost of vector Insert and Extract.
806  /// Use -1 to indicate that there is no information on the index value.
807  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index = -1) const;
808 
809  /// \return The cost of Load and Store instructions.
810  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
811  unsigned AddressSpace, const Instruction *I = nullptr) const;
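  // Illustration only (not part of the original header), with a hypothetical
  // vector type V4I32Ty:
  //
  //   int LoadCost = TTI.getMemoryOpCost(Instruction::Load, V4I32Ty,
  //                                      /*Alignment=*/16, /*AddressSpace=*/0);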
812 
813  /// \return The cost of masked Load and Store instructions.
814  int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
815  unsigned AddressSpace) const;
816 
817  /// \return The cost of Gather or Scatter operation
818  /// \p Opcode - is a type of memory access Load or Store
819  /// \p DataTy - a vector type of the data to be loaded or stored
820  /// \p Ptr - pointer [or vector of pointers] - address[es] in memory
821  /// \p VariableMask - true when the memory access is predicated with a mask
822  /// that is not a compile-time constant
823  /// \p Alignment - alignment of single element
824  int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
825  bool VariableMask, unsigned Alignment) const;
826 
827  /// \return The cost of the interleaved memory operation.
828  /// \p Opcode is the memory operation code
829  /// \p VecTy is the vector type of the interleaved access.
830  /// \p Factor is the interleave factor
831  /// \p Indices is the indices for interleaved load members (as interleaved
832  /// load allows gaps)
833  /// \p Alignment is the alignment of the memory operation
834  /// \p AddressSpace is address space of the pointer.
835  /// \p UseMaskForCond indicates if the memory access is predicated.
836  /// \p UseMaskForGaps indicates if gaps should be masked.
837  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
838  ArrayRef<unsigned> Indices, unsigned Alignment,
839  unsigned AddressSpace,
840  bool UseMaskForCond = false,
841  bool UseMaskForGaps = false) const;
842 
843  /// Calculate the cost of performing a vector reduction.
844  ///
845  /// This is the cost of reducing the vector value of type \p Ty to a scalar
846  /// value using the operation denoted by \p Opcode. The form of the reduction
847  /// can either be a pairwise reduction or a reduction that splits the vector
848  /// at every reduction level.
849  ///
850  /// Pairwise:
851  /// (v0, v1, v2, v3)
852  /// ((v0+v1), (v2+v3), undef, undef)
853  /// Split:
854  /// (v0, v1, v2, v3)
855  /// ((v0+v2), (v1+v3), undef, undef)
856  int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
857  bool IsPairwiseForm) const;
858  int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm,
859  bool IsUnsigned) const;
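  // Illustration only (not part of the original header): cost of a split
  // (non-pairwise) integer add reduction of a hypothetical V4I32Ty:
  //
  //   int C = TTI.getArithmeticReductionCost(Instruction::Add, V4I32Ty,
  //                                          /*IsPairwiseForm=*/false);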
860 
861  /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
862  /// Three cases are handled: 1. scalar instruction 2. vector instruction
863  /// 3. scalar instruction which is to be vectorized with VF.
864  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
865                            ArrayRef<Value *> Args, FastMathFlags FMF,
866  unsigned VF = 1) const;
867 
868  /// \returns The cost of Intrinsic instructions. Types analysis only.
869  /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
870  /// arguments and the return value will be computed based on types.
871  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
872                            ArrayRef<Type *> Tys, FastMathFlags FMF,
873  unsigned ScalarizationCostPassed = UINT_MAX) const;
874 
875  /// \returns The cost of Call instructions.
876  int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;
877 
878  /// \returns The number of pieces into which the provided type must be
879  /// split during legalization. Zero is returned when the answer is unknown.
880  unsigned getNumberOfParts(Type *Tp) const;
881 
882  /// \returns The cost of the address computation. For most targets this can be
883  /// merged into the instruction indexing mode. Some targets might want to
884  /// distinguish between address computation for memory operations on vector
885  /// types and scalar types. Such targets should override this function.
886  /// The 'SE' parameter holds pointer for the scalar evolution object which
887  /// is used in order to get the Ptr step value in case of constant stride.
888  /// The 'Ptr' parameter holds SCEV of the access pointer.
889  int getAddressComputationCost(Type *Ty, ScalarEvolution *SE = nullptr,
890  const SCEV *Ptr = nullptr) const;
891 
892  /// \returns The cost, if any, of keeping values of the given types alive
893  /// over a callsite.
894  ///
895  /// Some types may require the use of register classes that do not have
896  /// any callee-saved registers, so would require a spill and fill.
897  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const;
898 
899  /// \returns True if the intrinsic is a supported memory intrinsic. Info
900  /// will contain additional information: whether the intrinsic may read from
901  /// or write to memory, its volatility, and the pointer. Info is undefined
902  /// if false is returned.
903  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
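  // Illustration only (not part of the original header): treating a recognized
  // intrinsic like an ordinary unordered load, where `II` is a hypothetical
  // IntrinsicInst*:
  //
  //   MemIntrinsicInfo Info;
  //   if (TTI.getTgtMemIntrinsic(II, Info) && Info.ReadMem && !Info.WriteMem &&
  //       Info.isUnordered())
  //     ; // safe to reason about II as a load from Info.PtrVal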
904 
905  /// \returns The maximum element size, in bytes, for an element
906  /// unordered-atomic memory intrinsic.
907  unsigned getAtomicMemIntrinsicMaxElementSize() const;
908 
909  /// \returns A value which is the result of the given memory intrinsic. New
910  /// instructions may be created to extract the result from the given intrinsic
911  /// memory operation. Returns nullptr if the target cannot create a result
912  /// from the given intrinsic.
913  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
914  Type *ExpectedType) const;
915 
916  /// \returns The type to use in a loop expansion of a memcpy call.
917  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
918  unsigned SrcAlign, unsigned DestAlign) const;
919 
920  /// \param[out] OpsOut The operand types to copy RemainingBytes of memory.
921  /// \param RemainingBytes The number of bytes to copy.
922  ///
923  /// Calculates the operand types to use when copying \p RemainingBytes of
924  /// memory, where source and destination alignments are \p SrcAlign and
925  /// \p DestAlign respectively.
926  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
927  LLVMContext &Context,
928  unsigned RemainingBytes,
929  unsigned SrcAlign,
930  unsigned DestAlign) const;
931 
932  /// \returns True if the two functions have compatible attributes for inlining
933  /// purposes.
934  bool areInlineCompatible(const Function *Caller,
935  const Function *Callee) const;
936 
937  /// \returns True if the caller and callee agree on how \p Args will be passed
938  /// to the callee.
939  /// \param[out] Args The list of compatible arguments. The implementation may
940  /// filter out any incompatible args from this list.
941  bool areFunctionArgsABICompatible(const Function *Caller,
942  const Function *Callee,
943  SmallPtrSetImpl<Argument *> &Args) const;
944 
945  /// The type of load/store indexing.
946  enum MemIndexedMode {
947  MIM_Unindexed, ///< No indexing.
948  MIM_PreInc, ///< Pre-incrementing.
949  MIM_PreDec, ///< Pre-decrementing.
950  MIM_PostInc, ///< Post-incrementing.
951  MIM_PostDec ///< Post-decrementing.
952  };
953 
954  /// \returns True if the specified indexed load for the given type is legal.
955  bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const;
956 
957  /// \returns True if the specified indexed store for the given type is legal.
958  bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const;
959 
960  /// \returns The bitwidth of the largest vector type that should be used to
961  /// load/store in the given address space.
962  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
963 
964  /// \returns True if the load instruction is legal to vectorize.
965  bool isLegalToVectorizeLoad(LoadInst *LI) const;
966 
967  /// \returns True if the store instruction is legal to vectorize.
968  bool isLegalToVectorizeStore(StoreInst *SI) const;
969 
970  /// \returns True if it is legal to vectorize the given load chain.
971  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
972  unsigned Alignment,
973  unsigned AddrSpace) const;
974 
975  /// \returns True if it is legal to vectorize the given store chain.
976  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
977  unsigned Alignment,
978  unsigned AddrSpace) const;
979 
980  /// \returns The new vector factor value if the target doesn't support \p
981  /// SizeInBytes loads or has a better vector factor.
982  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
983  unsigned ChainSizeInBytes,
984  VectorType *VecTy) const;
985 
986  /// \returns The new vector factor value if the target doesn't support \p
987  /// SizeInBytes stores or has a better vector factor.
988  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
989  unsigned ChainSizeInBytes,
990  VectorType *VecTy) const;
991 
992  /// Flags describing the kind of vector reduction.
993  struct ReductionFlags {
994  ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {}
995  bool IsMaxOp; ///< If the op is a min/max kind, true if it's a max operation.
996  bool IsSigned; ///< Whether the operation is a signed int reduction.
997  bool NoNaN; ///< If op is an fp min/max, whether NaNs may be present.
998  };
999 
1000  /// \returns True if the target wants to handle the given reduction idiom in
1001  /// the intrinsics form instead of the shuffle form.
1002  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
1003  ReductionFlags Flags) const;
1004 
1005  /// \returns True if the target wants to expand the given reduction intrinsic
1006  /// into a shuffle sequence.
1007  bool shouldExpandReduction(const IntrinsicInst *II) const;
1008  /// @}
1009 
1010 private:
1011  /// Estimate the latency of specified instruction.
1012  /// Returns 1 as the default value.
1013  int getInstructionLatency(const Instruction *I) const;
1014 
1015  /// Returns the expected throughput cost of the instruction.
1016  /// Returns -1 if the cost is unknown.
1017  int getInstructionThroughput(const Instruction *I) const;
1018 
1019  /// The abstract base class used to type erase specific TTI
1020  /// implementations.
1021  class Concept;
1022 
1023  /// The template model for the base class which wraps a concrete
1024  /// implementation in a type erased interface.
1025  template <typename T> class Model;
1026 
1027  std::unique_ptr<Concept> TTIImpl;
1028 };
1029 
1030 class TargetTransformInfo::Concept {
1031 public:
1032  virtual ~Concept() = 0;
1033  virtual const DataLayout &getDataLayout() const = 0;
1034  virtual int getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) = 0;
1035  virtual int getGEPCost(Type *PointeeType, const Value *Ptr,
1036  ArrayRef<const Value *> Operands) = 0;
1037  virtual int getExtCost(const Instruction *I, const Value *Src) = 0;
1038  virtual int getCallCost(FunctionType *FTy, int NumArgs) = 0;
1039  virtual int getCallCost(const Function *F, int NumArgs) = 0;
1040  virtual int getCallCost(const Function *F,
1041                          ArrayRef<const Value *> Arguments) = 0;
1042  virtual unsigned getInliningThresholdMultiplier() = 0;
1043  virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
1044  ArrayRef<Type *> ParamTys) = 0;
1045  virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
1046  ArrayRef<const Value *> Arguments) = 0;
1047  virtual unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
1048  unsigned &JTSize) = 0;
1049  virtual int
1050  getUserCost(const User *U, ArrayRef<const Value *> Operands) = 0;
1051  virtual bool hasBranchDivergence() = 0;
1052  virtual bool isSourceOfDivergence(const Value *V) = 0;
1053  virtual bool isAlwaysUniform(const Value *V) = 0;
1054  virtual unsigned getFlatAddressSpace() = 0;
1055  virtual bool isLoweredToCall(const Function *F) = 0;
1056  virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
1057  UnrollingPreferences &UP) = 0;
1058  virtual bool isLegalAddImmediate(int64_t Imm) = 0;
1059  virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
1060  virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
1061  int64_t BaseOffset, bool HasBaseReg,
1062  int64_t Scale,
1063  unsigned AddrSpace,
1064  Instruction *I) = 0;
1065  virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
1066                             TargetTransformInfo::LSRCost &C2) = 0;
1067  virtual bool canMacroFuseCmp() = 0;
1068  virtual bool shouldFavorPostInc() const = 0;
1069  virtual bool isLegalMaskedStore(Type *DataType) = 0;
1070  virtual bool isLegalMaskedLoad(Type *DataType) = 0;
1071  virtual bool isLegalMaskedScatter(Type *DataType) = 0;
1072  virtual bool isLegalMaskedGather(Type *DataType) = 0;
1073  virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
1074  virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
1075  virtual bool prefersVectorizedAddressing() = 0;
1076  virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
1077  int64_t BaseOffset, bool HasBaseReg,
1078  int64_t Scale, unsigned AddrSpace) = 0;
1079  virtual bool LSRWithInstrQueries() = 0;
1080  virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0;
1081  virtual bool isProfitableToHoist(Instruction *I) = 0;
1082  virtual bool useAA() = 0;
1083  virtual bool isTypeLegal(Type *Ty) = 0;
1084  virtual unsigned getJumpBufAlignment() = 0;
1085  virtual unsigned getJumpBufSize() = 0;
1086  virtual bool shouldBuildLookupTables() = 0;
1087  virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
1088  virtual bool useColdCCForColdCall(Function &F) = 0;
1089  virtual unsigned
1090  getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;
1091  virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
1092  unsigned VF) = 0;
1093  virtual bool supportsEfficientVectorElementLoadStore() = 0;
1094  virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
1095  virtual const MemCmpExpansionOptions *enableMemCmpExpansion(
1096  bool IsZeroCmp) const = 0;
1097  virtual bool enableInterleavedAccessVectorization() = 0;
1098  virtual bool enableMaskedInterleavedAccessVectorization() = 0;
1099  virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
1100  virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
1101  unsigned BitWidth,
1102  unsigned AddressSpace,
1103  unsigned Alignment,
1104  bool *Fast) = 0;
1105  virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
1106  virtual bool haveFastSqrt(Type *Ty) = 0;
1107  virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
1108  virtual int getFPOpCost(Type *Ty) = 0;
1109  virtual int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
1110  Type *Ty) = 0;
1111  virtual int getIntImmCost(const APInt &Imm, Type *Ty) = 0;
1112  virtual int getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
1113  Type *Ty) = 0;
1114  virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
1115  Type *Ty) = 0;
1116  virtual unsigned getNumberOfRegisters(bool Vector) = 0;
1117  virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
1118  virtual unsigned getMinVectorRegisterBitWidth() = 0;
1119  virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
1120  virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
1121  virtual bool shouldConsiderAddressTypePromotion(
1122  const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
1123  virtual unsigned getCacheLineSize() = 0;
1124  virtual llvm::Optional<unsigned> getCacheSize(CacheLevel Level) = 0;
1125  virtual llvm::Optional<unsigned> getCacheAssociativity(CacheLevel Level) = 0;
1126  virtual unsigned getPrefetchDistance() = 0;
1127  virtual unsigned getMinPrefetchStride() = 0;
1128  virtual unsigned getMaxPrefetchIterationsAhead() = 0;
1129  virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
1130  virtual unsigned
1131  getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
1132  OperandValueKind Opd2Info,
1133  OperandValueProperties Opd1PropInfo,
1134  OperandValueProperties Opd2PropInfo,
1135  ArrayRef<const Value *> Args) = 0;
1136  virtual int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
1137  Type *SubTp) = 0;
1138  virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1139  const Instruction *I) = 0;
1140  virtual int getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1141  VectorType *VecTy, unsigned Index) = 0;
1142  virtual int getCFInstrCost(unsigned Opcode) = 0;
1143  virtual int getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1144  Type *CondTy, const Instruction *I) = 0;
1145  virtual int getVectorInstrCost(unsigned Opcode, Type *Val,
1146  unsigned Index) = 0;
1147  virtual int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1148  unsigned AddressSpace, const Instruction *I) = 0;
1149  virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1150  unsigned Alignment,
1151  unsigned AddressSpace) = 0;
1152  virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1153  Value *Ptr, bool VariableMask,
1154  unsigned Alignment) = 0;
1155  virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
1156  unsigned Factor,
1157  ArrayRef<unsigned> Indices,
1158  unsigned Alignment,
1159  unsigned AddressSpace,
1160  bool UseMaskForCond = false,
1161  bool UseMaskForGaps = false) = 0;
1162  virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
1163  bool IsPairwiseForm) = 0;
1164  virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
1165  bool IsPairwiseForm, bool IsUnsigned) = 0;
1166  virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1167        ArrayRef<Type *> Tys, FastMathFlags FMF,
1168  unsigned ScalarizationCostPassed) = 0;
1169  virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1170  ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) = 0;
1171  virtual int getCallInstrCost(Function *F, Type *RetTy,
1172  ArrayRef<Type *> Tys) = 0;
1173  virtual unsigned getNumberOfParts(Type *Tp) = 0;
1174  virtual int getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1175  const SCEV *Ptr) = 0;
1176  virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) = 0;
1177  virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
1178  MemIntrinsicInfo &Info) = 0;
1179  virtual unsigned getAtomicMemIntrinsicMaxElementSize() const = 0;
1180  virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1181  Type *ExpectedType) = 0;
1182  virtual Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
1183  unsigned SrcAlign,
1184  unsigned DestAlign) const = 0;
1185  virtual void getMemcpyLoopResidualLoweringType(
1186  SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
1187  unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const = 0;
1188  virtual bool areInlineCompatible(const Function *Caller,
1189  const Function *Callee) const = 0;
1190  virtual bool
1191  areFunctionArgsABICompatible(const Function *Caller, const Function *Callee,
1192  SmallPtrSetImpl<Argument *> &Args) const = 0;
1193  virtual bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const = 0;
1194  virtual bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const = 0;
1195  virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const = 0;
1196  virtual bool isLegalToVectorizeLoad(LoadInst *LI) const = 0;
1197  virtual bool isLegalToVectorizeStore(StoreInst *SI) const = 0;
1198  virtual bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1199  unsigned Alignment,
1200  unsigned AddrSpace) const = 0;
1201  virtual bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1202  unsigned Alignment,
1203  unsigned AddrSpace) const = 0;
1204  virtual unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
1205  unsigned ChainSizeInBytes,
1206  VectorType *VecTy) const = 0;
1207  virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
1208  unsigned ChainSizeInBytes,
1209  VectorType *VecTy) const = 0;
1210  virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
1211  ReductionFlags) const = 0;
1212  virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
1213  virtual int getInstructionLatency(const Instruction *I) = 0;
1214 };
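The pure-virtual hooks above are reached by IR-level passes through the public TargetTransformInfo object rather than through this Concept class directly. A minimal sketch of such a cost query, assuming a TargetTransformInfo reference obtained from the analysis framework; the helper name and the 4x break-even heuristic are illustrative only:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;

  // Compare one scalar i32 add against one <4 x i32> add.
  static bool vectorAddLooksCheap(const TargetTransformInfo &TTI,
                                  LLVMContext &Ctx) {
    Type *I32 = Type::getInt32Ty(Ctx);
    Type *V4I32 = VectorType::get(I32, 4);
    int ScalarCost = TTI.getArithmeticInstrCost(Instruction::Add, I32);
    int VectorCost = TTI.getArithmeticInstrCost(Instruction::Add, V4I32);
    return VectorCost <= 4 * ScalarCost;
  }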
1215 
1216 template <typename T>
1217 class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
1218  T Impl;
1219 
1220 public:
1221  Model(T Impl) : Impl(std::move(Impl)) {}
1222  ~Model() override {}
1223 
1224  const DataLayout &getDataLayout() const override {
1225  return Impl.getDataLayout();
1226  }
1227 
1228  int getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) override {
1229  return Impl.getOperationCost(Opcode, Ty, OpTy);
1230  }
1231  int getGEPCost(Type *PointeeType, const Value *Ptr,
1232  ArrayRef<const Value *> Operands) override {
1233  return Impl.getGEPCost(PointeeType, Ptr, Operands);
1234  }
1235  int getExtCost(const Instruction *I, const Value *Src) override {
1236  return Impl.getExtCost(I, Src);
1237  }
1238  int getCallCost(FunctionType *FTy, int NumArgs) override {
1239  return Impl.getCallCost(FTy, NumArgs);
1240  }
1241  int getCallCost(const Function *F, int NumArgs) override {
1242  return Impl.getCallCost(F, NumArgs);
1243  }
1244  int getCallCost(const Function *F,
1245                  ArrayRef<const Value *> Arguments) override {
1246  return Impl.getCallCost(F, Arguments);
1247  }
1248  unsigned getInliningThresholdMultiplier() override {
1249  return Impl.getInliningThresholdMultiplier();
1250  }
1251  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
1252  ArrayRef<Type *> ParamTys) override {
1253  return Impl.getIntrinsicCost(IID, RetTy, ParamTys);
1254  }
1255  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
1256  ArrayRef<const Value *> Arguments) override {
1257  return Impl.getIntrinsicCost(IID, RetTy, Arguments);
1258  }
1259  int getUserCost(const User *U, ArrayRef<const Value *> Operands) override {
1260  return Impl.getUserCost(U, Operands);
1261  }
1262  bool hasBranchDivergence() override { return Impl.hasBranchDivergence(); }
1263  bool isSourceOfDivergence(const Value *V) override {
1264  return Impl.isSourceOfDivergence(V);
1265  }
1266 
1267  bool isAlwaysUniform(const Value *V) override {
1268  return Impl.isAlwaysUniform(V);
1269  }
1270 
1271  unsigned getFlatAddressSpace() override {
1272  return Impl.getFlatAddressSpace();
1273  }
1274 
1275  bool isLoweredToCall(const Function *F) override {
1276  return Impl.isLoweredToCall(F);
1277  }
1278  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1279  UnrollingPreferences &UP) override {
1280  return Impl.getUnrollingPreferences(L, SE, UP);
1281  }
1282  bool isLegalAddImmediate(int64_t Imm) override {
1283  return Impl.isLegalAddImmediate(Imm);
1284  }
1285  bool isLegalICmpImmediate(int64_t Imm) override {
1286  return Impl.isLegalICmpImmediate(Imm);
1287  }
1288  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
1289  bool HasBaseReg, int64_t Scale,
1290  unsigned AddrSpace,
1291  Instruction *I) override {
1292  return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
1293  Scale, AddrSpace, I);
1294  }
1295  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
1296  TargetTransformInfo::LSRCost &C2) override {
1297  return Impl.isLSRCostLess(C1, C2);
1298  }
1299  bool canMacroFuseCmp() override {
1300  return Impl.canMacroFuseCmp();
1301  }
1302  bool shouldFavorPostInc() const override {
1303  return Impl.shouldFavorPostInc();
1304  }
1305  bool isLegalMaskedStore(Type *DataType) override {
1306  return Impl.isLegalMaskedStore(DataType);
1307  }
1308  bool isLegalMaskedLoad(Type *DataType) override {
1309  return Impl.isLegalMaskedLoad(DataType);
1310  }
1311  bool isLegalMaskedScatter(Type *DataType) override {
1312  return Impl.isLegalMaskedScatter(DataType);
1313  }
1314  bool isLegalMaskedGather(Type *DataType) override {
1315  return Impl.isLegalMaskedGather(DataType);
1316  }
1317  bool hasDivRemOp(Type *DataType, bool IsSigned) override {
1318  return Impl.hasDivRemOp(DataType, IsSigned);
1319  }
1320  bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) override {
1321  return Impl.hasVolatileVariant(I, AddrSpace);
1322  }
1323  bool prefersVectorizedAddressing() override {
1324  return Impl.prefersVectorizedAddressing();
1325  }
1326  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
1327  bool HasBaseReg, int64_t Scale,
1328  unsigned AddrSpace) override {
1329  return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
1330  Scale, AddrSpace);
1331  }
1332  bool LSRWithInstrQueries() override {
1333  return Impl.LSRWithInstrQueries();
1334  }
1335  bool isTruncateFree(Type *Ty1, Type *Ty2) override {
1336  return Impl.isTruncateFree(Ty1, Ty2);
1337  }
1338  bool isProfitableToHoist(Instruction *I) override {
1339  return Impl.isProfitableToHoist(I);
1340  }
1341  bool useAA() override { return Impl.useAA(); }
1342  bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
1343  unsigned getJumpBufAlignment() override { return Impl.getJumpBufAlignment(); }
1344  unsigned getJumpBufSize() override { return Impl.getJumpBufSize(); }
1345  bool shouldBuildLookupTables() override {
1346  return Impl.shouldBuildLookupTables();
1347  }
1348  bool shouldBuildLookupTablesForConstant(Constant *C) override {
1349  return Impl.shouldBuildLookupTablesForConstant(C);
1350  }
1351  bool useColdCCForColdCall(Function &F) override {
1352  return Impl.useColdCCForColdCall(F);
1353  }
1354 
1355  unsigned getScalarizationOverhead(Type *Ty, bool Insert,
1356  bool Extract) override {
1357  return Impl.getScalarizationOverhead(Ty, Insert, Extract);
1358  }
1359  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
1360  unsigned VF) override {
1361  return Impl.getOperandsScalarizationOverhead(Args, VF);
1362  }
1363 
1364  bool supportsEfficientVectorElementLoadStore() override {
1365  return Impl.supportsEfficientVectorElementLoadStore();
1366  }
1367 
1368  bool enableAggressiveInterleaving(bool LoopHasReductions) override {
1369  return Impl.enableAggressiveInterleaving(LoopHasReductions);
1370  }
1371  const MemCmpExpansionOptions *enableMemCmpExpansion(
1372  bool IsZeroCmp) const override {
1373  return Impl.enableMemCmpExpansion(IsZeroCmp);
1374  }
1375  bool enableInterleavedAccessVectorization() override {
1376  return Impl.enableInterleavedAccessVectorization();
1377  }
1378  bool enableMaskedInterleavedAccessVectorization() override {
1379  return Impl.enableMaskedInterleavedAccessVectorization();
1380  }
1381  bool isFPVectorizationPotentiallyUnsafe() override {
1382  return Impl.isFPVectorizationPotentiallyUnsafe();
1383  }
1384  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
1385  unsigned BitWidth, unsigned AddressSpace,
1386  unsigned Alignment, bool *Fast) override {
1387  return Impl.allowsMisalignedMemoryAccesses(Context, BitWidth, AddressSpace,
1388  Alignment, Fast);
1389  }
1390  PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
1391  return Impl.getPopcntSupport(IntTyWidthInBit);
1392  }
1393  bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
1394 
1395  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) override {
1396  return Impl.isFCmpOrdCheaperThanFCmpZero(Ty);
1397  }
1398 
1399  int getFPOpCost(Type *Ty) override { return Impl.getFPOpCost(Ty); }
1400 
1401  int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
1402  Type *Ty) override {
1403  return Impl.getIntImmCodeSizeCost(Opc, Idx, Imm, Ty);
1404  }
1405  int getIntImmCost(const APInt &Imm, Type *Ty) override {
1406  return Impl.getIntImmCost(Imm, Ty);
1407  }
1408  int getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
1409  Type *Ty) override {
1410  return Impl.getIntImmCost(Opc, Idx, Imm, Ty);
1411  }
1412  int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
1413  Type *Ty) override {
1414  return Impl.getIntImmCost(IID, Idx, Imm, Ty);
1415  }
1416  unsigned getNumberOfRegisters(bool Vector) override {
1417  return Impl.getNumberOfRegisters(Vector);
1418  }
1419  unsigned getRegisterBitWidth(bool Vector) const override {
1420  return Impl.getRegisterBitWidth(Vector);
1421  }
1422  unsigned getMinVectorRegisterBitWidth() override {
1423  return Impl.getMinVectorRegisterBitWidth();
1424  }
1425  bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
1426  return Impl.shouldMaximizeVectorBandwidth(OptSize);
1427  }
1428  unsigned getMinimumVF(unsigned ElemWidth) const override {
1429  return Impl.getMinimumVF(ElemWidth);
1430  }
1431  bool shouldConsiderAddressTypePromotion(
1432  const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
1433  return Impl.shouldConsiderAddressTypePromotion(
1434  I, AllowPromotionWithoutCommonHeader);
1435  }
1436  unsigned getCacheLineSize() override {
1437  return Impl.getCacheLineSize();
1438  }
1439  llvm::Optional<unsigned> getCacheSize(CacheLevel Level) override {
1440  return Impl.getCacheSize(Level);
1441  }
1442  llvm::Optional<unsigned> getCacheAssociativity(CacheLevel Level) override {
1443  return Impl.getCacheAssociativity(Level);
1444  }
1445  unsigned getPrefetchDistance() override { return Impl.getPrefetchDistance(); }
1446  unsigned getMinPrefetchStride() override {
1447  return Impl.getMinPrefetchStride();
1448  }
1449  unsigned getMaxPrefetchIterationsAhead() override {
1450  return Impl.getMaxPrefetchIterationsAhead();
1451  }
1452  unsigned getMaxInterleaveFactor(unsigned VF) override {
1453  return Impl.getMaxInterleaveFactor(VF);
1454  }
1455  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
1456  unsigned &JTSize) override {
1457  return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize);
1458  }
1459  unsigned
1460  getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
1461  OperandValueKind Opd2Info,
1462  OperandValueProperties Opd1PropInfo,
1463  OperandValueProperties Opd2PropInfo,
1464  ArrayRef<const Value *> Args) override {
1465  return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
1466  Opd1PropInfo, Opd2PropInfo, Args);
1467  }
1468  int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
1469  Type *SubTp) override {
1470  return Impl.getShuffleCost(Kind, Tp, Index, SubTp);
1471  }
1472  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1473  const Instruction *I) override {
1474  return Impl.getCastInstrCost(Opcode, Dst, Src, I);
1475  }
1476  int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
1477  unsigned Index) override {
1478  return Impl.getExtractWithExtendCost(Opcode, Dst, VecTy, Index);
1479  }
1480  int getCFInstrCost(unsigned Opcode) override {
1481  return Impl.getCFInstrCost(Opcode);
1482  }
1483  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1484  const Instruction *I) override {
1485  return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1486  }
1487  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) override {
1488  return Impl.getVectorInstrCost(Opcode, Val, Index);
1489  }
1490  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1491  unsigned AddressSpace, const Instruction *I) override {
1492  return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
1493  }
1494  int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1495  unsigned AddressSpace) override {
1496  return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
1497  }
1498  int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1499  Value *Ptr, bool VariableMask,
1500  unsigned Alignment) override {
1501  return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1502  Alignment);
1503  }
1504  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
1505  ArrayRef<unsigned> Indices, unsigned Alignment,
1506  unsigned AddressSpace, bool UseMaskForCond,
1507  bool UseMaskForGaps) override {
1508  return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1509  Alignment, AddressSpace,
1510  UseMaskForCond, UseMaskForGaps);
1511  }
1512  int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
1513  bool IsPairwiseForm) override {
1514  return Impl.getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm);
1515  }
1516  int getMinMaxReductionCost(Type *Ty, Type *CondTy,
1517  bool IsPairwiseForm, bool IsUnsigned) override {
1518  return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned);
1519  }
1520  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
1521  FastMathFlags FMF, unsigned ScalarizationCostPassed) override {
1522  return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
1523  ScalarizationCostPassed);
1524  }
1525  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1526  ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) override {
1527  return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
1528  }
1529  int getCallInstrCost(Function *F, Type *RetTy,
1530  ArrayRef<Type *> Tys) override {
1531  return Impl.getCallInstrCost(F, RetTy, Tys);
1532  }
1533  unsigned getNumberOfParts(Type *Tp) override {
1534  return Impl.getNumberOfParts(Tp);
1535  }
1536  int getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1537  const SCEV *Ptr) override {
1538  return Impl.getAddressComputationCost(Ty, SE, Ptr);
1539  }
1540  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) override {
1541  return Impl.getCostOfKeepingLiveOverCall(Tys);
1542  }
1543  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
1544  MemIntrinsicInfo &Info) override {
1545  return Impl.getTgtMemIntrinsic(Inst, Info);
1546  }
1547  unsigned getAtomicMemIntrinsicMaxElementSize() const override {
1548  return Impl.getAtomicMemIntrinsicMaxElementSize();
1549  }
1550  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1551  Type *ExpectedType) override {
1552  return Impl.getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
1553  }
1554  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
1555  unsigned SrcAlign,
1556  unsigned DestAlign) const override {
1557  return Impl.getMemcpyLoopLoweringType(Context, Length, SrcAlign, DestAlign);
1558  }
1559  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
1560  LLVMContext &Context,
1561  unsigned RemainingBytes,
1562  unsigned SrcAlign,
1563  unsigned DestAlign) const override {
1564  Impl.getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
1565  SrcAlign, DestAlign);
1566  }
1567  bool areInlineCompatible(const Function *Caller,
1568  const Function *Callee) const override {
1569  return Impl.areInlineCompatible(Caller, Callee);
1570  }
1571  bool areFunctionArgsABICompatible(
1572      const Function *Caller, const Function *Callee,
1573  SmallPtrSetImpl<Argument *> &Args) const override {
1574  return Impl.areFunctionArgsABICompatible(Caller, Callee, Args);
1575  }
1576  bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const override {
1577  return Impl.isIndexedLoadLegal(Mode, Ty, getDataLayout());
1578  }
1579  bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const override {
1580  return Impl.isIndexedStoreLegal(Mode, Ty, getDataLayout());
1581  }
1582  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override {
1583  return Impl.getLoadStoreVecRegBitWidth(AddrSpace);
1584  }
1585  bool isLegalToVectorizeLoad(LoadInst *LI) const override {
1586  return Impl.isLegalToVectorizeLoad(LI);
1587  }
1588  bool isLegalToVectorizeStore(StoreInst *SI) const override {
1589  return Impl.isLegalToVectorizeStore(SI);
1590  }
1591  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1592  unsigned Alignment,
1593  unsigned AddrSpace) const override {
1594  return Impl.isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment,
1595  AddrSpace);
1596  }
1597  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1598  unsigned Alignment,
1599  unsigned AddrSpace) const override {
1600  return Impl.isLegalToVectorizeStoreChain(ChainSizeInBytes, Alignment,
1601  AddrSpace);
1602  }
1603  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
1604  unsigned ChainSizeInBytes,
1605  VectorType *VecTy) const override {
1606  return Impl.getLoadVectorFactor(VF, LoadSize, ChainSizeInBytes, VecTy);
1607  }
1608  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
1609  unsigned ChainSizeInBytes,
1610  VectorType *VecTy) const override {
1611  return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
1612  }
1613  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
1614  ReductionFlags Flags) const override {
1615  return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
1616  }
1617  bool shouldExpandReduction(const IntrinsicInst *II) const override {
1618  return Impl.shouldExpandReduction(II);
1619  }
1620  int getInstructionLatency(const Instruction *I) override {
1621  return Impl.getInstructionLatency(I);
1622  }
1623 };
1624 
1625 template <typename T>
1626 TargetTransformInfo::TargetTransformInfo(T Impl)
1627     : TTIImpl(new Model<T>(Impl)) {}
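A backend normally reaches this type-erasing constructor from its TargetMachine::getTargetTransformInfo override, handing in a concrete implementation of the Concept API, which the constructor stores behind a Model<T>. A hedged sketch, where MyTargetMachine and MyTargetTTIImpl are hypothetical placeholders for a target's real classes:

  // Returns a TargetTransformInfo that forwards every query to
  // MyTargetTTIImpl through Model<MyTargetTTIImpl>.
  TargetTransformInfo
  MyTargetMachine::getTargetTransformInfo(const Function &F) {
    return TargetTransformInfo(MyTargetTTIImpl(this, F));
  }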
1628 
1629 /// Analysis pass providing the \c TargetTransformInfo.
1630 ///
1631 /// The core idea of the TargetIRAnalysis is to expose an interface through
1632 /// which LLVM targets can analyze and provide information about the middle
1633 /// end's target-independent IR. This supports use cases such as target-aware
1634 /// cost modeling of IR constructs.
1635 ///
1636 /// This is a function analysis because much of the cost modeling for targets
1637 /// is done in a subtarget specific way and LLVM supports compiling different
1638 /// functions targeting different subtargets in order to support runtime
1639 /// dispatch according to the observed subtarget.
1640 class TargetIRAnalysis : public AnalysisInfoMixin<TargetIRAnalysis> {
1641 public:
1642  typedef TargetTransformInfo Result;
1643 
1644  /// Default construct a target IR analysis.
1645  ///
1646  /// This will use the module's datalayout to construct a baseline
1647  /// conservative TTI result.
1648  TargetIRAnalysis();
1649 
1650  /// Construct an IR analysis pass around a target-provided callback.
1651  ///
1652  /// The callback will be called with a particular function for which the TTI
1653  /// is needed and must return a TTI object for that function.
1654  TargetIRAnalysis(std::function<Result(const Function &)> TTICallback);
1655 
1656  // Value semantics. We spell out the constructors for MSVC.
1657  TargetIRAnalysis(const TargetIRAnalysis &Arg)
1658      : TTICallback(Arg.TTICallback) {}
1659  TargetIRAnalysis(TargetIRAnalysis &&Arg)
1660      : TTICallback(std::move(Arg.TTICallback)) {}
1661  TargetIRAnalysis &operator=(const TargetIRAnalysis &RHS) {
1662  TTICallback = RHS.TTICallback;
1663  return *this;
1664  }
1665  TargetIRAnalysis &operator=(TargetIRAnalysis &&RHS) {
1666  TTICallback = std::move(RHS.TTICallback);
1667  return *this;
1668  }
1669 
1670  Result run(const Function &F, FunctionAnalysisManager &);
1671 
1672 private:
1673  friend AnalysisInfoMixin<TargetIRAnalysis>;
1674  static AnalysisKey Key;
1675 
1676  /// The callback used to produce a result.
1677  ///
1678  /// We use a completely opaque callback so that targets can provide whatever
1679  /// mechanism they desire for constructing the TTI for a given function.
1680  ///
1681  /// FIXME: Should we really use std::function? It's relatively inefficient.
1682  /// It might be possible to arrange for even stateful callbacks to outlive
1683  /// the analysis and thus use a function_ref which would be lighter weight.
1684  /// This may also be less error prone as the callback is likely to reference
1685  /// the external TargetMachine, and that reference needs to never dangle.
1686  std::function<Result(const Function &)> TTICallback;
1687 
1688  /// Helper function used as the callback in the default constructor.
1689  static Result getDefaultTTI(const Function &F);
1690 };
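A minimal sketch of wiring this analysis into a new-pass-manager FunctionAnalysisManager, assuming TM points at an already-configured TargetMachine; when no callback is supplied, the default-constructed analysis produces the conservative DataLayout-only result described above:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Target/TargetMachine.h"
  using namespace llvm;

  static void registerTTIAnalysis(FunctionAnalysisManager &FAM,
                                  TargetMachine *TM) {
    FAM.registerPass([TM] {
      return TargetIRAnalysis(
          [TM](const Function &F) { return TM->getTargetTransformInfo(F); });
    });
  }
  // In a pass:  TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);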
1691 
1692 /// Wrapper pass for TargetTransformInfo.
1693 ///
1694 /// This pass can be constructed from a TTI object which it stores internally
1695 /// and is queried by passes.
1696 class TargetTransformInfoWrapperPass : public ImmutablePass {
1697  TargetIRAnalysis TIRA;
1698  Optional<TargetTransformInfo> TTI;
1699 
1700  virtual void anchor();
1701 
1702 public:
1703  static char ID;
1704 
1705  /// We must provide a default constructor for the pass but it should
1706  /// never be used.
1707  ///
1708  /// Use the constructor below or call one of the creation routines.
1709  TargetTransformInfoWrapperPass();
1710 
1711  explicit TargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);
1712 
1713  TargetTransformInfo &getTTI(const Function &F);
1714 };
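A minimal sketch of how a legacy-pass-manager pass reaches the per-function TTI through this wrapper; the consuming pass itself is a hypothetical placeholder:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Function.h"
  #include "llvm/Pass.h"
  using namespace llvm;

  namespace {
  struct TTIConsumer : FunctionPass {
    static char ID;
    TTIConsumer() : FunctionPass(ID) {}

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<TargetTransformInfoWrapperPass>();
      AU.setPreservesAll();
    }

    bool runOnFunction(Function &F) override {
      TargetTransformInfo &TTI =
          getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
      (void)TTI; // query cost hooks here, e.g. TTI.getUserCost(...)
      return false;
    }
  };
  } // end anonymous namespace
  char TTIConsumer::ID = 0;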
1715 
1716 /// Create an analysis pass wrapper around a TTI object.
1717 ///
1718 /// This analysis pass just holds the TTI instance and makes it available to
1719 /// clients.
1720 ImmutablePass *createTargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);
1721 
1722 } // End llvm namespace
1723 
1724 #endif