#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc(
        "Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::Hidden);
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (none_of(L->getSubLoops(), [PHI](const Loop *SubLoop) {
            return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth + 1))
      return true;
  }
  return false;
}
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (any_of(L->getSubLoops(), [BB](const Loop *SubLoop) {
          return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.
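    // Note on MaxAlloca above (an interpretation, not spelled out in the
    // source): (256 - 16) * 4 = 960 bytes, i.e. 256 dwords of scratch per
    // lane minus 16 reserved, at 4 bytes per dword.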
    for (const Instruction &I : *BB) {
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          // Boost the threshold for each conditional branch whose condition
          // depends on a PHI defined inside this loop.
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      // ... (for private accesses, the pointer is traced back to its
      // underlying alloca, which must be small and statically sized)
      if (AllocaSize > MaxAlloca)
        continue;

      // ... (for local accesses, require a GEP operand defined in this
      // loop itself rather than in a contained subloop)
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;
        if (any_of(L->getSubLoops(), [Inst](const Loop *SubLoop) {
              return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }

      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}
unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}
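// Worked example (illustrative, not from the original file): a chain of four
// 64-bit stores gives VecRegBitWidth = 4 * 64 = 256 > 128, so the factor is
// capped at 128 / 64 = 2 elements; four 32-bit stores give exactly 128 bits
// and keep VF = 4.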
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();
bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            unsigned Alignment,
                                            unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
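// Minimal sketch of a caller (hypothetical; the real client is the
// LoadStoreVectorizer pass): before merging a 16-byte chain of private
// loads it would ask something like
//   if (TTI.isLegalToVectorizeLoadChain(16, /*Alignment=*/4,
//                                       AMDGPUAS::PRIVATE_ADDRESS))
//     ; // safe to emit the wider load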
    if (!Ordering || !Volatile)
      return false; // Invalid.
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // We don't have any legal vector operations, only the legal types, so
  // account for vectors that get split during legalization.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;
  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;
    // i32
    return getFullRateInstrCost() * LT.first * NElts;

  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64)
      // These are typically split into two 32-bit VALU operations.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    return LT.first * NElts * getFullRateInstrCost();

  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }
    // i32
    return QuarterRateCost * NElts * LT.first;
  }

  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();
    return LT.first * NElts * getFullRateInstrCost();
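  // Worked example (illustrative): assuming the rate helpers used elsewhere
  // in this file return 1 for full rate and 3 for quarter rate, an i64 MUL
  // costs (4 * 3 + 2 * 2 * 1) = 16 per legalized vector element, i.e. four
  // quarter-rate multiplies plus four full-rate adds/carries.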
  case ISD::FDIV:
  case ISD::FREM:
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround on subtargets that need it.
      if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }
    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // Reciprocal: an fdiv with a 1.0 numerator lowers to rcp.
      if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
          (SLT == MVT::f16 && ST->has16BitInsts()))
        return LT.first * getQuarterRateInstrCost() * NElts;
    }
    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16, f32 rcp, f32 fmul, v_cvt_f16_f32, f16 div_fixup.
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }
    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!ST->hasFP32Denormals()) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
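    // Worked example (illustrative): with full rate = 1 and quarter rate = 3
    // as assumed above, an f32 fdiv costs 7 * 1 + 1 * 3 = 10, plus 2 more
    // for the FP mode switches when f32 denormals are disabled.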
    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}
unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}
int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                           bool IsPairwise) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}
int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise,
                                       bool IsUnsigned) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost();
}
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize =
        DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // 32-bit extracts are just subregister reads, and inserts need no cross
    // register class copy, so both are free; dynamic indexing is not.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
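  // Example for the code above (illustrative): extracting lane 0 of a
  // <4 x half> on a subtarget with 16-bit instructions costs 0, while an
  // extract with a dynamic (unknown, ~0u) index costs 2.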
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);
  // Atomics are divergent: each thread after the first sees the value
  // written by the previous thread, so the result differs per lane.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
      return true;
    }
  }
  return false;
}
  if (ST->hasVOP3PInsts()) {
    VectorType *VT = cast<VectorType>(Tp);
    if (VT->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access either half of a
      // register, so any swizzle of a 2 x 16-bit vector is free.
      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }
  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
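  // In words: after masking off features that don't affect inlining
  // compatibility, the callee may be inlined only if its feature set is a
  // subset of the caller's, i.e. (caller & callee) == callee.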
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}
unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}
bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}
int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                    unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize =
        DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32)
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);

    // Extracts and inserts on 32-bit elements are free; dynamic indexing
    // is not.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}
void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}