34 #define DEBUG_TYPE "amdgpu-perf-hint" 38 cl::desc(
"Function mem bound threshold in %"));
42 cl::desc(
"Kernel limit wave threshold in %"));
46 cl::desc(
"Indirect access memory instruction weight"));
50 cl::desc(
"Large stride memory access weight"));
54 cl::desc(
"Large stride memory access threshold"));
56 STATISTIC(NumMemBound,
"Number of functions marked as memory bound");
57 STATISTIC(NumLimitWave,
"Number of functions marked as needing limit wave");
63 "Analysis if a function is memory bound",
true,
true)
67 struct AMDGPUPerfHint {
73 : FIM(FIM_), DL(
nullptr), TLI(TLI_) {}
78 struct MemAccessInfo {
82 MemAccessInfo() : V(
nullptr), Base(
nullptr),
Offset(0) {}
83 bool isLargeStride(MemAccessInfo &Reference)
const;
84 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 87 OS <<
"Value: " << *V <<
'\n' 88 <<
"Base: " << *Base <<
" Offset: " << Offset <<
'\n';
94 MemAccessInfo makeMemAccessInfo(
Instruction *)
const;
96 MemAccessInfo LastAccess;
108 bool isIndirectAccess(
const Instruction *Inst)
const;
119 bool isGlobalAddr(
const Value *V)
const;
120 bool isLocalAddr(
const Value *V)
const;
121 bool isConstantAddr(
const Value *V)
const;
125 if (
auto LI = dyn_cast<LoadInst>(Inst)) {
126 return LI->getPointerOperand();
128 if (
auto SI = dyn_cast<StoreInst>(Inst)) {
129 return SI->getPointerOperand();
131 if (
auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
132 return AI->getPointerOperand();
134 if (
auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
135 return AI->getPointerOperand();
137 if (
auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
138 return MI->getRawDest();
144 bool AMDGPUPerfHint::isIndirectAccess(
const Instruction *Inst)
const {
148 if (
const Value *MO = getMemoryInstrPtr(Inst)) {
149 if (isGlobalAddr(MO))
153 while (!WorkSet.
empty()) {
156 if (!Visited.
insert(V).second)
160 if (
auto LD = dyn_cast<LoadInst>(V)) {
161 auto M =
LD->getPointerOperand();
162 if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
169 if (
auto GEP = dyn_cast<GetElementPtrInst>(V)) {
170 auto P =
GEP->getPointerOperand();
172 for (
unsigned I = 1,
E =
GEP->getNumIndices() + 1;
I !=
E; ++
I)
177 if (
auto U = dyn_cast<UnaryInstruction>(V)) {
178 WorkSet.
insert(U->getOperand(0));
182 if (
auto BO = dyn_cast<BinaryOperator>(V)) {
183 WorkSet.
insert(BO->getOperand(0));
184 WorkSet.
insert(BO->getOperand(1));
188 if (
auto S = dyn_cast<SelectInst>(V)) {
189 WorkSet.
insert(S->getFalseValue());
190 WorkSet.
insert(S->getTrueValue());
194 if (
auto E = dyn_cast<ExtractElementInst>(V)) {
195 WorkSet.
insert(
E->getVectorOperand());
206 void AMDGPUPerfHint::visit(
const Function &
F) {
216 LastAccess = MemAccessInfo();
218 if (getMemoryInstrPtr(&
I)) {
219 if (isIndirectAccess(&
I))
221 if (isLargeStride(&
I))
227 CallSite CS(const_cast<Instruction *>(&
I));
238 auto Loc = FIM.find(Callee);
240 assert(Loc != FIM.end() &&
"No func info");
245 }
else if (
auto *
GEP = dyn_cast<GetElementPtrInst>(&
I)) {
248 AM.
BaseGV = dyn_cast_or_null<GlobalValue>(
const_cast<Value *
>(Ptr));
250 if (TLI->isLegalAddressingMode(*DL, AM,
GEP->getResultElementType(),
251 GEP->getPointerAddressSpace()))
263 if (FIM.find(&F) != FIM.end())
270 auto Loc = FIM.find(&F);
272 assert(Loc != FIM.end() &&
"No func info");
275 <<
" IAMInst: " << Loc->second.IAMInstCount <<
'\n' 276 <<
" LSMInst: " << Loc->second.LSMInstCount <<
'\n' 277 <<
" TotalInst: " << Loc->second.InstCount <<
'\n');
279 auto &FI = Loc->second;
281 if (isMemBound(FI)) {
302 bool AMDGPUPerfHint::isGlobalAddr(
const Value *V)
const {
303 if (
auto PT = dyn_cast<PointerType>(V->
getType())) {
304 unsigned As = PT->getAddressSpace();
311 bool AMDGPUPerfHint::isLocalAddr(
const Value *V)
const {
312 if (
auto PT = dyn_cast<PointerType>(V->
getType()))
317 bool AMDGPUPerfHint::isLargeStride(
const Instruction *Inst) {
320 MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
321 bool IsLargeStride = MAI.isLargeStride(LastAccess);
323 LastAccess = std::move(MAI);
325 return IsLargeStride;
328 AMDGPUPerfHint::MemAccessInfo
329 AMDGPUPerfHint::makeMemAccessInfo(
Instruction *Inst)
const {
331 const Value *MO = getMemoryInstrPtr(Inst);
343 bool AMDGPUPerfHint::isConstantAddr(
const Value *V)
const {
344 if (
auto PT = dyn_cast<PointerType>(V->
getType())) {
345 unsigned As = PT->getAddressSpace();
352 bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
353 MemAccessInfo &Reference)
const {
355 if (!
Base || !Reference.Base ||
Base != Reference.Base)
358 uint64_t Diff =
Offset > Reference.Offset ?
Offset - Reference.Offset
359 : Reference.Offset -
Offset;
362 <<
print() <<
"<=>\n" 363 << Reference.print() <<
"Result:" << Result <<
'\n');
369 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
376 AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
377 Analyzer.runOnFunction(F);
382 auto FI = FIM.find(F);
386 return AMDGPUPerfHint::isMemBound(FI->second);
390 auto FI = FIM.find(F);
394 return AMDGPUPerfHint::needLimitWave(FI->second);
A parsed version of the target data layout string and methods for querying it. ...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
This class represents lattice values for constants.
A Module instance is used to store all the information related to an LLVM module. ...
bool needsWaveLimiter(const Function *F) const
static cl::opt< unsigned > LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden, cl::desc("Kernel limit wave threshold in %"))
Address space for 32-bit constant memory.
bool runOnFunction(Function &F) override
runOnFunction - Virtual method overridden by subclasses to do the per-function processing of the pass...
STATISTIC(NumFunctions, "Total number of functions")
LLVM_NODISCARD bool empty() const
const_iterator begin() const
Address space for constant memory (VTX2)
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
Type * getType() const
All values are typed, get the type of this value.
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset...
static cl::opt< unsigned > LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden, cl::desc("Large stride memory access weight"))
static cl::opt< unsigned > LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden, cl::desc("Large stride memory access threshold"))
amdgpu Simplify well known AMD library false Value * Callee
static bool runOnFunction(Function &F, bool PostInlining)
initializer< Ty > init(const Ty &Val)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Address space for flat memory.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
bool isEntryFunctionCC(CallingConv::ID CC)
char & AMDGPUPerfHintAnalysisID
Address space for local memory.
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static cl::opt< unsigned > IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden, cl::desc("Indirect access memory instruction weight"))
INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE, "Analysis if a function is memory bound", true, true) namespace
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Address space for global memory (RAT0, VTX0).
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
static cl::opt< unsigned > MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden, cl::desc("Function mem bound threshold in %"))
Module.h This file contains the declarations for the Module class.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
TargetSubtargetInfo - Generic base class for all target subtargets.
bool isMemoryBound(const Function *F) const
StringRef getName() const
Return a constant reference to the value's name.
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
FunTy * getCalledFunction() const
Return the function being called if this is a direct call, otherwise return null (if it's an indirect...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Module * getParent()
Get the module that this global value is contained inside of...
LLVM Value Representation.
This class implements an extremely fast bulk output stream that can only output to a stream...
Primary interface to the complete machine description for the target machine.
Simple wrapper around std::function<void(raw_ostream&)>.
Analyzes if a function is potentially memory bound and if a kernel may benefit from limiting numb...
This file describes how to lower LLVM code to machine code.