LLVM  8.0.1
AMDGPULowerKernelArguments.cpp
Go to the documentation of this file.
1 //===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file This pass replaces accesses to kernel arguments with loads from
11 /// offsets from the kernarg base pointer.
12 //
13 //===----------------------------------------------------------------------===//
14 
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
39 
40 #define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
41 
42 using namespace llvm;
43 
44 namespace {
45 
46 class AMDGPULowerKernelArguments : public FunctionPass{
47 public:
48  static char ID;
49 
50  AMDGPULowerKernelArguments() : FunctionPass(ID) {}
51 
52  bool runOnFunction(Function &F) override;
53 
54  void getAnalysisUsage(AnalysisUsage &AU) const override {
56  AU.setPreservesAll();
57  }
58 };
59 
60 } // end anonymous namespace
61 
64  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
65  return false;
66 
67  auto &TPC = getAnalysis<TargetPassConfig>();
68 
69  const TargetMachine &TM = TPC.getTM<TargetMachine>();
70  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
71  LLVMContext &Ctx = F.getParent()->getContext();
72  const DataLayout &DL = F.getParent()->getDataLayout();
73  BasicBlock &EntryBlock = *F.begin();
74  IRBuilder<> Builder(&*EntryBlock.begin());
75 
76  const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
77  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
78 
79  unsigned MaxAlign;
80  // FIXME: Alignment is broken broken with explicit arg offset.;
81  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
82  if (TotalKernArgSize == 0)
83  return false;
84 
85  CallInst *KernArgSegment =
86  Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
87  nullptr, F.getName() + ".kernarg.segment");
88 
91  Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
92 
93  unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
94  uint64_t ExplicitArgOffset = 0;
95 
96  for (Argument &Arg : F.args()) {
97  Type *ArgTy = Arg.getType();
98  unsigned Align = DL.getABITypeAlignment(ArgTy);
99  unsigned Size = DL.getTypeSizeInBits(ArgTy);
100  unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
101 
102  uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
103  ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
104 
105  if (Arg.use_empty())
106  continue;
107 
108  if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
109  // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
110  // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
111  // can't represent this with range metadata because it's only allowed for
112  // integer types.
113  if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
114  ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
115  continue;
116 
117  // FIXME: We can replace this with equivalent alias.scope/noalias
118  // metadata, but this appears to be a lot of work.
119  if (Arg.hasNoAliasAttr())
120  continue;
121  }
122 
123  VectorType *VT = dyn_cast<VectorType>(ArgTy);
124  bool IsV3 = VT && VT->getNumElements() == 3;
125  bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
126 
127  VectorType *V4Ty = nullptr;
128 
129  int64_t AlignDownOffset = alignDown(EltOffset, 4);
130  int64_t OffsetDiff = EltOffset - AlignDownOffset;
131  unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset,
132  KernArgBaseAlign);
133 
134  Value *ArgPtr;
135  if (DoShiftOpt) { // FIXME: Handle aggregate types
136  // Since we don't have sub-dword scalar loads, avoid doing an extload by
137  // loading earlier than the argument address, and extracting the relevant
138  // bits.
139  //
140  // Additionally widen any sub-dword load to i32 even if suitably aligned,
141  // so that CSE between different argument loads works easily.
142 
143  ArgPtr = Builder.CreateConstInBoundsGEP1_64(
144  KernArgSegment,
145  AlignDownOffset,
146  Arg.getName() + ".kernarg.offset.align.down");
147  ArgPtr = Builder.CreateBitCast(ArgPtr,
148  Builder.getInt32Ty()->getPointerTo(AS),
149  ArgPtr->getName() + ".cast");
150  } else {
151  ArgPtr = Builder.CreateConstInBoundsGEP1_64(
152  KernArgSegment,
153  EltOffset,
154  Arg.getName() + ".kernarg.offset");
155  ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
156  ArgPtr->getName() + ".cast");
157  }
158 
159  if (IsV3 && Size >= 32) {
160  V4Ty = VectorType::get(VT->getVectorElementType(), 4);
161  // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
162  ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
163  }
164 
165  LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
167 
168  MDBuilder MDB(Ctx);
169 
170  if (isa<PointerType>(ArgTy)) {
171  if (Arg.hasNonNullAttr())
173 
174  uint64_t DerefBytes = Arg.getDereferenceableBytes();
175  if (DerefBytes != 0) {
176  Load->setMetadata(
178  MDNode::get(Ctx,
179  MDB.createConstant(
180  ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
181  }
182 
183  uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
184  if (DerefOrNullBytes != 0) {
185  Load->setMetadata(
187  MDNode::get(Ctx,
188  MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
189  DerefOrNullBytes))));
190  }
191 
192  unsigned ParamAlign = Arg.getParamAlignment();
193  if (ParamAlign != 0) {
194  Load->setMetadata(
196  MDNode::get(Ctx,
197  MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
198  ParamAlign))));
199  }
200  }
201 
202  // TODO: Convert noalias arg to !noalias
203 
204  if (DoShiftOpt) {
205  Value *ExtractBits = OffsetDiff == 0 ?
206  Load : Builder.CreateLShr(Load, OffsetDiff * 8);
207 
208  IntegerType *ArgIntTy = Builder.getIntNTy(Size);
209  Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
210  Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
211  Arg.getName() + ".load");
212  Arg.replaceAllUsesWith(NewVal);
213  } else if (IsV3) {
214  Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
215  {0, 1, 2},
216  Arg.getName() + ".load");
217  Arg.replaceAllUsesWith(Shuf);
218  } else {
219  Load->setName(Arg.getName() + ".load");
220  Arg.replaceAllUsesWith(Load);
221  }
222  }
223 
224  KernArgSegment->addAttribute(
226  Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
227 
228  return true;
229 }
230 
231 INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
232  "AMDGPU Lower Kernel Arguments", false, false)
234  false, false)
235 
236 char AMDGPULowerKernelArguments::ID = 0;
237 
239  return new AMDGPULowerKernelArguments();
240 }
Type * getVectorElementType() const
Definition: Type.h:371
A parsed version of the target data layout string, and methods for querying it. ...
Definition: DataLayout.h:111
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
This class represents an incoming formal argument to a Function.
Definition: Argument.h:30
AMDGPU specific subclass of TargetSubtarget.
This class represents lattice values for constants.
Definition: AllocatorList.h:24
ConstantAsMetadata * createConstant(Constant *C)
Return the given constant as metadata.
Definition: MDBuilder.cpp:25
static Attribute getWithAlignment(LLVMContext &Context, uint64_t Align)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:125
static Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes)
Definition: Attributes.cpp:138
This class represents a function call, abstracting a target machine's calling convention.
This file contains the declarations for metadata subclasses.
#define DEBUG_TYPE
F(f)
uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the next integer (mod 2**64) that is greater than or equal to Value and is a multiple of Alig...
Definition: MathExtras.h:685
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:503
An instruction for reading from memory.
Definition: Instructions.h:168
void addAttribute(unsigned i, Attribute::AttrKind Kind)
adds the attribute to the list of attributes.
Definition: InstrTypes.h:1261
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:269
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:718
AnalysisUsage & addRequired()
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:371
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:244
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:652
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:743
This file contains the simple types necessary to represent the attributes associated with functions a...
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:285
uint64_t getNumElements() const
Definition: DerivedTypes.h:359
Target-Independent Code Generator Pass Configuration Options.
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
bool arg_empty() const
Definition: Function.h:699
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:429
iterator begin()
Definition: Function.h:656
Class to represent pointers.
Definition: DerivedTypes.h:467
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition: MathExtras.h:610
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata *> MDs)
Definition: Metadata.h:1166
static bool runOnFunction(Function &F, bool PostInlining)
FunctionPass * createAMDGPULowerKernelArgumentsPass()
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:69
This file contains the declarations for the subclasses of Constant, which represent the different fla...
AMDGPU Lower Kernel Arguments
Represent the analysis usage information of a pass.
Address space for local memory.
Definition: AMDGPU.h:260
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:285
Class to represent integer types.
Definition: DerivedTypes.h:40
The AMDGPU TargetMachine interface definition for hw codegen targets.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1415
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1226
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments", false, false) INITIALIZE_PASS_END(AMDGPULowerKernelArguments
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:213
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:730
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition: Type.h:258
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:180
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:622
Class to represent vector types.
Definition: DerivedTypes.h:393
void setPreservesAll()
Set by analyses that do not transform their input at all.
amdgpu Simplify well known AMD library false Value Value * Arg
uint64_t getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:568
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:436
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:214
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
uint32_t Size
Definition: Profile.cpp:47
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:566
LLVM Value Representation.
Definition: Value.h:73
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:606
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:59
bool use_empty() const
Definition: Value.h:323
Calling convention for AMDGPU code object kernels.
Definition: CallingConv.h:201
iterator_range< arg_iterator > args()
Definition: Function.h:689