//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"
GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= R600Subtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}
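// Illustrative note (not part of upstream LLVM): subtarget feature strings
// are applied left to right, with later entries overriding earlier ones.
// Appending the caller-supplied FS after the defaults therefore lets users
// override them; e.g. a hypothetical FS of "-promote-alloca" produces
// "+promote-alloca,+dx10-clamp,-promote-alloca", and the trailing "-" wins.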
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +/-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}
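// Illustrative note (not part of upstream LLVM): the same last-entry-wins
// rule applies here, so on an amdhsa triple a front end can still pass e.g.
// "-enable-prt-strict-null" or "-flat-for-global" in FS to switch an
// individual default off without disturbing the other defaults assembled
// into FullFS above.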
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDotInsts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
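// Worked example (illustrative, assuming a GCN part with 32768 bytes of LDS,
// getMaxWavesPerEU() == 10 and getMaxWorkGroupsPerCU(256) == 4): a kernel
// with a 256-thread flat work group asking for NWaves == 5 may use up to
// 32768 * 10 / 4 / 5 == 16384 bytes of LDS per work group before occupancy
// falls below five waves.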
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}
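// Illustrative note: with the GCN wavefront size of 64, compute-like calling
// conventions default to a 128-256 thread range (2 and 4 wavefronts),
// graphics shader stages to 1-64, and everything else to 1-1024.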
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is not greater than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
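// Usage sketch (illustrative): a front end requests an explicit range via a
// string function attribute in the IR, e.g.
//   attributes #0 = { "amdgpu-flat-work-group-size"="64,128" }
// Requests with min > max, or falling outside
// [getMinFlatWorkGroupSize(), getMaxFlatWorkGroupSize()], fall back to the
// defaults computed above.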
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is not greater than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
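// Usage sketch (illustrative): the matching attribute is
//   attributes #0 = { "amdgpu-waves-per-eu"="2,4" }
// where the maximum may be omitted (OnlyFirstRequired is passed as true
// above). A request is discarded if it contradicts the range implied by an
// explicitly requested flat work group size.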
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
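// Result sketch (illustrative): for a kernel carrying
// !reqd_work_group_size !{i32 256, i32 1, i32 1}, a call to
// llvm.amdgcn.workitem.id.x receives !range !{i32 0, i32 256} (an ID lies in
// [0, MaxSize)), while a local-size query receives !range !{i32 256, i32 257}
// because range metadata is half-open.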
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}
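// Worked example (illustrative, assuming an HSA target where
// getExplicitKernelArgOffset() is 0 and there are no implicit arguments):
// for kernel arguments (float addrspace(1)*, i32) with 64-bit pointers,
// ExplicitArgBytes = alignTo(0, 8) + 8 = 8, then alignTo(8, 4) + 4 = 12,
// MaxAlign = 8, and getKernArgSegmentSize returns alignTo(12, 4) = 12.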
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}
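// Illustrative note: these cutoffs are the per-SIMD SGPR file divided by the
// wave count, rounded down to the allocation granularity. VOLCANIC_ISLANDS
// and newer have a larger physical SGPR file (e.g. 800 / 10 -> 80 for ten
// waves), hence the separate table; older parts get 512 / 10 -> 48 after
// rounding down to the granularity of 8.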
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace
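// Note: MemOpClusterMutation is registered as a post-RA DAG mutation below.
// Walking SUnits in original instruction order, it adds artificial edges so
// that adjacent memory operations of the same class (VMEM, FLAT, SMRD, DS)
// stay together, preserving clusters formed by earlier scheduling.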
void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}