LLVM 8.0.1
R600MachineScheduler.cpp
//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "R600MachineScheduler.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
  DAG = static_cast<ScheduleDAGMILive*>(dag);
  const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
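  // Cayman parts are VLIW4 and have no fifth (Trans) slot; every other part
  // handled here is treated as VLIW5.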
  VLIW5 = !ST.hasCaymanISA();
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
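  // OccupedSlotsMask tracks the current instruction group: bits 0-3 are the
  // X/Y/Z/W vector slots and bit 4 is the Trans slot. Starting fully set
  // forces the first ALU pick to open a fresh group.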
  OccupedSlotsMask = 31;
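  // InstKindLimit bounds how many instructions of one kind are emitted before
  // we consider switching clause type; the ALU and fetch limits follow the
  // hardware clause sizes.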
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;
  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
  AluInstCount = 0;
  FetchInstCount = 0;
}

void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
                                  std::vector<SUnit *> &QDst)
{
  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
  QSrc.clear();
}

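// Rough upper bound on how many wavefronts can be kept in flight for a given
// per-wavefront GPR requirement; 248 approximates the usable GPRs per SIMD
// (assumption: a few of the 256 physical GPRs are reserved).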
static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert (GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}

SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = nullptr;
  NextInstKind = IDOther;

  IsTopNode = false;

  // Check if we might want to switch the current clause type.
  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
      (Available[CurInstKind].empty());
  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
      (!Available[IDFetch].empty() || !Available[IDOther].empty());

  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
    // We use the heuristic provided by the AMD Accelerated Parallel Processing
    // OpenCL Programming Guide:
    // The approximate number of WF that allows TEX instructions to hide ALU
    // instructions is: 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
    float ALUFetchRationEstimate =
        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
        (FetchInstCount + Available[IDFetch].size());
    if (ALUFetchRationEstimate == 0) {
      AllowSwitchFromAlu = true;
    } else {
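      // 62.5 = 500 (TEX cycles) / 8 (ALU cycles), from the heuristic above.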
      unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
      LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
      // We assume the local GPR requirements to be "dominated" by the
      // requirement of the TEX clause (which consumes 128-bit regs); ALU
      // instructions before and after TEX are indeed likely to consume or
      // generate values from/for the TEX clause.
      // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause.
      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
      // one GPR) or TmXYZW = TnXYZW (need 2 GPRs).
      // (TODO: use RegisterPressure)
      // If we are going to use too many GPRs, we flush the Fetch instructions
      // to lower register pressure on 128-bit regs.
      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
        AllowSwitchFromAlu = true;
    }
  }

  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
    // try to pick ALU
    SU = pickAlu();
    if (!SU && !PhysicalRegCopy.empty()) {
      SU = PhysicalRegCopy.front();
      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
    }
    if (SU) {
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // try to pick FETCH
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // try to pick other
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  LLVM_DEBUG(if (SU) {
    dbgs() << " ** Pick node **\n";
    DAG->dumpNode(*SU);
  } else {
    dbgs() << "NO NODE \n";
    for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
      const SUnit &S = DAG->SUnits[i];
      if (!S.isScheduled)
        DAG->dumpNode(S);
    }
  });

  return SU;
}

void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
      OccupedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind == IDAlu) {
    AluInstCount++;
    switch (getAluKind(SU)) {
    case AluT_XYZW:
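      // A full-vector instruction occupies all four X/Y/Z/W slots, so it
      // counts as four emitted ALU instructions against the clause limit.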
      CurEmitted += 4;
      break;
    case AluDiscarded:
      break;
    default: {
      ++CurEmitted;
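      // Each ALU literal operand consumes an extra slot in the clause, so
      // count those toward the emitted-instruction budget as well.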
      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
          E = SU->getInstr()->operands_end(); It != E; ++It) {
        MachineOperand &MO = *It;
        if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
          ++CurEmitted;
      }
    }
    }
  } else {
    ++CurEmitted;
  }

  LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

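  // Outside of a fetch clause, release newly ready fetch instructions so they
  // become available; inside one, just count the fetched instructions.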
  if (CurInstKind != IDFetch) {
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  } else
    FetchInstCount++;
}

static bool
isPhysicalRegCopy(MachineInstr *MI) {
  if (MI->getOpcode() != R600::COPY)
    return false;

  return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
}

void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Top Releasing "; DAG->dumpNode(*SU));
}

void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Bottom Releasing "; DAG->dumpNode(*SU));
  if (isPhysicalRegCopy(SU->getInstr())) {
    PhysicalRegCopy.push_back(SU);
    return;
  }

  int IK = getInstKind(SU);

  // There is no export clause, so we can schedule one as soon as it's ready.
  if (IK == IDOther)
    Available[IDOther].push_back(SU);
  else
    Pending[IK].push_back(SU);
}

bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
                                          const TargetRegisterClass *RC) const {
  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
    return RC->contains(Reg);
  } else {
    return MRI->getRegClass(Reg) == RC;
  }
}

R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  if (TII->isTransOnly(*MI))
    return AluTrans;

  switch (MI->getOpcode()) {
  case R600::PRED_X:
    return AluPredX;
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return AluT_XYZW;
  case R600::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL; don't consider it in scheduling.
      return AluDiscarded;
    }
    break;
  default:
    break;
  }

  // Does the instruction take a whole IG?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction()?
  if (TII->isVector(*MI) ||
      TII->isCubeOp(MI->getOpcode()) ||
      TII->isReductionOp(MI->getOpcode()) ||
      MI->getOpcode() == R600::GROUP_BARRIER) {
    return AluT_XYZW;
  }

  if (TII->isLDSInstr(MI->getOpcode())) {
    return AluT_X;
  }

  // Is the result already assigned to a channel?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case R600::sub0:
    return AluT_X;
  case R600::sub1:
    return AluT_Y;
  case R600::sub2:
    return AluT_Z;
  case R600::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already a member of an X/Y/Z/W class?
  unsigned DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
    return AluT_XYZW;

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(*MI))
    return AluT_XYZW;

  return AluAny;
}

int R600SchedStrategy::getInstKind(SUnit* SU) {
  int Opcode = SU->getInstr()->getOpcode();

  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
    return IDFetch;

  if (TII->isALUInstr(Opcode)) {
    return IDAlu;
  }

  switch (Opcode) {
  case R600::PRED_X:
  case R600::COPY:
  case R600::CONST_COPY:
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return IDAlu;
  default:
    return IDOther;
  }
}

SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  if (Q.empty())
    return nullptr;
  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
       It != E; ++It) {
    SUnit *SU = *It;
    InstructionsGroupCandidate.push_back(SU->getInstr());
    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
        (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
      InstructionsGroupCandidate.pop_back();
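      // (It + 1).base() is the forward iterator that refers to *It; use it to
      // erase the chosen unit from the queue.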
      Q.erase((It + 1).base());
      return SU;
    } else {
      InstructionsGroupCandidate.pop_back();
    }
  }
  return nullptr;
}

void R600SchedStrategy::LoadAlu() {
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
    AluKind AK = getAluKind(QSrc[i]);
    AvailableAlus[AK].push_back(QSrc[i]);
  }
  QSrc.clear();
}

void R600SchedStrategy::PrepareNextSlot() {
  LLVM_DEBUG(dbgs() << "New Slot\n");
  assert (OccupedSlotsMask && "Slot wasn't filled");
  OccupedSlotsMask = 0;
//  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
//    OccupedSlotsMask |= 16;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}

void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
  int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
  if (DstIndex == -1) {
    return;
  }
  unsigned DestReg = MI->getOperand(DstIndex).getReg();
  // PressureRegister crashes if an operand is def'd and used in the same
  // instruction and we try to constrain its regclass.
  for (MachineInstr::mop_iterator It = MI->operands_begin(),
      E = MI->operands_end(); It != E; ++It) {
    MachineOperand &MO = *It;
    if (MO.isReg() && !MO.isDef() &&
        MO.getReg() == DestReg)
      return;
  }
  // Constrains the regclass of DestReg to assign it to Slot.
  switch (Slot) {
  case 0:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
    break;
  case 1:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
    break;
  case 2:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
    break;
  case 3:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
    break;
  }
}

SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
  if (SlotedSU)
    return SlotedSU;
  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
  if (UnslotedSU)
    AssignSlot(UnslotedSU->getInstr(), Slot);
  return UnslotedSU;
}

unsigned R600SchedStrategy::AvailablesAluCount() const {
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
         AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
         AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
         AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
         AvailableAlus[AluPredX].size();
}

SUnit* R600SchedStrategy::pickAlu() {
  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    if (!OccupedSlotsMask) {
      // Bottom-up scheduling: PRED_X must come first.
      if (!AvailableAlus[AluPredX].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX], false);
      }
      // Flush physical reg copies (RA will discard them).
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded], false);
      }
      // If there is a T_XYZW alu available, use it.
      if (!AvailableAlus[AluT_XYZW].empty()) {
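        // 15 marks the four X/Y/Z/W slots as occupied while leaving the Trans
        // slot free for a later Trans-capable pick.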
        OccupedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW], false);
      }
    }
    bool TransSlotOccuped = OccupedSlotsMask & 16;
    if (!TransSlotOccuped && VLIW5) {
      if (!AvailableAlus[AluTrans].empty()) {
        OccupedSlotsMask |= 16;
        return PopInst(AvailableAlus[AluTrans], false);
      }
      SUnit *SU = AttemptFillSlot(3, true);
      if (SU) {
        OccupedSlotsMask |= 16;
        return SU;
      }
    }
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan, false);
        if (SU) {
          OccupedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    PrepareNextSlot();
  }
  return nullptr;
}

SUnit* R600SchedStrategy::pickOther(int QID) {
  SUnit *SU = nullptr;
  std::vector<SUnit *> &AQ = Available[QID];

  if (AQ.empty()) {
    MoveUnits(Pending[QID], AQ);
  }
  if (!AQ.empty()) {
    SU = AQ.back();
    AQ.pop_back();
  }
  return SU;
}