LLVM 8.0.1
SIInsertWaitcnts.cpp
1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// Insert wait instructions for memory reads and writes.
12 ///
13 /// Memory reads and writes are issued asynchronously, so we need to insert
14 /// S_WAITCNT instructions when we want to access any of their results or
15 /// overwrite any register that's used asynchronously.
16 ///
17 /// TODO: This pass currently keeps one timeline per hardware counter. A more
18 /// finely-grained approach that keeps one timeline per event type could
19 /// sometimes get away with generating weaker s_waitcnt instructions. For
20 /// example, when both SMEM and LDS are in flight and we need to wait for
21 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
22 /// but the pass will currently generate a conservative lgkmcnt(0) because
23 /// multiple event types are in flight.
24 //
25 //===----------------------------------------------------------------------===//
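//
// Illustrative example (added for clarity, not part of the original source):
// a VMEM load completes asynchronously, so any read of its destination VGPR
// must be preceded by an s_waitcnt that drains vmcnt far enough for the load
// to have completed, e.g.:
//
//   buffer_load_dword v0, v1, s[0:3], 0 offen   ; outstanding, vmcnt += 1
//   s_waitcnt vmcnt(0)                          ; inserted by this pass
//   v_add_f32 v2, v0, v0                        ; v0 is now safe to read
//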
26 
27 #include "AMDGPU.h"
28 #include "AMDGPUSubtarget.h"
29 #include "SIDefines.h"
30 #include "SIInstrInfo.h"
31 #include "SIMachineFunctionInfo.h"
32 #include "SIRegisterInfo.h"
33 #include "Utils/AMDGPUBaseInfo.h"
34 #include "llvm/ADT/DenseMap.h"
35 #include "llvm/ADT/DenseSet.h"
36 #include "llvm/ADT/PostOrderIterator.h"
37 #include "llvm/ADT/STLExtras.h"
38 #include "llvm/ADT/SmallVector.h"
39 #include "llvm/CodeGen/MachineBasicBlock.h"
40 #include "llvm/CodeGen/MachineFunction.h"
41 #include "llvm/CodeGen/MachineFunctionPass.h"
42 #include "llvm/CodeGen/MachineInstr.h"
43 #include "llvm/CodeGen/MachineInstrBuilder.h"
44 #include "llvm/CodeGen/MachineMemOperand.h"
45 #include "llvm/CodeGen/MachineOperand.h"
46 #include "llvm/CodeGen/MachineRegisterInfo.h"
47 #include "llvm/IR/DebugLoc.h"
48 #include "llvm/Pass.h"
49 #include "llvm/Support/Debug.h"
50 #include "llvm/Support/DebugCounter.h"
51 #include "llvm/Support/ErrorHandling.h"
52 #include "llvm/Support/raw_ostream.h"
53 #include <algorithm>
54 #include <cassert>
55 #include <cstdint>
56 #include <cstring>
57 #include <memory>
58 #include <utility>
59 #include <vector>
60 
61 using namespace llvm;
62 
63 #define DEBUG_TYPE "si-insert-waitcnts"
64 
65 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
66  "Force emit s_waitcnt expcnt(0) instrs");
67 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
68  "Force emit s_waitcnt lgkmcnt(0) instrs");
69 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
70  "Force emit s_waitcnt vmcnt(0) instrs");
71 
72 static cl::opt<unsigned> ForceEmitZeroFlag(
73  "amdgpu-waitcnt-forcezero",
74  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
75  cl::init(0), cl::Hidden);
76 
77 namespace {
78 
79 template <typename EnumT>
80 class enum_iterator
81  : public iterator_facade_base<enum_iterator<EnumT>,
82  std::forward_iterator_tag, const EnumT> {
83  EnumT Value;
84 public:
85  enum_iterator() = default;
86  enum_iterator(EnumT Value) : Value(Value) {}
87 
88  enum_iterator &operator++() {
89  Value = static_cast<EnumT>(Value + 1);
90  return *this;
91  }
92 
93  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
94 
95  EnumT operator*() const { return Value; }
96 };
97 
98 // Class of object that encapsulates the latest instruction counter score
99 // associated with the operand. Used for determining whether an
100 // s_waitcnt instruction needs to be emitted.
101 
102 #define CNT_MASK(t) (1u << (t))
103 
104 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
105 
106 iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
107  return make_range(enum_iterator<InstCounterType>(VM_CNT),
108  enum_iterator<InstCounterType>(NUM_INST_CNTS));
109 }
110 
111 using RegInterval = std::pair<signed, signed>;
112 
113 struct {
114  uint32_t VmcntMax;
115  uint32_t ExpcntMax;
116  uint32_t LgkmcntMax;
117  int32_t NumVGPRsMax;
118  int32_t NumSGPRsMax;
119 } HardwareLimits;
120 
121 struct {
122  unsigned VGPR0;
123  unsigned VGPRL;
124  unsigned SGPR0;
125  unsigned SGPRL;
126 } RegisterEncoding;
127 
129  VMEM_ACCESS, // vector-memory read & write
130  LDS_ACCESS, // lds read & write
131  GDS_ACCESS, // gds read & write
132  SQ_MESSAGE, // send message
133  SMEM_ACCESS, // scalar-memory read & write
134  EXP_GPR_LOCK, // export holding on its data src
135  GDS_GPR_LOCK, // GDS holding on its data and addr src
136  EXP_POS_ACCESS, // write to export position
137  EXP_PARAM_ACCESS, // write to export parameter
138  VMW_GPR_LOCK, // vector-memory write holding on its data src
139  NUM_WAIT_EVENTS,
140 };
141 
142 static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
143  (1 << VMEM_ACCESS),
144  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
145  (1 << SQ_MESSAGE),
146  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
147  (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
148 };
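// For example (illustration added for clarity, not in the original source):
// an LDS read raises LDS_ACCESS, which the mask above maps to LGKM_CNT,
// while an export of a position attribute raises EXP_POS_ACCESS, which is
// mapped to EXP_CNT.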
149 
150 // The mapping is:
151 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
152 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
153 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
154 // We reserve a fixed number of VGPR slots in the scoring tables for
155 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
156 enum RegisterMapping {
157  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
158  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
159  NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
160  EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
161  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
162 };
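// Example of the resulting slot assignment (editorial illustration, not in
// the original source): VGPR v5 is tracked in slot 5, the LDS token occupies
// slot SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS == 256, and SGPR s3 is tracked in
// slot NUM_ALL_VGPRS + 3 == 260.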
163 
164 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
165  switch (T) {
166  case VM_CNT:
167  Wait.VmCnt = std::min(Wait.VmCnt, Count);
168  break;
169  case EXP_CNT:
170  Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
171  break;
172  case LGKM_CNT:
173  Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
174  break;
175  default:
176  llvm_unreachable("bad InstCounterType");
177  }
178 }
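// For instance (illustration, not in the original source): requesting
// vmcnt(2) and later vmcnt(0) on the same Waitcnt leaves VmCnt == 0, since
// addWait keeps the tighter (smaller) count.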
179 
180 // This object maintains the current score brackets of each wait counter, and
181 // a per-register scoreboard for each wait counter.
182 //
183 // We also maintain the latest score for every event type that can change the
184 // waitcnt in order to know whether there are multiple types of events within
185 // the brackets. When multiple types of events occur within a bracket, the
186 // wait count may be decremented out of order, so we need to insert an
187 // "s_waitcnt 0" before the use.
188 class WaitcntBrackets {
189 public:
190  WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
191  for (auto T : inst_counter_types())
192  memset(VgprScores[T], 0, sizeof(VgprScores[T]));
193  }
194 
195  static uint32_t getWaitCountMax(InstCounterType T) {
196  switch (T) {
197  case VM_CNT:
198  return HardwareLimits.VmcntMax;
199  case LGKM_CNT:
200  return HardwareLimits.LgkmcntMax;
201  case EXP_CNT:
202  return HardwareLimits.ExpcntMax;
203  default:
204  break;
205  }
206  return 0;
207  }
208 
209  uint32_t getScoreLB(InstCounterType T) const {
210  assert(T < NUM_INST_CNTS);
211  if (T >= NUM_INST_CNTS)
212  return 0;
213  return ScoreLBs[T];
214  }
215 
216  uint32_t getScoreUB(InstCounterType T) const {
217  assert(T < NUM_INST_CNTS);
218  if (T >= NUM_INST_CNTS)
219  return 0;
220  return ScoreUBs[T];
221  }
222 
223  // Mapping from event to counter.
224  InstCounterType eventCounter(WaitEventType E) {
225  if (E == VMEM_ACCESS)
226  return VM_CNT;
227  if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
228  return LGKM_CNT;
229  assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
230  return EXP_CNT;
231  }
232 
233  uint32_t getRegScore(int GprNo, InstCounterType T) {
234  if (GprNo < NUM_ALL_VGPRS) {
235  return VgprScores[T][GprNo];
236  }
237  assert(T == LGKM_CNT);
238  return SgprScores[GprNo - NUM_ALL_VGPRS];
239  }
240 
241  void clear() {
242  memset(ScoreLBs, 0, sizeof(ScoreLBs));
243  memset(ScoreUBs, 0, sizeof(ScoreUBs));
244  PendingEvents = 0;
245  memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
246  for (auto T : inst_counter_types())
247  memset(VgprScores[T], 0, sizeof(VgprScores[T]));
248  memset(SgprScores, 0, sizeof(SgprScores));
249  }
250 
251  bool merge(const WaitcntBrackets &Other);
252 
253  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
254  const MachineRegisterInfo *MRI,
255  const SIRegisterInfo *TRI, unsigned OpNo,
256  bool Def) const;
257 
258  int32_t getMaxVGPR() const { return VgprUB; }
259  int32_t getMaxSGPR() const { return SgprUB; }
260 
261  bool counterOutOfOrder(InstCounterType T) const;
262  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
263  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
264  void determineWait(InstCounterType T, uint32_t ScoreToWait,
265  AMDGPU::Waitcnt &Wait) const;
266  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
267  void applyWaitcnt(InstCounterType T, unsigned Count);
268  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
269  const MachineRegisterInfo *MRI, WaitEventType E,
270  MachineInstr &MI);
271 
272  bool hasPending() const { return PendingEvents != 0; }
273  bool hasPendingEvent(WaitEventType E) const {
274  return PendingEvents & (1 << E);
275  }
276 
277  bool hasPendingFlat() const {
278  return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
279  LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
280  (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
281  LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
282  }
283 
284  void setPendingFlat() {
285  LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
286  LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
287  }
288 
289  void print(raw_ostream &);
290  void dump() { print(dbgs()); }
291 
292 private:
293  struct MergeInfo {
294  uint32_t OldLB;
295  uint32_t OtherLB;
296  uint32_t MyShift;
297  uint32_t OtherShift;
298  };
299  static bool mergeScore(const MergeInfo &M, uint32_t &Score,
300  uint32_t OtherScore);
301 
302  void setScoreLB(InstCounterType T, uint32_t Val) {
303  assert(T < NUM_INST_CNTS);
304  if (T >= NUM_INST_CNTS)
305  return;
306  ScoreLBs[T] = Val;
307  }
308 
309  void setScoreUB(InstCounterType T, uint32_t Val) {
310  assert(T < NUM_INST_CNTS);
311  if (T >= NUM_INST_CNTS)
312  return;
313  ScoreUBs[T] = Val;
314  if (T == EXP_CNT) {
315  uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
316  if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
317  ScoreLBs[T] = UB;
318  }
319  }
320 
321  void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
322  if (GprNo < NUM_ALL_VGPRS) {
323  if (GprNo > VgprUB) {
324  VgprUB = GprNo;
325  }
326  VgprScores[T][GprNo] = Val;
327  } else {
328  assert(T == LGKM_CNT);
329  if (GprNo - NUM_ALL_VGPRS > SgprUB) {
330  SgprUB = GprNo - NUM_ALL_VGPRS;
331  }
332  SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
333  }
334  }
335 
336  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
337  const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
338  unsigned OpNo, uint32_t Val);
339 
340  const GCNSubtarget *ST = nullptr;
341  uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
342  uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
343  uint32_t PendingEvents = 0;
344  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
345  // Remember the last flat memory operation.
346  uint32_t LastFlat[NUM_INST_CNTS] = {0};
347  // wait_cnt scores for every vgpr.
348  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
349  int32_t VgprUB = 0;
350  int32_t SgprUB = 0;
351  uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
352  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
353  uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
354 };
355 
356 class SIInsertWaitcnts : public MachineFunctionPass {
357 private:
358  const GCNSubtarget *ST = nullptr;
359  const SIInstrInfo *TII = nullptr;
360  const SIRegisterInfo *TRI = nullptr;
361  const MachineRegisterInfo *MRI = nullptr;
362  AMDGPU::IsaVersion IV;
363 
364  DenseSet<MachineInstr *> TrackedWaitcntSet;
365  DenseSet<MachineInstr *> VCCZBugHandledSet;
366 
367  struct BlockInfo {
368  MachineBasicBlock *MBB;
369  std::unique_ptr<WaitcntBrackets> Incoming;
370  bool Dirty = true;
371 
372  explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
373  };
374 
375  std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
376  DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;
377 
378  // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
379  // because of amdgpu-waitcnt-forcezero flag
380  bool ForceEmitZeroWaitcnts;
381  bool ForceEmitWaitcnt[NUM_INST_CNTS];
382 
383 public:
384  static char ID;
385 
386  SIInsertWaitcnts() : MachineFunctionPass(ID) {
387  (void)ForceExpCounter;
388  (void)ForceLgkmCounter;
389  (void)ForceVMCounter;
390  }
391 
392  bool runOnMachineFunction(MachineFunction &MF) override;
393 
394  StringRef getPassName() const override {
395  return "SI insert wait instructions";
396  }
397 
398  void getAnalysisUsage(AnalysisUsage &AU) const override {
399  AU.setPreservesCFG();
400  MachineFunctionPass::getAnalysisUsage(AU);
401  }
402 
403  bool isForceEmitWaitcnt() const {
404  for (auto T : inst_counter_types())
405  if (ForceEmitWaitcnt[T])
406  return true;
407  return false;
408  }
409 
410  void setForceEmitWaitcnt() {
411 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
412 // For debug builds, get the debug counter info and adjust if need be
413 #ifndef NDEBUG
414  if (DebugCounter::isCounterSet(ForceExpCounter) &&
415  DebugCounter::shouldExecute(ForceExpCounter)) {
416  ForceEmitWaitcnt[EXP_CNT] = true;
417  } else {
418  ForceEmitWaitcnt[EXP_CNT] = false;
419  }
420 
421  if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
422  DebugCounter::shouldExecute(ForceLgkmCounter)) {
423  ForceEmitWaitcnt[LGKM_CNT] = true;
424  } else {
425  ForceEmitWaitcnt[LGKM_CNT] = false;
426  }
427 
428  if (DebugCounter::isCounterSet(ForceVMCounter) &&
429  DebugCounter::shouldExecute(ForceVMCounter)) {
430  ForceEmitWaitcnt[VM_CNT] = true;
431  } else {
432  ForceEmitWaitcnt[VM_CNT] = false;
433  }
434 #endif // NDEBUG
435  }
436 
437  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
438  bool generateWaitcntInstBefore(MachineInstr &MI,
439  WaitcntBrackets &ScoreBrackets,
440  MachineInstr *OldWaitcntInstr);
441  void updateEventWaitcntAfter(MachineInstr &Inst,
442  WaitcntBrackets *ScoreBrackets);
443  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
444  WaitcntBrackets &ScoreBrackets);
445 };
446 
447 } // end anonymous namespace
448 
449 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
450  const SIInstrInfo *TII,
451  const MachineRegisterInfo *MRI,
452  const SIRegisterInfo *TRI,
453  unsigned OpNo, bool Def) const {
454  const MachineOperand &Op = MI->getOperand(OpNo);
455  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
456  (Def && !Op.isDef()))
457  return {-1, -1};
458 
459  // A use via a PW operand does not need a waitcnt.
460  // A partial write is not a WAW.
461  assert(!Op.getSubReg() || !Op.isUndef());
462 
463  RegInterval Result;
464  const MachineRegisterInfo &MRIA = *MRI;
465 
466  unsigned Reg = TRI->getEncodingValue(Op.getReg());
467 
468  if (TRI->isVGPR(MRIA, Op.getReg())) {
469  assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
470  Result.first = Reg - RegisterEncoding.VGPR0;
471  assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
472  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
473  assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
474  Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
475  assert(Result.first >= NUM_ALL_VGPRS &&
476  Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
477  }
478  // TODO: Handle TTMP
479  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
480  else
481  return {-1, -1};
482 
483  const MachineInstr &MIA = *MI;
484  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
485  unsigned Size = TRI->getRegSizeInBits(*RC);
486  Result.second = Result.first + (Size / 32);
487 
488  return Result;
489 }
490 
491 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
492  const SIInstrInfo *TII,
493  const SIRegisterInfo *TRI,
494  const MachineRegisterInfo *MRI, unsigned OpNo,
495  uint32_t Val) {
496  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
497  LLVM_DEBUG({
498  const MachineOperand &Opnd = MI->getOperand(OpNo);
499  assert(TRI->isVGPR(*MRI, Opnd.getReg()));
500  });
501  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
502  setRegScore(RegNo, EXP_CNT, Val);
503  }
504 }
505 
506 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
507  const SIRegisterInfo *TRI,
508  const MachineRegisterInfo *MRI,
509  WaitEventType E, MachineInstr &Inst) {
510  const MachineRegisterInfo &MRIA = *MRI;
511  InstCounterType T = eventCounter(E);
512  uint32_t CurrScore = getScoreUB(T) + 1;
513  if (CurrScore == 0)
514  report_fatal_error("InsertWaitcnt score wraparound");
515  // PendingEvents and ScoreUB need to be updated regardless of whether this
516  // event changes the score of a register or not.
517  // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
518  if (!hasPendingEvent(E)) {
519  if (PendingEvents & WaitEventMaskForInst[T])
520  MixedPendingEvents[T] = true;
521  PendingEvents |= 1 << E;
522  }
523  setScoreUB(T, CurrScore);
524 
525  if (T == EXP_CNT) {
526  // Put score on the source vgprs. If this is a store, just use those
527  // specific register(s).
528  if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
529  // All GDS operations must protect their address register (same as
530  // export.)
531  if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
532  Inst.getOpcode() != AMDGPU::DS_CONSUME) {
533  setExpScore(
534  &Inst, TII, TRI, MRI,
535  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
536  CurrScore);
537  }
538  if (Inst.mayStore()) {
539  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
540  AMDGPU::OpName::data0) != -1) {
541  setExpScore(
542  &Inst, TII, TRI, MRI,
543  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
544  CurrScore);
545  }
546  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
547  AMDGPU::OpName::data1) != -1) {
548  setExpScore(&Inst, TII, TRI, MRI,
549  AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
550  AMDGPU::OpName::data1),
551  CurrScore);
552  }
553  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
554  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
555  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
556  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
557  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
558  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
559  Inst.getOpcode() != AMDGPU::DS_APPEND &&
560  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
561  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
562  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
563  const MachineOperand &Op = Inst.getOperand(I);
564  if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
565  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
566  }
567  }
568  }
569  } else if (TII->isFLAT(Inst)) {
570  if (Inst.mayStore()) {
571  setExpScore(
572  &Inst, TII, TRI, MRI,
573  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
574  CurrScore);
575  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
576  setExpScore(
577  &Inst, TII, TRI, MRI,
578  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
579  CurrScore);
580  }
581  } else if (TII->isMIMG(Inst)) {
582  if (Inst.mayStore()) {
583  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
584  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
585  setExpScore(
586  &Inst, TII, TRI, MRI,
587  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
588  CurrScore);
589  }
590  } else if (TII->isMTBUF(Inst)) {
591  if (Inst.mayStore()) {
592  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
593  }
594  } else if (TII->isMUBUF(Inst)) {
595  if (Inst.mayStore()) {
596  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
597  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
598  setExpScore(
599  &Inst, TII, TRI, MRI,
600  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
601  CurrScore);
602  }
603  } else {
604  if (TII->isEXP(Inst)) {
605  // For export the destination registers are really temps that
606  // can be used as the actual source after export patching, so
607  // we need to treat them like sources and set the EXP_CNT
608  // score.
609  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
610  MachineOperand &DefMO = Inst.getOperand(I);
611  if (DefMO.isReg() && DefMO.isDef() &&
612  TRI->isVGPR(MRIA, DefMO.getReg())) {
613  setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
614  CurrScore);
615  }
616  }
617  }
618  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
619  MachineOperand &MO = Inst.getOperand(I);
620  if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
621  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
622  }
623  }
624  }
625 #if 0 // TODO: check if this is handled by MUBUF code above.
626  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
627  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
628  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
629  MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
630  unsigned OpNo;//TODO: find the OpNo for this operand;
631  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
632  for (signed RegNo = Interval.first; RegNo < Interval.second;
633  ++RegNo) {
634  setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
635  }
636 #endif
637  } else {
638  // Match the score to the destination registers.
639  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
640  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
641  if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
642  continue;
643  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
644  setRegScore(RegNo, T, CurrScore);
645  }
646  }
647  if (TII->isDS(Inst) && Inst.mayStore()) {
648  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
649  }
650  }
651 }
652 
653 void WaitcntBrackets::print(raw_ostream &OS) {
654  OS << '\n';
655  for (auto T : inst_counter_types()) {
656  uint32_t LB = getScoreLB(T);
657  uint32_t UB = getScoreUB(T);
658 
659  switch (T) {
660  case VM_CNT:
661  OS << " VM_CNT(" << UB - LB << "): ";
662  break;
663  case LGKM_CNT:
664  OS << " LGKM_CNT(" << UB - LB << "): ";
665  break;
666  case EXP_CNT:
667  OS << " EXP_CNT(" << UB - LB << "): ";
668  break;
669  default:
670  OS << " UNKNOWN(" << UB - LB << "): ";
671  break;
672  }
673 
674  if (LB < UB) {
675  // Print vgpr scores.
676  for (int J = 0; J <= getMaxVGPR(); J++) {
677  uint32_t RegScore = getRegScore(J, T);
678  if (RegScore <= LB)
679  continue;
680  uint32_t RelScore = RegScore - LB - 1;
681  if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
682  OS << RelScore << ":v" << J << " ";
683  } else {
684  OS << RelScore << ":ds ";
685  }
686  }
687  // Also need to print sgpr scores for lgkm_cnt.
688  if (T == LGKM_CNT) {
689  for (int J = 0; J <= getMaxSGPR(); J++) {
690  uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
691  if (RegScore <= LB)
692  continue;
693  uint32_t RelScore = RegScore - LB - 1;
694  OS << RelScore << ":s" << J << " ";
695  }
696  }
697  }
698  OS << '\n';
699  }
700  OS << '\n';
701 }
702 
703 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
704 /// whether a waitcnt instruction is needed at all.
705 bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
706  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
707  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
708  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
709 }
710 
711 bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
712  unsigned &Count) const {
713  const uint32_t LB = getScoreLB(T);
714  const uint32_t UB = getScoreUB(T);
715  if (Count < UB && UB - Count > LB)
716  return true;
717 
718  Count = ~0u;
719  return false;
720 }
721 
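// Worked example (added for illustration, not in the original source): with
// ScoreLB == 4, ScoreUB == 10 and ScoreToWait == 7 for VM_CNT, the score
// falls inside the bracket, so determineWait below requests
// vmcnt(UB - ScoreToWait) == vmcnt(3), i.e. wait until at most the three
// younger VMEM operations remain outstanding.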
722 void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
723  AMDGPU::Waitcnt &Wait) const {
724  // If the score of src_operand falls within the bracket, we need an
725  // s_waitcnt instruction.
726  const uint32_t LB = getScoreLB(T);
727  const uint32_t UB = getScoreUB(T);
728  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
729  if ((T == VM_CNT || T == LGKM_CNT) &&
730  hasPendingFlat() &&
731  !ST->hasFlatLgkmVMemCountInOrder()) {
732  // If there is a pending FLAT operation, and this is a VMem or LGKM
733  // waitcnt and the target can report early completion, then we need
734  // to force a waitcnt 0.
735  addWait(Wait, T, 0);
736  } else if (counterOutOfOrder(T)) {
737  // The counter can be decremented out of order when there are multiple
738  // event types in the bracket, so emit an s_waitcnt with a conservative
739  // value of 0 for this counter.
740  addWait(Wait, T, 0);
741  } else {
742  addWait(Wait, T, UB - ScoreToWait);
743  }
744  }
745 }
746 
747 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
748  applyWaitcnt(VM_CNT, Wait.VmCnt);
749  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
750  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
751 }
752 
753 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
754  const uint32_t UB = getScoreUB(T);
755  if (Count >= UB)
756  return;
757  if (Count != 0) {
758  if (counterOutOfOrder(T))
759  return;
760  setScoreLB(T, std::max(getScoreLB(T), UB - Count));
761  } else {
762  setScoreLB(T, UB);
763  MixedPendingEvents[T] = false;
764  PendingEvents &= ~WaitEventMaskForInst[T];
765  }
766 }
767 
768 // When there are multiple types of events in the bracket of a counter, the
769 // decrements may complete out of order.
770 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
771  // Scalar memory reads can always complete out of order.
772  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
773  return true;
774  return MixedPendingEvents[T];
775 }
776 
777 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
778  false)
779 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
780  false)
781 
782 char SIInsertWaitcnts::ID = 0;
783 
784 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
785 
786 FunctionPass *llvm::createSIInsertWaitcntsPass() {
787  return new SIInsertWaitcnts();
788 }
789 
790 static bool readsVCCZ(const MachineInstr &MI) {
791  unsigned Opc = MI.getOpcode();
792  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
793  !MI.getOperand(1).isUndef();
794 }
795 
796 /// Generate the s_waitcnt instruction to be placed before \p MI, if needed.
797 /// Instructions of a given type are returned in order,
798 /// but instructions of different types can complete out of order.
799 /// We rely on this in-order completion
800 /// and simply assign a score to the memory access instructions.
801 /// We keep track of the active "score bracket" to determine
802 /// whether a given memory access requires an s_waitcnt,
803 /// and if so, what the value of each counter should be.
804 /// The "score bracket" is bounded by the lower bound and upper bound
805 /// scores (*_score_LB and *_score_ub respectively).
806 bool SIInsertWaitcnts::generateWaitcntInstBefore(
807  MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
808  MachineInstr *OldWaitcntInstr) {
809  setForceEmitWaitcnt();
810  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
811 
812  if (MI.isDebugInstr())
813  return false;
814 
815  AMDGPU::Waitcnt Wait;
816 
817  // See if this instruction has a forced S_WAITCNT VM.
818  // TODO: Handle other cases of NeedsWaitcntVmBefore()
819  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
820  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
821  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
822  Wait.VmCnt = 0;
823  }
824 
825  // All waits must be resolved at call return.
826  // NOTE: this could be improved with knowledge of all call sites or
827  // with knowledge of the called routines.
828  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
829  MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
830  Wait = AMDGPU::Waitcnt::allZero();
831  }
832  // Resolve vm waits before gs-done.
833  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
834  MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
835  ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
836  AMDGPU::SendMsg::ID_GS_DONE)) {
837  Wait.VmCnt = 0;
838  }
839 #if 0 // TODO: the following blocks of logic when we have fence.
840  else if (MI.getOpcode() == SC_FENCE) {
841  const unsigned int group_size =
842  context->shader_info->GetMaxThreadGroupSize();
843  // group_size == 0 means thread group size is unknown at compile time
844  const bool group_is_multi_wave =
845  (group_size == 0 || group_size > target_info->GetWaveFrontSize());
846  const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
847 
848  for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
849  SCRegType src_type = Inst->GetSrcType(i);
850  switch (src_type) {
851  case SCMEM_LDS:
852  if (group_is_multi_wave ||
853  context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
854  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
855  ScoreBrackets->getScoreUB(LGKM_CNT));
856  // LDS may have to wait for VM_CNT after buffer load to LDS
857  if (target_info->HasBufferLoadToLDS()) {
858  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
859  ScoreBrackets->getScoreUB(VM_CNT));
860  }
861  }
862  break;
863 
864  case SCMEM_GDS:
865  if (group_is_multi_wave || fence_is_global) {
866  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
867  ScoreBrackets->getScoreUB(EXP_CNT));
868  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
869  ScoreBrackets->getScoreUB(LGKM_CNT));
870  }
871  break;
872 
873  case SCMEM_UAV:
874  case SCMEM_TFBUF:
875  case SCMEM_RING:
876  case SCMEM_SCATTER:
877  if (group_is_multi_wave || fence_is_global) {
878  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
879  ScoreBrackets->getScoreUB(EXP_CNT));
880  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
881  ScoreBrackets->getScoreUB(VM_CNT));
882  }
883  break;
884 
885  case SCMEM_SCRATCH:
886  default:
887  break;
888  }
889  }
890  }
891 #endif
892 
893  // Export & GDS instructions do not read the EXEC mask until after the export
894  // is granted (which can occur well after the instruction is issued).
895  // The shader program must flush all EXP operations on the export-count
896  // before overwriting the EXEC mask.
897  else {
898  if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
899  // Export and GDS are tracked individually, either may trigger a waitcnt
900  // for EXEC.
901  if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
902  ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
903  ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
904  ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
905  Wait.ExpCnt = 0;
906  }
907  }
908 
909 #if 0 // TODO: the following code to handle CALL.
910  // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
911  // However, there is a problem with EXP_CNT, because the call cannot
912  // easily tell if a register is used in the function, and if it did, then
913  // the referring instruction would have to have an S_WAITCNT, which is
914  // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
915  // before the call.
916  if (MI.getOpcode() == SC_CALL) {
917  if (ScoreBrackets->getScoreUB(EXP_CNT) >
918  ScoreBrackets->getScoreLB(EXP_CNT)) {
919  ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
920  EmitWaitcnt |= CNT_MASK(EXP_CNT);
921  }
922  }
923 #endif
924 
925  // FIXME: Should not be relying on memoperands.
926  // Look at the source operands of every instruction to see if
927  // any of them results from a previous memory operation that affects
928  // its current usage. If so, an s_waitcnt instruction needs to be
929  // emitted.
930  // If the source operand was defined by a load, add the s_waitcnt
931  // instruction.
932  for (const MachineMemOperand *Memop : MI.memoperands()) {
933  unsigned AS = Memop->getAddrSpace();
934  if (AS != AMDGPUAS::LOCAL_ADDRESS)
935  continue;
936  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
937  // VM_CNT is only relevant to vgpr or LDS.
938  ScoreBrackets.determineWait(
939  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
940  }
941 
942  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
943  const MachineOperand &Op = MI.getOperand(I);
944  const MachineRegisterInfo &MRIA = *MRI;
945  RegInterval Interval =
946  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
947  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
948  if (TRI->isVGPR(MRIA, Op.getReg())) {
949  // VM_CNT is only relevant to vgpr or LDS.
950  ScoreBrackets.determineWait(
951  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
952  }
953  ScoreBrackets.determineWait(
954  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
955  }
956  }
957  // End of for loop that looks at all source operands to decide vm_wait_cnt
958  // and lgk_wait_cnt.
959 
960  // Two cases are handled for destination operands:
961  // 1) If the destination operand was defined by a load, add the s_waitcnt
962  // instruction to guarantee the right WAW order.
963  // 2) If a destination operand was used by a recent export/store instruction,
964  // add an s_waitcnt on exp_cnt to guarantee the WAR order.
965  if (MI.mayStore()) {
966  // FIXME: Should not be relying on memoperands.
967  for (const MachineMemOperand *Memop : MI.memoperands()) {
968  unsigned AS = Memop->getAddrSpace();
969  if (AS != AMDGPUAS::LOCAL_ADDRESS)
970  continue;
971  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
972  ScoreBrackets.determineWait(
973  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
974  ScoreBrackets.determineWait(
975  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
976  }
977  }
978  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
979  const MachineOperand &Def = MI.getOperand(I);
980  const MachineRegisterInfo &MRIA = *MRI;
981  RegInterval Interval =
982  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
983  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
984  if (TRI->isVGPR(MRIA, Def.getReg())) {
985  ScoreBrackets.determineWait(
986  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
987  ScoreBrackets.determineWait(
988  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
989  }
990  ScoreBrackets.determineWait(
991  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
992  }
993  } // End of for loop that looks at all dest operands.
994  }
995 
996  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
997  // occurs before the instruction. Doing it here prevents any additional
998  // S_WAITCNTs from being emitted if the instruction was marked as
999  // requiring a WAITCNT beforehand.
1000  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1001  !ST->hasAutoWaitcntBeforeBarrier()) {
1002  Wait = AMDGPU::Waitcnt::allZero();
1003  }
1004 
1005  // TODO: Remove this work-around, enable the assert for Bug 457939
1006  // after fixing the scheduler. Also, the Shader Compiler code is
1007  // independent of target.
1008  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
1009  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1010  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1011  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1012  Wait.LgkmCnt = 0;
1013  }
1014  }
1015 
1016  // Early-out if no wait is indicated.
1017  if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
1018  bool Modified = false;
1019  if (OldWaitcntInstr) {
1020  if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
1021  TrackedWaitcntSet.erase(OldWaitcntInstr);
1022  OldWaitcntInstr->eraseFromParent();
1023  Modified = true;
1024  } else {
1025  int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
1026  ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1027  }
1028  Modified = true;
1029  }
1030  return Modified;
1031  }
1032 
1033  if (ForceEmitZeroWaitcnts)
1034  Wait = AMDGPU::Waitcnt::allZero();
1035 
1036  if (ForceEmitWaitcnt[VM_CNT])
1037  Wait.VmCnt = 0;
1038  if (ForceEmitWaitcnt[EXP_CNT])
1039  Wait.ExpCnt = 0;
1040  if (ForceEmitWaitcnt[LGKM_CNT])
1041  Wait.LgkmCnt = 0;
1042 
1043  ScoreBrackets.applyWaitcnt(Wait);
1044 
1045  AMDGPU::Waitcnt OldWait;
1046  if (OldWaitcntInstr) {
1047  OldWait =
1048  AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
1049  }
1050  if (OldWait.dominates(Wait))
1051  return false;
1052 
1053  if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
1054  Wait = Wait.combined(OldWait);
1055 
1056  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1057  if (OldWaitcntInstr) {
1058  OldWaitcntInstr->getOperand(0).setImm(Enc);
1059 
1060  LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
1061  << "Old Instr: " << MI << '\n'
1062  << "New Instr: " << *OldWaitcntInstr << '\n');
1063  } else {
1064  auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1065  MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1066  .addImm(Enc);
1067  TrackedWaitcntSet.insert(SWaitInst);
1068 
1069  LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
1070  << "Old Instr: " << MI << '\n'
1071  << "New Instr: " << *SWaitInst << '\n');
1072  }
1073 
1074  return true;
1075 }
1076 
1077 // This is a flat memory operation. Check to see if it has memory tokens
1078 // for LDS or flat memory, and if so mark it as possibly accessing LDS.
1079 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1080  if (MI.memoperands_empty())
1081  return true;
1082 
1083  for (const MachineMemOperand *Memop : MI.memoperands()) {
1084  unsigned AS = Memop->getAddrSpace();
1085  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1086  return true;
1087  }
1088 
1089  return false;
1090 }
1091 
1092 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1093  WaitcntBrackets *ScoreBrackets) {
1094  // Now look at the instruction opcode. If it is a memory access
1095  // instruction, update the upper-bound of the appropriate counter's
1096  // bracket and the destination operand scores.
1097  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1098  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1099  if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1100  TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1101  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1102  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1103  } else {
1104  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1105  }
1106  } else if (TII->isFLAT(Inst)) {
1107  assert(Inst.mayLoad() || Inst.mayStore());
1108 
1109  if (TII->usesVM_CNT(Inst))
1110  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1111 
1112  if (TII->usesLGKM_CNT(Inst)) {
1113  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1114 
1115  // This is a flat memory operation, so note it - it will require
1116  // that both the VM and LGKM be flushed to zero if it is pending when
1117  // a VM or LGKM dependency occurs.
1118  if (mayAccessLDSThroughFlat(Inst))
1119  ScoreBrackets->setPendingFlat();
1120  }
1121  } else if (SIInstrInfo::isVMEM(Inst) &&
1122  // TODO: get a better carve out.
1123  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1124  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1125  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
1126  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1127  if (ST->vmemWriteNeedsExpWaitcnt() &&
1128  (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
1129  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1130  }
1131  } else if (TII->isSMRD(Inst)) {
1132  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1133  } else {
1134  switch (Inst.getOpcode()) {
1135  case AMDGPU::S_SENDMSG:
1136  case AMDGPU::S_SENDMSGHALT:
1137  ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1138  break;
1139  case AMDGPU::EXP:
1140  case AMDGPU::EXP_DONE: {
1141  int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1142  if (Imm >= 32 && Imm <= 63)
1143  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1144  else if (Imm >= 12 && Imm <= 15)
1145  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1146  else
1147  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1148  break;
1149  }
1150  case AMDGPU::S_MEMTIME:
1151  case AMDGPU::S_MEMREALTIME:
1152  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1153  break;
1154  default:
1155  break;
1156  }
1157  }
1158 }
1159 
1160 bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
1161  uint32_t OtherScore) {
1162  uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1163  uint32_t OtherShifted =
1164  OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1165  Score = std::max(MyShifted, OtherShifted);
1166  return OtherShifted > MyShifted;
1167 }
1168 
1169 /// Merge the pending events and associated score brackets of \p Other into
1170 /// this bracket's status.
1171 ///
1172 /// Returns whether the merge resulted in a change that requires tighter waits
1173 /// (i.e. the merged brackets strictly dominate the original brackets).
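///
/// Worked example (added for illustration, not in the original source):
/// suppose for one counter this block has LB/UB == 2/5 (three pending ops)
/// and \p Other has LB/UB == 1/5 (four pending ops). Then MyShift == 1 and
/// OtherShift == 1, our scores are shifted up by one before taking the
/// per-register maximum, and the merged bracket becomes
/// LB == min(2+1, 1+1) == 2 and UB == 5+1 == 6.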
1174 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1175  bool StrictDom = false;
1176 
1177  for (auto T : inst_counter_types()) {
1178  // Merge event flags for this counter
1179  const bool OldOutOfOrder = counterOutOfOrder(T);
1180  const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
1181  const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1182  if (OtherEvents & ~OldEvents)
1183  StrictDom = true;
1184  if (Other.MixedPendingEvents[T] ||
1185  (OldEvents && OtherEvents && OldEvents != OtherEvents))
1186  MixedPendingEvents[T] = true;
1187  PendingEvents |= OtherEvents;
1188 
1189  // Merge scores for this counter
1190  const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
1191  const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1192  MergeInfo M;
1193  M.OldLB = ScoreLBs[T];
1194  M.OtherLB = Other.ScoreLBs[T];
1195  M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
1196  M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;
1197 
1198  const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
1199  if (NewUB < ScoreUBs[T])
1200  report_fatal_error("waitcnt score overflow");
1201  ScoreUBs[T] = NewUB;
1202  ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
1203 
1204  StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1205 
1206  bool RegStrictDom = false;
1207  for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
1208  J++) {
1209  RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1210  }
1211 
1212  if (T == LGKM_CNT) {
1213  for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
1214  J != E; J++) {
1215  RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1216  }
1217  }
1218 
1219  if (RegStrictDom && !OldOutOfOrder)
1220  StrictDom = true;
1221  }
1222 
1223  VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR());
1224  SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR());
1225 
1226  return StrictDom;
1227 }
1228 
1229 // Generate s_waitcnt instructions where needed.
1230 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1231  MachineBasicBlock &Block,
1232  WaitcntBrackets &ScoreBrackets) {
1233  bool Modified = false;
1234 
1235  LLVM_DEBUG({
1236  dbgs() << "*** Block" << Block.getNumber() << " ***";
1237  ScoreBrackets.dump();
1238  });
1239 
1240  // Walk over the instructions.
1241  MachineInstr *OldWaitcntInstr = nullptr;
1242 
1243  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1244  Iter != E;) {
1245  MachineInstr &Inst = *Iter;
1246 
1247  // Remove any previously existing waitcnts.
1248  if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
1249  if (OldWaitcntInstr) {
1250  if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
1251  TrackedWaitcntSet.erase(OldWaitcntInstr);
1252  OldWaitcntInstr->eraseFromParent();
1253  OldWaitcntInstr = nullptr;
1254  } else if (!TrackedWaitcntSet.count(&Inst)) {
1255  // Two successive s_waitcnt's, both of which are pre-existing and
1256  // are therefore preserved.
1257  int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
1258  ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1259  } else {
1260  ++Iter;
1261  Inst.eraseFromParent();
1262  Modified = true;
1263  continue;
1264  }
1265  }
1266 
1267  OldWaitcntInstr = &Inst;
1268  ++Iter;
1269  continue;
1270  }
1271 
1272  bool VCCZBugWorkAround = false;
1273  if (readsVCCZ(Inst) &&
1274  (!VCCZBugHandledSet.count(&Inst))) {
1275  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1276  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1277  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1278  if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
1279  VCCZBugWorkAround = true;
1280  }
1281  }
1282 
1283  // Generate an s_waitcnt instruction to be placed before
1284  // cur_Inst, if needed.
1285  Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1286  OldWaitcntInstr = nullptr;
1287 
1288  updateEventWaitcntAfter(Inst, &ScoreBrackets);
1289 
1290 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1291  // If this instruction generates a S_SETVSKIP because it is an
1292  // indexed resource, and we are on Tahiti, then it will also force
1293  // an S_WAITCNT vmcnt(0)
1294  if (RequireCheckResourceType(Inst, context)) {
1295  // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1296  ScoreBrackets->setScoreLB(VM_CNT,
1297  ScoreBrackets->getScoreUB(VM_CNT));
1298  }
1299 #endif
1300 
1301  LLVM_DEBUG({
1302  Inst.print(dbgs());
1303  ScoreBrackets.dump();
1304  });
1305 
1306  // Check to see if this is a GWS instruction. If so, and if this is CI or
1307  // VI, then the generated code sequence will include an S_WAITCNT 0.
1308  // TODO: Are these the only GWS instructions?
1309  if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1310  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1311  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1312  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1313  Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1314  // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1315  ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZero());
1316  }
1317 
1318  // TODO: Remove this work-around after fixing the scheduler and enable the
1319  // assert above.
1320  if (VCCZBugWorkAround) {
1321  // Restore the vccz bit. Any time a value is written to vcc, the vcc
1322  // bit is updated, so we can restore the bit by reading the value of
1323  // vcc and then writing it back to the register.
1324  BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1325  AMDGPU::VCC)
1326  .addReg(AMDGPU::VCC);
1327  VCCZBugHandledSet.insert(&Inst);
1328  Modified = true;
1329  }
1330 
1331  ++Iter;
1332  }
1333 
1334  return Modified;
1335 }
1336 
1337 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1338  ST = &MF.getSubtarget<GCNSubtarget>();
1339  TII = ST->getInstrInfo();
1340  TRI = &TII->getRegisterInfo();
1341  MRI = &MF.getRegInfo();
1342  IV = AMDGPU::getIsaVersion(ST->getCPU());
1343  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1344 
1345  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1346  for (auto T : inst_counter_types())
1347  ForceEmitWaitcnt[T] = false;
1348 
1349  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1350  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1351  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1352 
1353  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1354  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1355  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1356  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1357 
1358  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1359  RegisterEncoding.VGPRL =
1360  RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1361  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1362  RegisterEncoding.SGPRL =
1363  RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1364 
1365  TrackedWaitcntSet.clear();
1366  VCCZBugHandledSet.clear();
1367  RpotIdxMap.clear();
1368  BlockInfos.clear();
1369 
1370  // Keep iterating over the blocks in reverse post order, inserting and
1371  // updating s_waitcnt where needed, until a fix point is reached.
1372  for (MachineBasicBlock *MBB :
1373  ReversePostOrderTraversal<MachineFunction *>(&MF)) {
1374  RpotIdxMap[MBB] = BlockInfos.size();
1375  BlockInfos.emplace_back(MBB);
1376  }
1377 
1378  std::unique_ptr<WaitcntBrackets> Brackets;
1379  bool Modified = false;
1380  bool Repeat;
1381  do {
1382  Repeat = false;
1383 
1384  for (BlockInfo &BI : BlockInfos) {
1385  if (!BI.Dirty)
1386  continue;
1387 
1388  unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
1389 
1390  if (BI.Incoming) {
1391  if (!Brackets)
1392  Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
1393  else
1394  *Brackets = *BI.Incoming;
1395  } else {
1396  if (!Brackets)
1397  Brackets = llvm::make_unique<WaitcntBrackets>(ST);
1398  else
1399  Brackets->clear();
1400  }
1401 
1402  Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1403  BI.Dirty = false;
1404 
1405  if (Brackets->hasPending()) {
1406  BlockInfo *MoveBracketsToSucc = nullptr;
1407  for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1408  unsigned SuccIdx = RpotIdxMap[Succ];
1409  BlockInfo &SuccBI = BlockInfos[SuccIdx];
1410  if (!SuccBI.Incoming) {
1411  SuccBI.Dirty = true;
1412  if (SuccIdx <= Idx)
1413  Repeat = true;
1414  if (!MoveBracketsToSucc) {
1415  MoveBracketsToSucc = &SuccBI;
1416  } else {
1417  SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
1418  }
1419  } else if (SuccBI.Incoming->merge(*Brackets)) {
1420  SuccBI.Dirty = true;
1421  if (SuccIdx <= Idx)
1422  Repeat = true;
1423  }
1424  }
1425  if (MoveBracketsToSucc)
1426  MoveBracketsToSucc->Incoming = std::move(Brackets);
1427  }
1428  }
1429  } while (Repeat);
1430 
1430 
1431  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1432 
1433  bool HaveScalarStores = false;
1434 
1435  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1436  ++BI) {
1437  MachineBasicBlock &MBB = *BI;
1438 
1439  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1440  ++I) {
1441  if (!HaveScalarStores && TII->isScalarStore(*I))
1442  HaveScalarStores = true;
1443 
1444  if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1445  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1446  EndPgmBlocks.push_back(&MBB);
1447  }
1448  }
1449 
1450  if (HaveScalarStores) {
1451  // If scalar writes are used, the cache must be flushed or else the next
1452  // wave to reuse the same scratch memory can be clobbered.
1453  //
1454  // Insert s_dcache_wb at wave termination points if there were any scalar
1455  // stores, and only if the cache hasn't already been flushed. This could be
1456  // improved by looking across blocks for flushes in postdominating blocks
1457  // from the stores but an explicitly requested flush is probably very rare.
1458  for (MachineBasicBlock *MBB : EndPgmBlocks) {
1459  bool SeenDCacheWB = false;
1460 
1461  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1462  ++I) {
1463  if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1464  SeenDCacheWB = true;
1465  else if (TII->isScalarStore(*I))
1466  SeenDCacheWB = false;
1467 
1468  // FIXME: It would be better to insert this before a waitcnt if any.
1469  if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1470  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1471  !SeenDCacheWB) {
1472  Modified = true;
1473  BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1474  }
1475  }
1476  }
1477  }
1478 
1479  if (!MFI->isEntryFunction()) {
1480  // Wait for any outstanding memory operations that the input registers may
1481  // depend on. We can't track them, and it's better to do the wait after the
1482  // costly call sequence.
1483 
1484  // TODO: Could insert earlier and schedule more liberally with operations
1485  // that only use caller preserved registers.
1486  MachineBasicBlock &EntryBB = MF.front();
1487  BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1488  .addImm(0);
1489 
1490  Modified = true;
1491  }
1492 
1493  return Modified;
1494 }