//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <map>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact a lot of instructions in the
// shader have few dependencies) make the generic scheduler behave
// unpredictably. For example when register pressure becomes high, it can
// either manage to prevent register pressure from going too high, or it can
// increase register pressure even more than if it hadn't taken register
// pressure into account.
//
// Also some other bad behaviours are generated, like loading at the
// beginning of the shader a constant in VGPR you won't need until the end
// of the shader.
//
// The scheduling problem for SI can be split into three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from
// different wavefronts if of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps keep register usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. That will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At that step we take into account only register usage and hiding
// low latency instructions.
//
// Third the block order is chosen; there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency for the cache (or global mem) data to reach the
// register, and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why, even if the wavefront count is
// high, we have to try to have as many instructions hiding high latencies
// as possible.
// The OpenCL doc gives, for example, a latency of 400 cycles for a global
// memory access, which is hidden by 10 instructions if the wavefront count
// is 10.
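// A quick sanity check of those figures (illustrative arithmetic, not taken
// from the docs): at one instruction issued every 4 cycles per wavefront,
// 400 cycles correspond to ~100 instruction slots; split across 10 active
// wavefronts, that is ~10 instructions per wavefront, matching the estimate
// above.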

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64-byte
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work
// items, instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other
// VALUs of the CU do texture sampling too. (Don't take these figures too
// seriously, as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Units".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when
// scheduling, as we expect the driver to be able to preload the constants
// soon.

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:      return "NOCAND";
  case RegUsage:    return "REGUSAGE";
  case Latency:     return "LATENCY";
  case Successor:   return "SUCCESSOR";
  case Depth:       return "DEPTH";
  case NodeOrder:   return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

namespace llvm {
namespace SISched {
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
} // end namespace SISched
} // end namespace llvm
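
// Illustrative sketch (not part of the original file): tryLess/tryGreater
// are meant to be chained, one call per criterion, from highest to lowest
// priority. A return value of true means the criterion decided the
// comparison (TryCand.Reason is set when TryCand wins; Cand.Reason is
// weakened when it loses), so the caller stops there; false means a tie
// and the next criterion is consulted. The function below is hypothetical.
//
//   static void compareTwoCriteria(SISchedulerCandidate &Cand,
//                                  SISchedulerCandidate &TryCand,
//                                  int TryUsage, int CandUsage,
//                                  int TryHeight, int CandHeight) {
//     if (SISched::tryLess(TryUsage, CandUsage, TryCand, Cand, RegUsage))
//       return;   // Register usage decided the comparison.
//     if (SISched::tryGreater(TryHeight, CandHeight, TryCand, Cand, Depth))
//       return;   // Height decided the comparison.
//     // Full tie: the caller falls back to the original node order.
//   }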

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as early as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  // - (eventually some more low latency instructions)
  // - instructions that depend on the first low latency instructions.
  // If the block contains a lot of constant loads, the SGPR usage can go
  // quite high; the arbitrary limit of 60 above encourages using the
  // already loaded constants (in order to release some SGPRs) before
  // loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}


// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns whether the register was defined between First and Last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the
  // SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
  // We want LiveOutRegs to contain only Regs whose content will be read
  // after in another block, and whose content was written in the current
  // block, that is we want it to get 1, 3, 5, 7.
  // Since we made the MIs of a block to be packed all together before
  // scheduling, then the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for RPTracker to avoid mistakes for case 4)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
  // The use of isDefBetween removes case 4.
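  // Concrete illustration (hypothetical vreg): if %v is defined before the
  // block and only read inside it (case 4), it must not end up in
  // LiveOutRegs; if %v is read, then rewritten inside the block and read by
  // a later block (case 5 or 7), it must. Both can look identical in
  // RPTracker's LiveOutRegs once the coalescer has reused the vreg, and
  // only the isDefBetween check (was %v written between BeginBlock and
  // EndBlock?) separates them.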
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128-bit registers
  // for example, but they actually correspond to 4 actual 32-bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block
/// (InOrOutBlock = true) or outside of it (InOrOutBlock = false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert (!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}
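
// Illustrative timeline (hypothetical SUs, not from the original file): if
// SU(5) is a low-latency load and SU(7) consumes its result, scheduling
// SU(5) marks SU(7) as having a low-latency parent that has not been waited
// for yet. If SU(7) is scheduled anyway, the hardware wait it implies
// covers every other pending low-latency result too, which is why all the
// HasLowLatencyNonWaitedParent flags are cleared at once above.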

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already a predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already a successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.push_back(std::make_pair(Succ, Kind));

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  if (!Scheduled) {
    for (const SUnit* SU : SUnits)
      DAG->dumpNode(*SU);
  } else {
    // Dump in the chosen schedule order once scheduled.
    for (const SUnit* SU : ScheduledSUnits)
      DAG->dumpNode(*SU);
  }

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
DAG(DAG) {
}

SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  } else {
    return B->second;
  }
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because there the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        else if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        else {
          // Check the type of dependency
          for (unsigned k : SubGraph) {
            // If in the path to join the two instructions
            // there is another high latency instruction,
            // or instructions colored for another block,
            // abort the merge.
            if (DAG->IsHighLatencySU[k] ||
                (CurrentColoring[k] != ProposedColor &&
                 CurrentColoring[k] != 0)) {
              CompatibleGroup = false;
              break;
            }
            // If one of the SUs in the subgraph depends on the result of
            // SU j, there'll be a data dependency.
            if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
              CompatibleGroup = false;
              break;
            }
          }
          if (!CompatibleGroup)
            break;
          // Same check for the SU
          if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
          // Add all the required instructions to the block
          // (these cannot live in another block, because they
          // depend (order dependency) on one of the
          // instructions in the block, and are required for the
          // high latency instruction we add).
          AdditionalElements.insert(AdditionalElements.end(),
                                    SubGraph.begin(), SubGraph.end());
        }
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}
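
// Worked example (illustrative, not from the original file): with 10
// high-latency instructions in the region, GroupSize is 3, so the pass
// tries to build blocks of 3 mutually data-independent high-latency
// instructions, pulling in at most 5 order-dependency SUs per checked
// pair, and starts a fresh color whenever a candidate is incompatible or
// the current group is full.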

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as children.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down
  // and bottom up reserved node dependency colorings
  // gets its own color.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU->NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU->NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}
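
// Illustrative example (hypothetical color values): an SU whose top-down
// reserved color is 2 and whose bottom-up reserved color is 7 maps the pair
// (2, 7) to one fresh block color; every SU sharing exactly that pair joins
// the same block, while an SU with (2, 8) starts a new one.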

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register as used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // Check the EXP can be added to the group safely,
      // i.e. without needing any other instruction.
      // The EXP is allowed to depend on other EXPs
      // (they will be in the same group).
      for (unsigned j : ExpGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other

        // SubGraph contains all the instructions required
        // between EXP SUnits[j] and EXP SU.
        for (unsigned k : SubGraph) {
          if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
            // Instructions other than EXP would be required in the group.
            // Abort the grouping.
            return;
        }
      }

      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links that a previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of the same color into the same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free roots and leaves of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
             for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
               SIScheduleBlock *Block = CurrentBlocks[i];
               Block->printDebug(true);
             });
}

// Two functions taken from Codegen/MachineScheduler.cpp

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}
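
// Note on the construction above (added explanation): TopDownBlock2Index is
// first used as a successor counter; blocks whose counter reaches zero are
// popped and given the highest remaining index, so every block ends up
// after all of its predecessors in TopDownIndex2Block, and reversing that
// array yields the bottom-up order.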

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We first produce a valid scheduling, such that a Block corresponds
  // to a contiguous range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->printDebug(true);
  });
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks  BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }
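
  // With the x/x' example above: B and C both list the coalesced vreg as an
  // input, but only the topologically latest producer (B) gets its usage
  // count bumped for C's read, so the register is kept alive exactly until
  // its last consumer block has been scheduled.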

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal.
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
                                            : BlocksScheduled) {
    dbgs() << ' ' << Block->getID();
  } dbgs() << '\n';);
}

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled,
                       TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}
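
// Illustrative decision (hypothetical blocks, not from the original file):
// between block A, whose last high-latency parent was scheduled long ago
// (small LastPosHighLatParentScheduled, its latency is mostly absorbed),
// and block B, whose high-latency parent was just issued, the first
// criterion above picks A: scheduling B now would make it wait right
// behind its parent's latency.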

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
    return true;
  return false;
}

1616 SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
1617  SIBlockSchedCandidate Cand;
1618  std::vector<SIScheduleBlock*>::iterator Best;
1619  SIScheduleBlock *Block;
1620  if (ReadyBlocks.empty())
1621  return nullptr;
1622 
1623  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
1624  VregCurrentUsage, SregCurrentUsage);
1625  if (VregCurrentUsage > maxVregUsage)
1626  maxVregUsage = VregCurrentUsage;
1627  if (SregCurrentUsage > maxSregUsage)
1628  maxSregUsage = SregCurrentUsage;
1629  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
1630  for (SIScheduleBlock *Block
1631  : ReadyBlocks) dbgs()
1632  << Block->getID() << ' ';
1633  dbgs() << "\nCurrent Live:\n";
1634  for (unsigned Reg
1635  : LiveRegs) dbgs()
1636  << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
1637  dbgs() << '\n';
1638  dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
1639  dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
1640 
1641  Cand.Block = nullptr;
1642  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
1643  E = ReadyBlocks.end(); I != E; ++I) {
1644  SIBlockSchedCandidate TryCand;
1645  TryCand.Block = *I;
1646  TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
1647  TryCand.VGPRUsageDiff =
1648  checkRegUsageImpact(TryCand.Block->getInRegs(),
1649  TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
1650  TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
1651  TryCand.NumHighLatencySuccessors =
1652  TryCand.Block->getNumHighLatencySuccessors();
1653  TryCand.LastPosHighLatParentScheduled =
1654  (unsigned int) std::max<int> (0,
1655  LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
1656  LastPosWaitedHighLatency);
1657  TryCand.Height = TryCand.Block->Height;
1658  // Try not to increase VGPR usage too much, else we may spill.
1659  if (VregCurrentUsage > 120 ||
1660  Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
1661  if (!tryCandidateRegUsage(Cand, TryCand) &&
1662  Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
1663  tryCandidateLatency(Cand, TryCand);
1664  } else {
1665  if (!tryCandidateLatency(Cand, TryCand))
1666  tryCandidateRegUsage(Cand, TryCand);
1667  }
1668  if (TryCand.Reason != NoCand) {
1669  Cand.setBest(TryCand);
1670  Best = I;
1671  LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
1672  << getReasonStr(Cand.Reason) << '\n');
1673  }
1674  }
1675 
1676  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
1677  dbgs() << "Is a block with high latency instruction: "
1678  << (Cand.IsHighLatency ? "yes\n" : "no\n");
1679  dbgs() << "Position of last high latency dependency: "
1680  << Cand.LastPosHighLatParentScheduled << '\n';
1681  dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
1682  dbgs() << '\n';);
1683 
1684  Block = Cand.Block;
1685  ReadyBlocks.erase(Best);
1686  return Block;
1687 }
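// [Editorial note, not part of the original source] pickBlock() flips its
// policy around the hard-coded threshold of 120 live VGPRs (or whenever the
// variant is not BlockLatencyRegUsage): below it, latency is tried first and
// register usage only breaks ties; above it, the order is reversed so the
// scheduler stops adding VGPR pressure that could end in spilling. A
// hypothetical caller would drain it like this:
//
//   while (SIScheduleBlock *B = pickBlock()) {
//     // record B's units in the final order, then update liveness:
//     blockScheduled(B);
//   }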
1688 
1689 // Tracking of currently live registers to determine VGPR usage.
1690 
1691 void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
1692  for (unsigned Reg : Regs) {
1693  // For now only track virtual registers.
1694  if (!TargetRegisterInfo::isVirtualRegister(Reg))
1695  continue;
1696  // If not already in the live set, then add it.
1697  (void) LiveRegs.insert(Reg);
1698  }
1699 }
1700 
1701 void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
1702  std::set<unsigned> &Regs) {
1703  for (unsigned Reg : Regs) {
1704  // For now only track virtual registers.
1705  std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
1706  assert(Pos != LiveRegs.end() && // Reg must be live.
1707  LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
1708  LiveRegsConsumers[Reg] >= 1);
1709  --LiveRegsConsumers[Reg];
1710  if (LiveRegsConsumers[Reg] == 0)
1711  LiveRegs.erase(Pos);
1712  }
1713 }
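// [Editorial example, not part of the original source] LiveRegsConsumers is
// a per-register consumer count. If a register is read by three blocks, the
// producing block records a count of 3; each consuming block scheduled
// through decreaseLiveRegs() decrements it, and only the third consumer
// erases the register from LiveRegs, releasing its pressure contribution.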
1714 
1715 void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
1716  for (const auto &Block : Parent->getSuccs()) {
1717  if (--BlockNumPredsLeft[Block.first->getID()] == 0)
1718  ReadyBlocks.push_back(Block.first);
1719 
1720  if (Parent->isHighLatencyBlock() &&
1721  Block.second == SIScheduleBlockLinkKind::Data)
1722  LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
1723  }
1724 }
1725 
1726 void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
1727  decreaseLiveRegs(Block, Block->getInRegs());
1728  addLiveRegs(Block->getOutRegs());
1729  releaseBlockSuccs(Block);
1730  for (std::map<unsigned, unsigned>::iterator RegI =
1731  LiveOutRegsNumUsages[Block->getID()].begin(),
1732  E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
1733  std::pair<unsigned, unsigned> RegP = *RegI;
1734  // We produce this register, thus it must not have been alive before.
1735  assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
1736  LiveRegsConsumers[RegP.first] == 0);
1737  LiveRegsConsumers[RegP.first] += RegP.second;
1738  }
1739  if (LastPosHighLatencyParentScheduled[Block->getID()] >
1740  (unsigned)LastPosWaitedHighLatency)
1741  LastPosWaitedHighLatency =
1742  LastPosHighLatencyParentScheduled[Block->getID()];
1743  ++NumBlockScheduled;
1744 }
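// [Editorial note, not part of the original source] The
// LastPosWaitedHighLatency update models how the hardware waits: once a
// block consumes a high-latency result produced at position P, every
// high-latency result issued before P has implicitly been waited on as well,
// so later candidates are only penalized for parents scheduled after P.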
1745 
1746 std::vector<int>
1747 SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
1748  std::set<unsigned> &OutRegs) {
1749  std::vector<int> DiffSetPressure;
1750  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
1751 
1752  for (unsigned Reg : InRegs) {
1753  // For now only track virtual registers.
1754  if (!TargetRegisterInfo::isVirtualRegister(Reg))
1755  continue;
1756  if (LiveRegsConsumers[Reg] > 1)
1757  continue;
1758  PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1759  for (; PSetI.isValid(); ++PSetI) {
1760  DiffSetPressure[*PSetI] -= PSetI.getWeight();
1761  }
1762  }
1763 
1764  for (unsigned Reg : OutRegs) {
1765  // For now only track virtual registers.
1766  if (!TargetRegisterInfo::isVirtualRegister(Reg))
1767  continue;
1768  PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1769  for (; PSetI.isValid(); ++PSetI) {
1770  DiffSetPressure[*PSetI] += PSetI.getWeight();
1771  }
1772  }
1773 
1774  return DiffSetPressure;
1775 }
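// [Editorial example, not part of the original source] Assuming a VGPR
// pressure-set weight of 1 per 32-bit register: a block whose input {a} has
// no other pending consumer and whose outputs are {b, c} yields
// DiffSetPressure[VGPRSetID] = -1 + 2 = +1, i.e. picking it next raises
// VGPR pressure by one unit. An input still needed elsewhere (consumer
// count > 1) contributes no decrease, which is the early 'continue' above.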
1776 
1777 // SIScheduler //
1778 
1779 struct SIScheduleBlockResult
1780 SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
1781  SISchedulerBlockSchedulerVariant ScheduleVariant) {
1782  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
1783  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
1784  std::vector<SIScheduleBlock*> ScheduledBlocks;
1785  struct SIScheduleBlockResult Res;
1786 
1787  ScheduledBlocks = Scheduler.getBlocks();
1788 
1789  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
1790  SIScheduleBlock *Block = ScheduledBlocks[b];
1791  std::vector<SUnit*> SUs = Block->getScheduledUnits();
1792 
1793  for (SUnit* SU : SUs)
1794  Res.SUs.push_back(SU->NodeNum);
1795  }
1796 
1797  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
1798  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
1799  return Res;
1800 }
1801 
1802 // SIScheduleDAGMI //
1803 
1804 SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
1805  ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
1806  SITII = static_cast<const SIInstrInfo*>(TII);
1807  SITRI = static_cast<const SIRegisterInfo*>(TRI);
1808 
1809  VGPRSetID = SITRI->getVGPRPressureSet();
1810  SGPRSetID = SITRI->getSGPRPressureSet();
1811 }
1812 
1813 SIScheduleDAGMI::~SIScheduleDAGMI() = default;
1814 
1815 // Code adapted from scheduleDAG.cpp.
1816 // Does a topological sort over the SUs.
1817 // Computes both the TopDown and BottomUp orderings.
1818 void SIScheduleDAGMI::topologicalSort() {
1819  Topo.InitDAGTopologicalSorting();
1820 
1821  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
1822  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
1823 }
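// [Editorial note, not part of the original source] A single
// InitDAGTopologicalSorting() serves both directions: TopDownIndex2SU is the
// topological order itself and BottomUpIndex2SU is simply its reverse, so no
// second sort is needed for bottom-up queries.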
1824 
1825 // Move low-latency instructions further from their users without
1826 // increasing SGPR usage (in general).
1827 // This is to be replaced by a better pass that would take into
1828 // account SGPR usage (based on VGPR usage and the corresponding
1829 // wavefront count), and that would try to merge groups of loads
1830 // when it makes sense, etc.
1831 void SIScheduleDAGMI::moveLowLatencies() {
1832  unsigned DAGSize = SUnits.size();
1833  int LastLowLatencyUser = -1;
1834  int LastLowLatencyPos = -1;
1835 
1836  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
1837  SUnit *SU = &SUnits[ScheduledSUnits[i]];
1838  bool IsLowLatencyUser = false;
1839  unsigned MinPos = 0;
1840 
1841  for (SDep& PredDep : SU->Preds) {
1842  SUnit *Pred = PredDep.getSUnit();
1843  if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
1844  IsLowLatencyUser = true;
1845  }
1846  if (Pred->NodeNum >= DAGSize)
1847  continue;
1848  unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
1849  if (PredPos >= MinPos)
1850  MinPos = PredPos + 1;
1851  }
1852 
1853  if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1854  unsigned BestPos = LastLowLatencyUser + 1;
1855  if ((int)BestPos <= LastLowLatencyPos)
1856  BestPos = LastLowLatencyPos + 1;
1857  if (BestPos < MinPos)
1858  BestPos = MinPos;
1859  if (BestPos < i) {
1860  for (unsigned u = i; u > BestPos; --u) {
1861  ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1862  ScheduledSUnits[u] = ScheduledSUnits[u-1];
1863  }
1864  ScheduledSUnits[BestPos] = SU->NodeNum;
1865  ScheduledSUnitsInv[SU->NodeNum] = BestPos;
1866  }
1867  LastLowLatencyPos = BestPos;
1868  if (IsLowLatencyUser)
1869  LastLowLatencyUser = BestPos;
1870  } else if (IsLowLatencyUser) {
1871  LastLowLatencyUser = i;
1872  // Moves COPY instructions on which depends
1873  // the low latency instructions too.
1874  } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
1875  bool CopyForLowLat = false;
1876  for (SDep& SuccDep : SU->Succs) {
1877  SUnit *Succ = SuccDep.getSUnit();
1878  if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
1879  CopyForLowLat = true;
1880  }
1881  }
1882  if (!CopyForLowLat)
1883  continue;
1884  if (MinPos < i) {
1885  for (unsigned u = i; u > MinPos; --u) {
1886  ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1887  ScheduledSUnits[u] = ScheduledSUnits[u-1];
1888  }
1889  ScheduledSUnits[MinPos] = SU->NodeNum;
1890  ScheduledSUnitsInv[SU->NodeNum] = MinPos;
1891  }
1892  }
1893  }
1894 }
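// [Editorial example, not part of the original source] The inner loops above
// perform a rotation, not a swap. With ScheduledSUnits = [A, B, C, L] and a
// low-latency L whose BestPos is 1, the walk from i down to BestPos shifts B
// and C one slot right and produces [A, L, B, C]; ScheduledSUnitsInv is
// incremented for each shifted SU so the two arrays remain inverse
// permutations of each other.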
1895 
1896 void SIScheduleDAGMI::restoreSULinksLeft() {
1897  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1898  SUnits[i].isScheduled = false;
1899  SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
1900  SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
1901  SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
1902  SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
1903  }
1904 }
1905 
1906 // Return the Vgpr and Sgpr usage corresponding to some virtual registers.
1907 template<typename _Iterator> void
1908 SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
1909  unsigned &VgprUsage, unsigned &SgprUsage) {
1910  VgprUsage = 0;
1911  SgprUsage = 0;
1912  for (_Iterator RegI = First; RegI != End; ++RegI) {
1913  unsigned Reg = *RegI;
1914  // For now only track virtual registers.
1915  if (!TargetRegisterInfo::isVirtualRegister(Reg))
1916  continue;
1917  PSetIterator PSetI = MRI.getPressureSets(Reg);
1918  for (; PSetI.isValid(); ++PSetI) {
1919  if (*PSetI == VGPRSetID)
1920  VgprUsage += PSetI.getWeight();
1921  else if (*PSetI == SGPRSetID)
1922  SgprUsage += PSetI.getWeight();
1923  }
1924  }
1925 }
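// [Editorial example, not part of the original source] A hypothetical call,
// mirroring the use in pickBlock() above; weights come from each register's
// pressure sets, so wider register tuples contribute more than 1:
//
//   unsigned VGPRs, SGPRs;
//   DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(), VGPRs, SGPRs);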
1926 
1927 void SIScheduleDAGMI::schedule()
1928 {
1929  SmallVector<SUnit*, 8> TopRoots, BotRoots;
1930  SIScheduleBlockResult Best, Temp;
1931  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
1932 
1933  buildDAGWithRegPressure();
1934  LLVM_DEBUG(dump());
1935 
1936  topologicalSort();
1937  findRootsAndBiasEdges(TopRoots, BotRoots);
1938  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
1939  // functions, but to make them happy we must initialize
1940  // the default Scheduler implementation (even if we do not
1941  // run it)
1942  SchedImpl->initialize(this);
1943  initQueues(TopRoots, BotRoots);
1944 
1945  // Fill some stats to help scheduling.
1946 
1947  SUnitsLinksBackup = SUnits;
1948  IsLowLatencySU.clear();
1949  LowLatencyOffset.clear();
1950  IsHighLatencySU.clear();
1951 
1952  IsLowLatencySU.resize(SUnits.size(), 0);
1953  LowLatencyOffset.resize(SUnits.size(), 0);
1954  IsHighLatencySU.resize(SUnits.size(), 0);
1955 
1956  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
1957  SUnit *SU = &SUnits[i];
1958  MachineOperand *BaseLatOp;
1959  int64_t OffLatReg;
1960  if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1961  IsLowLatencySU[i] = 1;
1962  if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
1963  TRI))
1964  LowLatencyOffset[i] = OffLatReg;
1965  } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
1966  IsHighLatencySU[i] = 1;
1967  }
1968 
1969  SIScheduler Scheduler(this);
1970  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
1971  SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
1972 
1973  // If VGPR usage is extremely high, try other well-performing variants
1974  // that could lead to lower VGPR usage.
1975  if (Best.MaxVGPRUsage > 180) {
1976  static const std::pair<SISchedulerBlockCreatorVariant,
1977  SISchedulerBlockSchedulerVariant>
1978  Variants[] = {
1979  { LatenciesAlone, BlockRegUsageLatency },
1980 // { LatenciesAlone, BlockRegUsage },
1981  { LatenciesGrouped, BlockLatencyRegUsage },
1982 // { LatenciesGrouped, BlockRegUsageLatency },
1983 // { LatenciesGrouped, BlockRegUsage },
1984  { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
1985 // { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
1986 // { LatenciesAlonePlusConsecutive, BlockRegUsage }
1987  };
1988  for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
1989  Temp = Scheduler.scheduleVariant(v.first, v.second);
1990  if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
1991  Best = Temp;
1992  }
1993  }
1994  // If VGPR usage is still extremely high, we may spill. Try other variants
1995  // that perform worse, but could lead to lower VGPR usage.
1996  if (Best.MaxVGPRUsage > 200) {
1997  static const std::pair<SISchedulerBlockCreatorVariant,
1998  SISchedulerBlockSchedulerVariant>
1999  Variants[] = {
2000 // { LatenciesAlone, BlockRegUsageLatency },
2001  { LatenciesAlone, BlockRegUsage },
2002 // { LatenciesGrouped, BlockLatencyRegUsage },
2003  { LatenciesGrouped, BlockRegUsageLatency },
2004  { LatenciesGrouped, BlockRegUsage },
2005 // { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
2006  { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
2007  { LatenciesAlonePlusConsecutive, BlockRegUsage }
2008  };
2009  for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
2010  Temp = Scheduler.scheduleVariant(v.first, v.second);
2011  if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
2012  Best = Temp;
2013  }
2014  }
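// [Editorial note, not part of the original source] The two retry tiers form
// a simple anytime strategy: keep the best latency-oriented schedule unless
// its VGPR peak crosses 180, then 200. The thresholds are heuristic guard
// rails under the 256-VGPR addressing limit of a GCN wavefront, where
// spilling would otherwise become likely.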
2015 
2016  ScheduledSUnits = Best.SUs;
2017  ScheduledSUnitsInv.resize(SUnits.size());
2018 
2019  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
2020  ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
2021  }
2022 
2023  moveLowLatencies();
2024 
2025  // Tell the outside world about the result of the scheduling.
2026 
2027  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
2028  TopRPTracker.setPos(CurrentTop);
2029 
2030  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
2031  E = ScheduledSUnits.end(); I != E; ++I) {
2032  SUnit *SU = &SUnits[*I];
2033 
2034  scheduleMI(SU, true);
2035 
2036  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
2037  << *SU->getInstr());
2038  }
2039 
2040  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
2041 
2042  placeDebugValues();
2043 
2044  LLVM_DEBUG({
2045  dbgs() << "*** Final schedule for "
2046  << printMBBReference(*begin()->getParent()) << " ***\n";
2047  dumpSchedule();
2048  dbgs() << '\n';
2049  });
2050 }