LLVM 8.0.1
SILowerI1Copies.cpp
//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
// and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//
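
// Illustrative sketch (hypothetical register names): a value that is
// semantically i1 and crosses blocks, e.g.
//
//   %b:vreg_1 = COPY %cmp              ; def in one block
//   %phi:vreg_1 = PHI %b, %bb.0, ...   ; use in another
//
// becomes a 64-bit SGPR lane mask, and merging across divergent control flow
// becomes explicit EXEC-masked bit operations along the lines of
//
//   %prev_masked = S_ANDN2_B64 %prev, $exec
//   %cur_masked  = S_AND_B64   %cur,  $exec
//   %dst         = S_OR_B64    %prev_masked, %cur_masked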

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static unsigned createLaneMaskReg(MachineFunction &MF);
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);

namespace {

class SILowerI1Copies : public MachineFunctionPass {
public:
  static char ID;

private:
  MachineFunction *MF = nullptr;
  MachineDominatorTree *DT = nullptr;
  MachinePostDominatorTree *PDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;

  DenseSet<unsigned> ConstrainRegs;

public:
  SILowerI1Copies() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  void lowerCopiesFromI1();
  void lowerPhis();
  void lowerCopiesToI1();
  bool isConstantLaneMask(unsigned Reg, bool &Val) const;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);

  MachineBasicBlock::iterator
  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;

  bool isLaneMaskReg(unsigned Reg) const {
    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
               ST->getWavefrontSize();
  }
};

/// Helper class that analyzes the relationship between incoming values of a
/// phi in the control flow graph to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
/// the traversal.
///
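/// Hypothetical example: a phi in block D with incoming blocks B and C, and
/// CFG edges A->B, A->C, B->C (divergent branch in B), B->D, C->D. Block B
/// has no predecessor in the reachable set, so it is a source and its
/// incoming value is usable as-is; C is reachable from B, so its incoming
/// value must be merged; A is recorded as a predecessor and seeded with an
/// undef value for the SSAUpdater.
///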
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock,
               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (MachineBasicBlock *MBB : IncomingBlocks) {
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      bool Divergent = false;
      for (MachineInstr &MI : MBB->terminators()) {
        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
            MI.getOpcode() == AMDGPU::SI_IF ||
            MI.getOpcode() == AMDGPU::SI_ELSE ||
            MI.getOpcode() == AMDGPU::SI_LOOP) {
          Divergent = true;
          break;
        }
      }

      if (Divergent && PDT.dominates(&DefBlock, MBB)) {
        for (MachineBasicBlock *Succ : MBB->successors())
          Stack.push_back(Succ);
      }
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      for (MachineBasicBlock *Succ : MBB->successors())
        Stack.push_back(Succ);
    }

    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        ReachableMap[MBB] = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
/// relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
/// the traversal.
///
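/// Applied to the example above (back edges B->A and C->A): for a def in B
/// whose use is in C, the nearest common post-dominator of B and the use is
/// C, and a backward edge to B is reachable via B->A->B without passing
/// through C, so the bitwise lowering is required.
///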
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }
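
  // Typical use, mirroring lowerPhis() and lowerCopiesToI1() below: compute
  // the nearest common post-dominator of the def block and all uses, then
  //
  //   unsigned Level = LF.findLoop(PostDomBound);
  //   if (Level)
  //     LF.addLoopEntries(Level, SSAUpdater);
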
  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (MachineBasicBlock *MBB : Blocks)
      Dom = DT.findNearestCommonDominator(Dom, MBB);

    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<MachineBasicBlock *> Blocks) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    if (llvm::find(Blocks, &MBB) != Blocks.end())
      return true;

    return false;
  }

  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                    false)

char SILowerI1Copies::ID = 0;

char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

FunctionPass *llvm::createSILowerI1CopiesPass() {
  return new SILowerI1Copies();
}

static unsigned createLaneMaskReg(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
}

static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned UndefReg = createLaneMaskReg(MF);
  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
  MF = &TheMF;
  MRI = &MF->getRegInfo();
  DT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();

  lowerCopiesFromI1();
  lowerPhis();
  lowerCopiesToI1();

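  // SReg_64_XEXEC excludes EXEC, so the registers recorded while lowering
  // copies from i1 (used as V_CNDMASK condition operands) can never be
  // allocated to EXEC itself.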
  for (unsigned Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass);
  ConstrainRegs.clear();

  return true;
}

void SILowerI1Copies::lowerCopiesFromI1() {
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      unsigned DstReg = MI.getOperand(0).getReg();
      unsigned SrcReg = MI.getOperand(1).getReg();
      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
          MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass)
        continue;

      if (isLaneMaskReg(DstReg) ||
          (TargetRegisterInfo::isVirtualRegister(DstReg) &&
           MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass))
        continue;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
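      // V_CNDMASK_B32 performs a per-lane select:
      //   DstReg[lane] = SrcReg[lane] ? -1 : 0.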
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

void SILowerI1Copies::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT);
  SmallVector<MachineInstr *, 4> DeadPhis;
  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
  SmallVector<unsigned, 4> IncomingRegs;
  SmallVector<unsigned, 4> IncomingUpdated;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB.phis()) {
      unsigned DstReg = MI.getOperand(0).getReg();
      if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
        continue;

      LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);

      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);

      // Collect incoming values.
      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
        assert(i + 1 < MI.getNumOperands());
        unsigned IncomingReg = MI.getOperand(i).getReg();
        MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
        MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

        if (IncomingDef->getOpcode() == AMDGPU::COPY) {
          IncomingReg = IncomingDef->getOperand(1).getReg();
          assert(isLaneMaskReg(IncomingReg));
          assert(!IncomingDef->getOperand(1).getSubReg());
        } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
          continue;
        } else {
          assert(IncomingDef->isPHI());
        }

        IncomingBlocks.push_back(IncomingMBB);
        IncomingRegs.push_back(IncomingReg);
      }

      // Phis in a loop that are observed outside the loop receive a simple but
      // conservatively correct treatment.
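      // (The nearest common post-dominator of all uses bounds where the phi's
      // value is observed; if a back edge is reachable before that bound,
      // values from different iterations must be merged bitwise.)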
      MachineBasicBlock *PostDomBound = &MBB;
      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
        PostDomBound =
            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
      }

      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

      SSAUpdater.Initialize(DstReg);

      if (FoundLoopLevel) {
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          IncomingUpdated.push_back(createLaneMaskReg(*MF));
          SSAUpdater.AddAvailableValue(IncomingBlocks[i],
                                       IncomingUpdated.back());
        }

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          buildMergeLaneMasks(
              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
        }
      } else {
        // The phi is not observed from outside a loop. Use a more accurate
        // lowering.
        PIA.analyze(MBB, IncomingBlocks);

        for (MachineBasicBlock *MBB : PIA.predecessors())
          SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          if (PIA.isSource(IMBB)) {
            IncomingUpdated.push_back(0);
            SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
          } else {
            IncomingUpdated.push_back(createLaneMaskReg(*MF));
            SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
          }
        }

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          if (!IncomingUpdated[i])
            continue;

          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          buildMergeLaneMasks(
              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
        }
      }

      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
      if (NewReg != DstReg) {
        MRI->replaceRegWith(NewReg, DstReg);

        // Ensure that DstReg has a single def and mark the old PHI node for
        // deletion.
        MI.getOperand(0).setReg(NewReg);
        DeadPhis.push_back(&MI);
      }

      IncomingBlocks.clear();
      IncomingRegs.clear();
      IncomingUpdated.clear();
    }

    for (MachineInstr *MI : DeadPhis)
      MI->eraseFromParent();
    DeadPhis.clear();
  }
}

void SILowerI1Copies::lowerCopiesToI1() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      unsigned DstReg = MI.getOperand(0).getReg();
      if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
          MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
        continue;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      unsigned SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
          !isLaneMaskReg(SrcReg)) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        unsigned TmpReg = createLaneMaskReg(*MF);
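        // V_CMP_NE_U32 produces a lane mask: bit N of TmpReg is set iff
        // SrcReg is nonzero in lane N.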
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
      MachineBasicBlock *PostDomBound = &MBB;
      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
        PostDomBound =
            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
      }

      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != AMDGPU::S_MOV_B64)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
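///
/// The SALU bitwise operations used for lane-mask merging (S_AND_B64 etc.)
/// clobber SCC, so they must not be placed between a def of SCC and a
/// terminator that reads it.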
MachineBasicBlock::iterator
SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}

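// The merge below effectively computes
//   DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC),
// folding the cases where either input is a known constant (all-zeros or
// all-ones) lane mask.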
void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          const DebugLoc &DL, unsigned DstReg,
                                          unsigned PrevReg, unsigned CurReg) {
  bool PrevVal;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC);
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg)
          .addReg(AMDGPU::EXEC)
          .addImm(-1);
    }
    return;
  }

  unsigned PrevMaskedReg = 0;
  unsigned CurMaskedReg = 0;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(AMDGPU::EXEC);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg)
          .addReg(CurReg)
          .addReg(AMDGPU::EXEC);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg)
        .addReg(CurMaskedReg)
        .addReg(AMDGPU::EXEC);
  } else {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC);
  }
}