LLVM  8.0.1
SILoadStoreOptimizer.cpp
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass tries to fuse DS instructions with nearby immediate offsets.
11 // This will fuse operations such as
12 // ds_read_b32 v0, v2 offset:16
13 // ds_read_b32 v1, v2 offset:32
14 // ==>
15 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
16 //
17 // The same is done for certain SMEM and VMEM opcodes, e.g.:
18 // s_buffer_load_dword s4, s[0:3], 4
19 // s_buffer_load_dword s5, s[0:3], 8
20 // ==>
21 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
22 //
23 // This pass also tries to promote a constant offset to the immediate by
24 // adjusting the base. It tries to use a base from the nearby instructions that
25 // allows it to have a 13-bit constant offset and then promotes the 13-bit
26 // offset to the immediate.
27 // E.g.
28 // s_movk_i32 s0, 0x1800
29 // v_add_co_u32_e32 v0, vcc, s0, v2
30 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
31 //
32 // s_movk_i32 s0, 0x1000
33 // v_add_co_u32_e32 v5, vcc, s0, v2
34 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
35 // global_load_dwordx2 v[5:6], v[5:6], off
36 // global_load_dwordx2 v[0:1], v[0:1], off
37 // =>
38 // s_movk_i32 s0, 0x1000
39 // v_add_co_u32_e32 v5, vcc, s0, v2
40 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
41 // global_load_dwordx2 v[5:6], v[5:6], off
42 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
43 //
44 // Future improvements:
45 //
46 // - This currently relies on the scheduler to place loads and stores next to
47 // each other, and then only merges adjacent pairs of instructions. It would
48 // be good to be more flexible with interleaved instructions, and possibly run
49 // before scheduling. It currently misses stores of constants because loading
50 // the constant into the data register is placed between the stores, although
51 // this is arguably a scheduling problem.
52 //
53 // - Live interval recomputing seems inefficient. This currently only matches
54 // one pair, and recomputes live intervals and moves on to the next pair. It
55 // would be better to compute a list of all merges that need to occur.
56 //
57 // - With a list of instructions to process, we can also merge more. If a
58 // cluster of loads has offsets that are too large to fit in the 8-bit
59 // offset fields but are close enough together, we can add to the base
60 // pointer and use the new reduced offsets.
61 //
62 //===----------------------------------------------------------------------===//
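//
// Worked arithmetic for the examples above (an illustrative note, not part of
// the original comment): the two ds_read_b32 at byte offsets 16 and 32 access
// 4-byte elements, so the fused ds_read2_b32 encodes element offsets
// 16/4 = 4 and 32/4 = 8 (offset0:4, offset1:8). In the global_load example,
// the dropped base differs from the kept one by 0x1800 - 0x1000 = 0x800 bytes,
// which is why the second load is rewritten with offset:2048.
//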
63 
64 #include "AMDGPU.h"
65 #include "AMDGPUSubtarget.h"
66 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
67 #include "SIInstrInfo.h"
68 #include "SIRegisterInfo.h"
69 #include "Utils/AMDGPUBaseInfo.h"
70 #include "llvm/ADT/ArrayRef.h"
71 #include "llvm/ADT/SmallVector.h"
72 #include "llvm/ADT/StringRef.h"
73 #include "llvm/Analysis/AliasAnalysis.h"
74 #include "llvm/CodeGen/MachineBasicBlock.h"
75 #include "llvm/CodeGen/MachineFunction.h"
76 #include "llvm/CodeGen/MachineFunctionPass.h"
77 #include "llvm/CodeGen/MachineInstr.h"
78 #include "llvm/CodeGen/MachineInstrBuilder.h"
79 #include "llvm/CodeGen/MachineOperand.h"
80 #include "llvm/CodeGen/MachineRegisterInfo.h"
81 #include "llvm/IR/DebugLoc.h"
82 #include "llvm/Pass.h"
83 #include "llvm/Support/Debug.h"
84 #include "llvm/Support/MathExtras.h"
85 #include "llvm/Support/raw_ostream.h"
86 #include <algorithm>
87 #include <cassert>
88 #include <cstdlib>
89 #include <iterator>
90 #include <utility>
91 
92 using namespace llvm;
93 
94 #define DEBUG_TYPE "si-load-store-opt"
95 
96 namespace {
97 enum InstClassEnum {
98  UNKNOWN,
99  DS_READ,
100  DS_WRITE,
101  S_BUFFER_LOAD_IMM,
102  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
103  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
104  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
105  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
106  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
107  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
108  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
109  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
110 };
111 
112 enum RegisterEnum {
113  SBASE = 0x1,
114  SRSRC = 0x2,
115  SOFFSET = 0x4,
116  VADDR = 0x8,
117  ADDR = 0x10,
118 };
119 
120 class SILoadStoreOptimizer : public MachineFunctionPass {
121  struct CombineInfo {
122  MachineBasicBlock::iterator I;
123  MachineBasicBlock::iterator Paired;
124  unsigned EltSize;
125  unsigned Offset0;
126  unsigned Offset1;
127  unsigned Width0;
128  unsigned Width1;
129  unsigned BaseOff;
130  InstClassEnum InstClass;
131  bool GLC0;
132  bool GLC1;
133  bool SLC0;
134  bool SLC1;
135  bool UseST64;
136  SmallVector<MachineInstr *, 8> InstsToMove;
137  };
138 
139  struct BaseRegisters {
140  unsigned LoReg = 0;
141  unsigned HiReg = 0;
142 
143  unsigned LoSubReg = 0;
144  unsigned HiSubReg = 0;
145  };
146 
147  struct MemAddress {
148  BaseRegisters Base;
149  int64_t Offset = 0;
150  };
151 
152  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
153 
154 private:
155  const GCNSubtarget *STM = nullptr;
156  const SIInstrInfo *TII = nullptr;
157  const SIRegisterInfo *TRI = nullptr;
158  MachineRegisterInfo *MRI = nullptr;
159  AliasAnalysis *AA = nullptr;
160  bool OptimizeAgain;
161 
162  static bool offsetsCanBeCombined(CombineInfo &CI);
163  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
164  static unsigned getNewOpcode(const CombineInfo &CI);
165  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
166  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
167  unsigned getOpcodeWidth(const MachineInstr &MI);
168  InstClassEnum getInstClass(unsigned Opc);
169  unsigned getRegs(unsigned Opc);
170 
171  bool findMatchingInst(CombineInfo &CI);
172 
173  unsigned read2Opcode(unsigned EltSize) const;
174  unsigned read2ST64Opcode(unsigned EltSize) const;
175  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
176 
177  unsigned write2Opcode(unsigned EltSize) const;
178  unsigned write2ST64Opcode(unsigned EltSize) const;
179  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
180  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
181  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
182  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
183 
184  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
185  int32_t NewOffset);
186  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
187  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
188  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
189  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
190  /// Promotes a constant offset to the immediate by adjusting the base. It
191  /// tries to use a base from the nearby instructions that allows it to have
192  /// a 13-bit constant offset which gets promoted to the immediate.
193  bool promoteConstantOffsetToImm(MachineInstr &CI,
194  MemInfoMap &Visited,
195  SmallPtrSet<MachineInstr *, 4> &AnchorList);
196 
197 public:
198  static char ID;
199 
200  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
201  initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
202  }
203 
204  bool optimizeBlock(MachineBasicBlock &MBB);
205 
206  bool runOnMachineFunction(MachineFunction &MF) override;
207 
208  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
209 
210  void getAnalysisUsage(AnalysisUsage &AU) const override {
211  AU.setPreservesCFG();
212  AU.addRequired<AAResultsWrapperPass>();
213 
214  MachineFunctionPass::getAnalysisUsage(AU);
215  }
216 };
217 
218 } // end anonymous namespace.
219 
220 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
221  "SI Load Store Optimizer", false, false)
222 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
223 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
224  false, false)
225 
226 char SILoadStoreOptimizer::ID = 0;
227 
228 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
229 
230 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
231  return new SILoadStoreOptimizer();
232 }
233 
234 static void moveInstsAfter(MachineBasicBlock::iterator I,
235  ArrayRef<MachineInstr *> InstsToMove) {
236  MachineBasicBlock *MBB = I->getParent();
237  ++I;
238  for (MachineInstr *MI : InstsToMove) {
239  MI->removeFromParent();
240  MBB->insert(I, MI);
241  }
242 }
243 
244 static void addDefsUsesToList(const MachineInstr &MI,
245  DenseSet<unsigned> &RegDefs,
246  DenseSet<unsigned> &PhysRegUses) {
247  for (const MachineOperand &Op : MI.operands()) {
248  if (Op.isReg()) {
249  if (Op.isDef())
250  RegDefs.insert(Op.getReg());
251  else if (Op.readsReg() &&
252  TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
253  PhysRegUses.insert(Op.getReg());
254  }
255  }
256 }
257 
258 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
259  MachineBasicBlock::iterator B,
260  const SIInstrInfo *TII,
261  AliasAnalysis *AA) {
262  // RAW or WAR - cannot reorder
263  // WAW - cannot reorder
264  // RAR - safe to reorder
265  return !(A->mayStore() || B->mayStore()) ||
266  TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
267 }
268 
269 // Add MI and its defs to the lists if MI reads one of the defs that are
270 // already in the list. Returns true in that case.
271 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
272  DenseSet<unsigned> &PhysRegUses,
273  SmallVectorImpl<MachineInstr *> &Insts) {
274  for (MachineOperand &Use : MI.operands()) {
275  // If one of the defs is read, then there is a use of Def between I and the
276  // instruction that I will potentially be merged with. We will need to move
277  // this instruction after the merged instructions.
278  //
279  // Similarly, if there is a def which is read by an instruction that is to
280  // be moved for merging, then we need to move the def-instruction as well.
281  // This can only happen for physical registers such as M0; virtual
282  // registers are in SSA form.
283  if (Use.isReg() &&
284  ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
285  (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
286  PhysRegUses.count(Use.getReg())))) {
287  Insts.push_back(&MI);
288  addDefsUsesToList(MI, RegDefs, PhysRegUses);
289  return true;
290  }
291  }
292 
293  return false;
294 }
295 
296 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
297  ArrayRef<MachineInstr *> InstsToMove,
298  const SIInstrInfo *TII, AliasAnalysis *AA) {
299  assert(MemOp.mayLoadOrStore());
300 
301  for (MachineInstr *InstToMove : InstsToMove) {
302  if (!InstToMove->mayLoadOrStore())
303  continue;
304  if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
305  return false;
306  }
307  return true;
308 }
309 
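// A rough sketch (not taken from the original sources) of the DS cases that
// offsetsCanBeCombined handles below, in the order they are tried:
//   1. Both element offsets are multiples of 64 and fit in 8 bits after
//      dividing by 64 -> use the stride-64 (ST64) forms.
//   2. Both element offsets already fit in 8 bits -> use the plain forms.
//   3. Otherwise, record BaseOff as the smaller byte offset and check whether
//      the element-offset difference fits in 8 bits (or in 8 bits of
//      64-element strides) once that base adjustment is applied.
// For example, byte offsets 0x900 and 0x904 with 4-byte elements give element
// offsets 576 and 577; neither fits in 8 bits, but with BaseOff = 0x900 the
// reduced offsets become 0 and 1.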
310 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
311  // XXX - Would the same offset be OK? Is there any reason this would happen or
312  // be useful?
313  if (CI.Offset0 == CI.Offset1)
314  return false;
315 
316  // This won't be valid if the offset isn't aligned.
317  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
318  return false;
319 
320  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
321  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
322  CI.UseST64 = false;
323  CI.BaseOff = 0;
324 
325  // Handle SMEM and VMEM instructions.
326  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
327  return (EltOffset0 + CI.Width0 == EltOffset1 ||
328  EltOffset1 + CI.Width1 == EltOffset0) &&
329  CI.GLC0 == CI.GLC1 &&
330  (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
331  }
332 
333  // If the offset in elements doesn't fit in 8-bits, we might be able to use
334  // the stride 64 versions.
335  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
336  isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
337  CI.Offset0 = EltOffset0 / 64;
338  CI.Offset1 = EltOffset1 / 64;
339  CI.UseST64 = true;
340  return true;
341  }
342 
343  // Check if the new offsets fit in the reduced 8-bit range.
344  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
345  CI.Offset0 = EltOffset0;
346  CI.Offset1 = EltOffset1;
347  return true;
348  }
349 
350  // Try to shift base address to decrease offsets.
351  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
352  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
353 
354  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
355  CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
356  CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
357  CI.UseST64 = true;
358  return true;
359  }
360 
361  if (isUInt<8>(OffsetDiff)) {
362  CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
363  CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
364  return true;
365  }
366 
367  return false;
368 }
369 
370 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
371  const CombineInfo &CI) {
372  const unsigned Width = (CI.Width0 + CI.Width1);
373  switch (CI.InstClass) {
374  default:
375  return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
376  case S_BUFFER_LOAD_IMM:
377  switch (Width) {
378  default:
379  return false;
380  case 2:
381  case 4:
382  return true;
383  }
384  }
385 }
386 
387 unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
388  const unsigned Opc = MI.getOpcode();
389 
390  if (TII->isMUBUF(MI)) {
391  return AMDGPU::getMUBUFDwords(Opc);
392  }
393 
394  switch (Opc) {
395  default:
396  return 0;
397  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
398  return 1;
399  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
400  return 2;
401  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
402  return 4;
403  }
404 }
405 
406 InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
407  if (TII->isMUBUF(Opc)) {
408  const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
409 
410  // If we couldn't identify the opcode, bail out.
411  if (baseOpcode == -1) {
412  return UNKNOWN;
413  }
414 
415  switch (baseOpcode) {
416  default:
417  return UNKNOWN;
418  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
419  return BUFFER_LOAD_OFFEN;
420  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
421  return BUFFER_LOAD_OFFSET;
422  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
423  return BUFFER_STORE_OFFEN;
424  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
425  return BUFFER_STORE_OFFSET;
426  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
427  return BUFFER_LOAD_OFFEN_exact;
428  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
429  return BUFFER_LOAD_OFFSET_exact;
430  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
431  return BUFFER_STORE_OFFEN_exact;
432  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
433  return BUFFER_STORE_OFFSET_exact;
434  }
435  }
436 
437  switch (Opc) {
438  default:
439  return UNKNOWN;
440  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
441  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
442  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
443  return S_BUFFER_LOAD_IMM;
444  case AMDGPU::DS_READ_B32:
445  case AMDGPU::DS_READ_B64:
446  case AMDGPU::DS_READ_B32_gfx9:
447  case AMDGPU::DS_READ_B64_gfx9:
448  return DS_READ;
449  case AMDGPU::DS_WRITE_B32:
450  case AMDGPU::DS_WRITE_B64:
451  case AMDGPU::DS_WRITE_B32_gfx9:
452  case AMDGPU::DS_WRITE_B64_gfx9:
453  return DS_WRITE;
454  }
455 }
456 
457 unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
458  if (TII->isMUBUF(Opc)) {
459  unsigned result = 0;
460 
461  if (AMDGPU::getMUBUFHasVAddr(Opc)) {
462  result |= VADDR;
463  }
464 
465  if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
466  result |= SRSRC;
467  }
468 
469  if (AMDGPU::getMUBUFHasSoffset(Opc)) {
470  result |= SOFFSET;
471  }
472 
473  return result;
474  }
475 
476  switch (Opc) {
477  default:
478  return 0;
479  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
480  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
481  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
482  return SBASE;
483  case AMDGPU::DS_READ_B32:
484  case AMDGPU::DS_READ_B64:
485  case AMDGPU::DS_READ_B32_gfx9:
486  case AMDGPU::DS_READ_B64_gfx9:
487  case AMDGPU::DS_WRITE_B32:
488  case AMDGPU::DS_WRITE_B64:
489  case AMDGPU::DS_WRITE_B32_gfx9:
490  case AMDGPU::DS_WRITE_B64_gfx9:
491  return ADDR;
492  }
493 }
494 
495 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
496  MachineBasicBlock *MBB = CI.I->getParent();
497  MachineBasicBlock::iterator E = MBB->end();
498  MachineBasicBlock::iterator MBBI = CI.I;
499 
500  const unsigned Opc = CI.I->getOpcode();
501  const InstClassEnum InstClass = getInstClass(Opc);
502 
503  if (InstClass == UNKNOWN) {
504  return false;
505  }
506 
507  const unsigned Regs = getRegs(Opc);
508 
509  unsigned AddrOpName[5] = {0};
510  int AddrIdx[5];
511  const MachineOperand *AddrReg[5];
512  unsigned NumAddresses = 0;
513 
514  if (Regs & ADDR) {
515  AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
516  }
517 
518  if (Regs & SBASE) {
519  AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
520  }
521 
522  if (Regs & SRSRC) {
523  AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
524  }
525 
526  if (Regs & SOFFSET) {
527  AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
528  }
529 
530  if (Regs & VADDR) {
531  AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
532  }
533 
534  for (unsigned i = 0; i < NumAddresses; i++) {
535  AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
536  AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
537 
538  // We only ever merge operations with the same base address register, so
539  // don't bother scanning forward if there are no other uses.
540  if (AddrReg[i]->isReg() &&
541  (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
542  MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
543  return false;
544  }
545 
546  ++MBBI;
547 
548  DenseSet<unsigned> RegDefsToMove;
549  DenseSet<unsigned> PhysRegUsesToMove;
550  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
551 
552  for (; MBBI != E; ++MBBI) {
553  const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
554 
555  if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
556  (IsDS && (MBBI->getOpcode() != Opc))) {
557  // This is not a matching DS instruction, but we can keep looking as
558  // long as one of these conditions are met:
559  // 1. It is safe to move I down past MBBI.
560  // 2. It is safe to move MBBI down past the instruction that I will
561  // be merged into.
562 
563  if (MBBI->hasUnmodeledSideEffects()) {
564  // We can't re-order this instruction with respect to other memory
565  // operations, so we fail both conditions mentioned above.
566  return false;
567  }
568 
569  if (MBBI->mayLoadOrStore() &&
570  (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
571  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
572  // We fail condition #1, but we may still be able to satisfy condition
573  // #2. Add this instruction to the move list and then we will check
574  // if condition #2 holds once we have selected the matching instruction.
575  CI.InstsToMove.push_back(&*MBBI);
576  addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
577  continue;
578  }
579 
580  // When we match I with another DS instruction we will be moving I down
581  // to the location of the matched instruction; any uses of I will need to
582  // be moved down as well.
583  addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
584  CI.InstsToMove);
585  continue;
586  }
587 
588  // Don't merge volatiles.
589  if (MBBI->hasOrderedMemoryRef())
590  return false;
591 
592  // Handle a case like
593  // DS_WRITE_B32 addr, v, idx0
594  // w = DS_READ_B32 addr, idx0
595  // DS_WRITE_B32 addr, f(w), idx1
596  // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
597  // merging of the two writes.
598  if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
599  CI.InstsToMove))
600  continue;
601 
602  bool Match = true;
603  for (unsigned i = 0; i < NumAddresses; i++) {
604  const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
605 
606  if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
607  if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
608  AddrReg[i]->getImm() != AddrRegNext.getImm()) {
609  Match = false;
610  break;
611  }
612  continue;
613  }
614 
615  // Check same base pointer. Be careful of subregisters, which can occur
616  // with vectors of pointers.
617  if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
618  AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
619  Match = false;
620  break;
621  }
622  }
623 
624  if (Match) {
625  int OffsetIdx =
626  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
627  CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
628  CI.Width0 = getOpcodeWidth(*CI.I);
629  CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
630  CI.Width1 = getOpcodeWidth(*MBBI);
631  CI.Paired = MBBI;
632 
633  if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
634  CI.Offset0 &= 0xffff;
635  CI.Offset1 &= 0xffff;
636  } else {
637  CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
638  CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
639  if (CI.InstClass != S_BUFFER_LOAD_IMM) {
640  CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
641  CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
642  }
643  }
644 
645  // Check both offsets fit in the reduced range.
646  // We also need to go through the list of instructions that we plan to
647  // move and make sure they are all safe to move down past the merged
648  // instruction.
649  if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
650  if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
651  return true;
652  }
653 
654  // We've found a load/store that we couldn't merge for some reason.
655  // We could potentially keep looking, but we'd need to make sure that
656  // it was safe to move I and also all the instructions in InstsToMove
657  // down past this instruction.
658  // Check if we can move I across MBBI and if we can move all I's users.
659  if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
660  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
661  break;
662  }
663  return false;
664 }
665 
666 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
667  if (STM->ldsRequiresM0Init())
668  return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
669  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
670 }
671 
672 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
673  if (STM->ldsRequiresM0Init())
674  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
675 
676  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
677  : AMDGPU::DS_READ2ST64_B64_gfx9;
678 }
679 
680 MachineBasicBlock::iterator
681 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
682  MachineBasicBlock *MBB = CI.I->getParent();
683 
684  // Be careful, since the addresses could be subregisters themselves in weird
685  // cases, like vectors of pointers.
686  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
687 
688  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
689  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
690 
691  unsigned NewOffset0 = CI.Offset0;
692  unsigned NewOffset1 = CI.Offset1;
693  unsigned Opc =
694  CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
695 
696  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
697  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
698 
699  if (NewOffset0 > NewOffset1) {
700  // Canonicalize the merged instruction so the smaller offset comes first.
701  std::swap(NewOffset0, NewOffset1);
702  std::swap(SubRegIdx0, SubRegIdx1);
703  }
704 
705  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
706  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
707 
708  const MCInstrDesc &Read2Desc = TII->get(Opc);
709 
710  const TargetRegisterClass *SuperRC =
711  (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
712  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
713 
714  DebugLoc DL = CI.I->getDebugLoc();
715 
716  unsigned BaseReg = AddrReg->getReg();
717  unsigned BaseSubReg = AddrReg->getSubReg();
718  unsigned BaseRegFlags = 0;
719  if (CI.BaseOff) {
720  unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
721  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
722  .addImm(CI.BaseOff);
723 
724  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
725  BaseRegFlags = RegState::Kill;
726 
727  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
728  .addReg(ImmReg)
729  .addReg(AddrReg->getReg(), 0, BaseSubReg);
730  BaseSubReg = 0;
731  }
732 
733  MachineInstrBuilder Read2 =
734  BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
735  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
736  .addImm(NewOffset0) // offset0
737  .addImm(NewOffset1) // offset1
738  .addImm(0) // gds
739  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
740 
741  (void)Read2;
742 
743  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
744 
745  // Copy to the old destination registers.
746  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
747  .add(*Dest0) // Copy to same destination including flags and sub reg.
748  .addReg(DestReg, 0, SubRegIdx0);
749  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
750  .add(*Dest1)
751  .addReg(DestReg, RegState::Kill, SubRegIdx1);
752 
753  moveInstsAfter(Copy1, CI.InstsToMove);
754 
755  MachineBasicBlock::iterator Next = std::next(CI.I);
756  CI.I->eraseFromParent();
757  CI.Paired->eraseFromParent();
758 
759  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
760  return Next;
761 }
762 
763 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
764  if (STM->ldsRequiresM0Init())
765  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
766  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
767  : AMDGPU::DS_WRITE2_B64_gfx9;
768 }
769 
770 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
771  if (STM->ldsRequiresM0Init())
772  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
773  : AMDGPU::DS_WRITE2ST64_B64;
774 
775  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
776  : AMDGPU::DS_WRITE2ST64_B64_gfx9;
777 }
778 
779 MachineBasicBlock::iterator
780 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
781  MachineBasicBlock *MBB = CI.I->getParent();
782 
783  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
784  // sure we preserve the subregister index and any register flags set on them.
785  const MachineOperand *AddrReg =
786  TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
787  const MachineOperand *Data0 =
788  TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
789  const MachineOperand *Data1 =
790  TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
791 
792  unsigned NewOffset0 = CI.Offset0;
793  unsigned NewOffset1 = CI.Offset1;
794  unsigned Opc =
795  CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
796 
797  if (NewOffset0 > NewOffset1) {
798  // Canonicalize the merged instruction so the smaller offset comes first.
799  std::swap(NewOffset0, NewOffset1);
800  std::swap(Data0, Data1);
801  }
802 
803  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
804  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
805 
806  const MCInstrDesc &Write2Desc = TII->get(Opc);
807  DebugLoc DL = CI.I->getDebugLoc();
808 
809  unsigned BaseReg = AddrReg->getReg();
810  unsigned BaseSubReg = AddrReg->getSubReg();
811  unsigned BaseRegFlags = 0;
812  if (CI.BaseOff) {
813  unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
814  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
815  .addImm(CI.BaseOff);
816 
817  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
818  BaseRegFlags = RegState::Kill;
819 
820  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
821  .addReg(ImmReg)
822  .addReg(AddrReg->getReg(), 0, BaseSubReg);
823  BaseSubReg = 0;
824  }
825 
826  MachineInstrBuilder Write2 =
827  BuildMI(*MBB, CI.Paired, DL, Write2Desc)
828  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
829  .add(*Data0) // data0
830  .add(*Data1) // data1
831  .addImm(NewOffset0) // offset0
832  .addImm(NewOffset1) // offset1
833  .addImm(0) // gds
834  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
835 
836  moveInstsAfter(Write2, CI.InstsToMove);
837 
838  MachineBasicBlock::iterator Next = std::next(CI.I);
839  CI.I->eraseFromParent();
840  CI.Paired->eraseFromParent();
841 
842  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
843  return Next;
844 }
845 
846 MachineBasicBlock::iterator
847 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
848  MachineBasicBlock *MBB = CI.I->getParent();
849  DebugLoc DL = CI.I->getDebugLoc();
850  const unsigned Opcode = getNewOpcode(CI);
851 
852  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
853 
854  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
855  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
856 
857  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
858  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
859  .addImm(MergedOffset) // offset
860  .addImm(CI.GLC0) // glc
861  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
862 
863  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
864  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
865  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
866 
867  // Copy to the old destination registers.
868  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
869  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
870  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
871 
872  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
873  .add(*Dest0) // Copy to same destination including flags and sub reg.
874  .addReg(DestReg, 0, SubRegIdx0);
875  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
876  .add(*Dest1)
877  .addReg(DestReg, RegState::Kill, SubRegIdx1);
878 
879  moveInstsAfter(Copy1, CI.InstsToMove);
880 
881  MachineBasicBlock::iterator Next = std::next(CI.I);
882  CI.I->eraseFromParent();
883  CI.Paired->eraseFromParent();
884  return Next;
885 }
886 
887 MachineBasicBlock::iterator
888 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
889  MachineBasicBlock *MBB = CI.I->getParent();
890  DebugLoc DL = CI.I->getDebugLoc();
891 
892  const unsigned Opcode = getNewOpcode(CI);
893 
894  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
895 
896  // Copy to the new source register.
897  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
898  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
899 
900  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
901 
902  const unsigned Regs = getRegs(Opcode);
903 
904  if (Regs & VADDR)
905  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
906 
907  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
908  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
909  .addImm(MergedOffset) // offset
910  .addImm(CI.GLC0) // glc
911  .addImm(CI.SLC0) // slc
912  .addImm(0) // tfe
913  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
914 
915  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
916  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
917  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
918 
919  // Copy to the old destination registers.
920  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
921  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
922  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
923 
924  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
925  .add(*Dest0) // Copy to same destination including flags and sub reg.
926  .addReg(DestReg, 0, SubRegIdx0);
927  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
928  .add(*Dest1)
929  .addReg(DestReg, RegState::Kill, SubRegIdx1);
930 
931  moveInstsAfter(Copy1, CI.InstsToMove);
932 
933  MachineBasicBlock::iterator Next = std::next(CI.I);
934  CI.I->eraseFromParent();
935  CI.Paired->eraseFromParent();
936  return Next;
937 }
938 
939 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
940  const unsigned Width = CI.Width0 + CI.Width1;
941 
942  switch (CI.InstClass) {
943  default:
944  return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
945  case UNKNOWN:
946  llvm_unreachable("Unknown instruction class");
947  case S_BUFFER_LOAD_IMM:
948  switch (Width) {
949  default:
950  return 0;
951  case 2:
952  return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
953  case 4:
954  return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
955  }
956  }
957 }
958 
959 std::pair<unsigned, unsigned>
960 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
961  if (CI.Offset0 > CI.Offset1) {
962  switch (CI.Width0) {
963  default:
964  return std::make_pair(0, 0);
965  case 1:
966  switch (CI.Width1) {
967  default:
968  return std::make_pair(0, 0);
969  case 1:
970  return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
971  case 2:
972  return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
973  case 3:
974  return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
975  }
976  case 2:
977  switch (CI.Width1) {
978  default:
979  return std::make_pair(0, 0);
980  case 1:
981  return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
982  case 2:
983  return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
984  }
985  case 3:
986  switch (CI.Width1) {
987  default:
988  return std::make_pair(0, 0);
989  case 1:
990  return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
991  }
992  }
993  } else {
994  switch (CI.Width0) {
995  default:
996  return std::make_pair(0, 0);
997  case 1:
998  switch (CI.Width1) {
999  default:
1000  return std::make_pair(0, 0);
1001  case 1:
1002  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
1003  case 2:
1004  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
1005  case 3:
1006  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
1007  }
1008  case 2:
1009  switch (CI.Width1) {
1010  default:
1011  return std::make_pair(0, 0);
1012  case 1:
1013  return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
1014  case 2:
1015  return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
1016  }
1017  case 3:
1018  switch (CI.Width1) {
1019  default:
1020  return std::make_pair(0, 0);
1021  case 1:
1022  return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
1023  }
1024  }
1025  }
1026 }
1027 
1028 const TargetRegisterClass *
1029 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1030  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1031  switch (CI.Width0 + CI.Width1) {
1032  default:
1033  return nullptr;
1034  case 2:
1035  return &AMDGPU::SReg_64_XEXECRegClass;
1036  case 4:
1037  return &AMDGPU::SReg_128RegClass;
1038  case 8:
1039  return &AMDGPU::SReg_256RegClass;
1040  case 16:
1041  return &AMDGPU::SReg_512RegClass;
1042  }
1043  } else {
1044  switch (CI.Width0 + CI.Width1) {
1045  default:
1046  return nullptr;
1047  case 2:
1048  return &AMDGPU::VReg_64RegClass;
1049  case 3:
1050  return &AMDGPU::VReg_96RegClass;
1051  case 4:
1052  return &AMDGPU::VReg_128RegClass;
1053  }
1054  }
1055 }
1056 
1057 MachineBasicBlock::iterator
1058 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1059  MachineBasicBlock *MBB = CI.I->getParent();
1060  DebugLoc DL = CI.I->getDebugLoc();
1061 
1062  const unsigned Opcode = getNewOpcode(CI);
1063 
1064  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1065  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1066  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1067 
1068  // Copy to the new source register.
1069  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1070  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
1071 
1072  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1073  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1074 
1075  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1076  .add(*Src0)
1077  .addImm(SubRegIdx0)
1078  .add(*Src1)
1079  .addImm(SubRegIdx1);
1080 
1081  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1082  .addReg(SrcReg, RegState::Kill);
1083 
1084  const unsigned Regs = getRegs(Opcode);
1085 
1086  if (Regs & VADDR)
1087  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1088 
1089  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1090  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1091  .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1092  .addImm(CI.GLC0) // glc
1093  .addImm(CI.SLC0) // slc
1094  .addImm(0) // tfe
1095  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
1096 
1097  moveInstsAfter(MIB, CI.InstsToMove);
1098 
1099  MachineBasicBlock::iterator Next = std::next(CI.I);
1100  CI.I->eraseFromParent();
1101  CI.Paired->eraseFromParent();
1102  return Next;
1103 }
1104 
1105 MachineOperand
1106 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
1107  APInt V(32, Val, true);
1108  if (TII->isInlineConstant(V))
1109  return MachineOperand::CreateImm(Val);
1110 
1111  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1112  MachineInstr *Mov =
1113  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1114  TII->get(AMDGPU::S_MOV_B32), Reg)
1115  .addImm(Val);
1116  (void)Mov;
1117  LLVM_DEBUG(dbgs() << " "; Mov->dump());
1118  return MachineOperand::CreateReg(Reg, false);
1119 }
1120 
1121 // Compute base address using Addr and return the final register.
1122 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1123  const MemAddress &Addr) {
1124  MachineBasicBlock *MBB = MI.getParent();
1125  MachineBasicBlock::iterator MBBI = MI.getIterator();
1126  DebugLoc DL = MI.getDebugLoc();
1127 
1128  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1129  Addr.Base.LoSubReg) &&
1130  "Expected 32-bit Base-Register-Low!!");
1131 
1132  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1133  Addr.Base.HiSubReg) &&
1134  "Expected 32-bit Base-Register-Hi!!");
1135 
1136  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1137  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1138  MachineOperand OffsetHi =
1139  createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1140  unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
1141  unsigned DeadCarryReg =
1142  MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
1143 
1144  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1145  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1146  MachineInstr *LoHalf =
1147  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1148  .addReg(CarryReg, RegState::Define)
1149  .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1150  .add(OffsetLo);
1151  (void)LoHalf;
1152  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1153 
1154  MachineInstr *HiHalf =
1155  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1156  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1157  .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1158  .add(OffsetHi)
1159  .addReg(CarryReg, RegState::Kill);
1160  (void)HiHalf;
1161  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1162 
1163  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1164  MachineInstr *FullBase =
1165  BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1166  .addReg(DestSub0)
1167  .addImm(AMDGPU::sub0)
1168  .addReg(DestSub1)
1169  .addImm(AMDGPU::sub1);
1170  (void)FullBase;
1171  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1172 
1173  return FullDestReg;
1174 }
1175 
1176 // Update base and offset with the NewBase and NewOffset in MI.
1177 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1178  unsigned NewBase,
1179  int32_t NewOffset) {
1180  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
1181  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1182 }
1183 
1184 Optional<int32_t>
1185 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
1186  if (Op.isImm())
1187  return Op.getImm();
1188 
1189  if (!Op.isReg())
1190  return None;
1191 
1192  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1193  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1194  !Def->getOperand(1).isImm())
1195  return None;
1196 
1197  return Def->getOperand(1).getImm();
1198 }
1199 
1200 // Analyzes Base and extracts:
1201 // - 32bit base registers, subregisters
1202 // - 64bit constant offset
1203 // Expecting base computation as:
1204 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1205 // %LO:vgpr_32, %c:sreg_64_xexec =
1206 // V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1207 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1208 // %Base:vreg_64 =
1209 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
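// Illustrative note (values taken from the hypothetical pattern above): the
// low add contributes 8000 and the high add contributes 0, so the function
// records Addr.Base.{LoReg,HiReg} = {%BASE_LO, %BASE_HI} and Addr.Offset = 8000.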
1210 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1211  MemAddress &Addr) {
1212  if (!Base.isReg())
1213  return;
1214 
1215  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1216  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1217  || Def->getNumOperands() != 5)
1218  return;
1219 
1220  MachineOperand BaseLo = Def->getOperand(1);
1221  MachineOperand BaseHi = Def->getOperand(3);
1222  if (!BaseLo.isReg() || !BaseHi.isReg())
1223  return;
1224 
1225  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1226  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1227 
1228  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1229  !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1230  return;
1231 
1232  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1233  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1234 
1235  auto Offset0P = extractConstOffset(*Src0);
1236  if (Offset0P)
1237  BaseLo = *Src1;
1238  else {
1239  if (!(Offset0P = extractConstOffset(*Src1)))
1240  return;
1241  BaseLo = *Src0;
1242  }
1243 
1244  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1245  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1246 
1247  if (Src0->isImm())
1248  std::swap(Src0, Src1);
1249 
1250  if (!Src1->isImm())
1251  return;
1252 
1253  uint64_t Offset1 = Src1->getImm();
1254  BaseHi = *Src0;
1255 
1256  Addr.Base.LoReg = BaseLo.getReg();
1257  Addr.Base.HiReg = BaseHi.getReg();
1258  Addr.Base.LoSubReg = BaseLo.getSubReg();
1259  Addr.Base.HiSubReg = BaseHi.getSubReg();
1260  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1261 }
1262 
1263 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1264  MachineInstr &MI,
1265  MemInfoMap &Visited,
1266  SmallPtrSet<MachineInstr *, 4> &AnchorList) {
1267 
1268  // TODO: Support flat and scratch.
1269  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
1270  TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1271  return false;
1272 
1273  // TODO: Support Store.
1274  if (!MI.mayLoad())
1275  return false;
1276 
1277  if (AnchorList.count(&MI))
1278  return false;
1279 
1280  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1281 
1282  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1283  LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1284  return false;
1285  }
1286 
1287  // Step1: Find the base-registers and a 64bit constant offset.
1288  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1289  MemAddress MAddr;
1290  if (Visited.find(&MI) == Visited.end()) {
1291  processBaseWithConstOffset(Base, MAddr);
1292  Visited[&MI] = MAddr;
1293  } else
1294  MAddr = Visited[&MI];
1295 
1296  if (MAddr.Offset == 0) {
1297  LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1298  " constant offsets that can be promoted.\n";);
1299  return false;
1300  }
1301 
1302  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1303  << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1304 
1305  // Step2: Traverse through MI's basic block and find an anchor (that has the
1306  // same base registers) with the highest 13-bit distance from MI's offset.
1307  // E.g. (64bit loads)
1308  // bb:
1309  // addr1 = &a + 4096; load1 = load(addr1, 0)
1310  // addr2 = &a + 6144; load2 = load(addr2, 0)
1311  // addr3 = &a + 8192; load3 = load(addr3, 0)
1312  // addr4 = &a + 10240; load4 = load(addr4, 0)
1313  // addr5 = &a + 12288; load5 = load(addr5, 0)
1314  //
1315  // Starting from the first load, the optimization will try to find a new base
1316  // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
1317  // have a 13-bit distance from &a + 4096. The heuristic considers &a + 8192
1318  // as the new base (anchor) because the maximum distance can presumably
1319  // accommodate more intermediate bases.
1320  //
1321  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1322  // (&a + 8192) for load1, load2, load4.
1323  // addr = &a + 8192
1324  // load1 = load(addr, -4096)
1325  // load2 = load(addr, -2048)
1326  // load3 = load(addr, 0)
1327  // load4 = load(addr, 2048)
1328  // addr5 = &a + 12288; load5 = load(addr5, 0)
1329  //
1330  MachineInstr *AnchorInst = nullptr;
1331  MemAddress AnchorAddr;
1332  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1333  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1334 
1335  MachineBasicBlock *MBB = MI.getParent();
1336  MachineBasicBlock::iterator E = MBB->end();
1337  MachineBasicBlock::iterator MBBI = MI.getIterator();
1338  ++MBBI;
1339  const SITargetLowering *TLI =
1340  static_cast<const SITargetLowering *>(STM->getTargetLowering());
1341 
1342  for ( ; MBBI != E; ++MBBI) {
1343  MachineInstr &MINext = *MBBI;
1344  // TODO: Support finding an anchor(with same base) from store addresses or
1345  // any other load addresses where the opcodes are different.
1346  if (MINext.getOpcode() != MI.getOpcode() ||
1347  TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1348  continue;
1349 
1350  const MachineOperand &BaseNext =
1351  *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1352  MemAddress MAddrNext;
1353  if (Visited.find(&MINext) == Visited.end()) {
1354  processBaseWithConstOffset(BaseNext, MAddrNext);
1355  Visited[&MINext] = MAddrNext;
1356  } else
1357  MAddrNext = Visited[&MINext];
1358 
1359  if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1360  MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1361  MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1362  MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1363  continue;
1364 
1365  InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1366 
1367  int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1368  TargetLoweringBase::AddrMode AM;
1369  AM.HasBaseReg = true;
1370  AM.BaseOffs = Dist;
1371  if (TLI->isLegalGlobalAddressingMode(AM) &&
1372  (uint32_t)std::abs(Dist) > MaxDist) {
1373  MaxDist = std::abs(Dist);
1374 
1375  AnchorAddr = MAddrNext;
1376  AnchorInst = &MINext;
1377  }
1378  }
1379 
1380  if (AnchorInst) {
1381  LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1382  AnchorInst->dump());
1383  LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1384  << AnchorAddr.Offset << "\n\n");
1385 
1386  // Instead of moving up, just re-compute anchor-instruction's base address.
1387  unsigned Base = computeBase(MI, AnchorAddr);
1388 
1389  updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1390  LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
1391 
1392  for (auto P : InstsWCommonBase) {
1393  TargetLoweringBase::AddrMode AM;
1394  AM.HasBaseReg = true;
1395  AM.BaseOffs = P.second - AnchorAddr.Offset;
1396 
1397  if (TLI->isLegalGlobalAddressingMode(AM)) {
1398  LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1399  dbgs() << ")"; P.first->dump());
1400  updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1401  LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
1402  }
1403  }
1404  AnchorList.insert(AnchorInst);
1405  return true;
1406  }
1407 
1408  return false;
1409 }
1410 
1411 // Scan through looking for adjacent LDS operations with constant offsets from
1412 // the same base register. We rely on the scheduler to do the hard work of
1413 // clustering nearby loads, and assume these are all adjacent.
1414 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
1415  bool Modified = false;
1416 
1417  // Caches the base registers and constant offset computed for each address.
1418  MemInfoMap Visited;
1419  // Contains the list of instructions for which constant offsets are being
1420  // promoted to the IMM.
1421  SmallPtrSet<MachineInstr *, 4> AnchorList;
1422 
1423  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
1424  MachineInstr &MI = *I;
1425 
1426  if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1427  Modified = true;
1428 
1429  // Don't combine if volatile.
1430  if (MI.hasOrderedMemoryRef()) {
1431  ++I;
1432  continue;
1433  }
1434 
1435  const unsigned Opc = MI.getOpcode();
1436 
1437  CombineInfo CI;
1438  CI.I = I;
1439  CI.InstClass = getInstClass(Opc);
1440 
1441  switch (CI.InstClass) {
1442  default:
1443  break;
1444  case DS_READ:
1445  CI.EltSize =
1446  (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
1447  : 4;
1448  if (findMatchingInst(CI)) {
1449  Modified = true;
1450  I = mergeRead2Pair(CI);
1451  } else {
1452  ++I;
1453  }
1454  continue;
1455  case DS_WRITE:
1456  CI.EltSize =
1457  (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
1458  : 4;
1459  if (findMatchingInst(CI)) {
1460  Modified = true;
1461  I = mergeWrite2Pair(CI);
1462  } else {
1463  ++I;
1464  }
1465  continue;
1466  case S_BUFFER_LOAD_IMM:
1467  CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
1468  if (findMatchingInst(CI)) {
1469  Modified = true;
1470  I = mergeSBufferLoadImmPair(CI);
1471  OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
1472  } else {
1473  ++I;
1474  }
1475  continue;
1476  case BUFFER_LOAD_OFFEN:
1477  case BUFFER_LOAD_OFFSET:
1478  case BUFFER_LOAD_OFFEN_exact:
1479  case BUFFER_LOAD_OFFSET_exact:
1480  CI.EltSize = 4;
1481  if (findMatchingInst(CI)) {
1482  Modified = true;
1483  I = mergeBufferLoadPair(CI);
1484  OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1485  } else {
1486  ++I;
1487  }
1488  continue;
1489  case BUFFER_STORE_OFFEN:
1490  case BUFFER_STORE_OFFSET:
1491  case BUFFER_STORE_OFFEN_exact:
1492  case BUFFER_STORE_OFFSET_exact:
1493  CI.EltSize = 4;
1494  if (findMatchingInst(CI)) {
1495  Modified = true;
1496  I = mergeBufferStorePair(CI);
1497  OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1498  } else {
1499  ++I;
1500  }
1501  continue;
1502  }
1503 
1504  ++I;
1505  }
1506 
1507  return Modified;
1508 }
1509 
1510 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
1511  if (skipFunction(MF.getFunction()))
1512  return false;
1513 
1514  STM = &MF.getSubtarget<GCNSubtarget>();
1515  if (!STM->loadStoreOptEnabled())
1516  return false;
1517 
1518  TII = STM->getInstrInfo();
1519  TRI = &TII->getRegisterInfo();
1520 
1521  MRI = &MF.getRegInfo();
1522  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1523 
1524  assert(MRI->isSSA() && "Must be run on SSA");
1525 
1526  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
1527 
1528  bool Modified = false;
1529 
1530  for (MachineBasicBlock &MBB : MF) {
1531  do {
1532  OptimizeAgain = false;
1533  Modified |= optimizeBlock(MBB);
1534  } while (OptimizeAgain);
1535  }
1536 
1537  return Modified;
1538 }