#define DEBUG_TYPE "si-load-store-opt"

// ...

  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,

// ...

  struct BaseRegisters {
    // ...
    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };
  static bool offsetsCanBeCombined(CombineInfo &CI);
  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
  static unsigned getNewOpcode(const CombineInfo &CI);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
  // ...
  unsigned getRegs(unsigned Opc);
  // ...
  bool findMatchingInst(CombineInfo &CI);
  // ...
  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  // ...
  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  // ...
  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
  // ...
  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

// ...

                "SI Load Store Optimizer", false, false)

// ...

char SILoadStoreOptimizer::ID = 0;
// ...
  return new SILoadStoreOptimizer();
// ...
    MI->removeFromParent();
// ...
      else if (Op.readsReg() &&
// ...
  return !(A->mayStore() || B->mayStore()) ||
// ...
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         // ...
          PhysRegUses.count(Use.getReg())))) {
// ...
    if (!InstToMove->mayLoadOrStore())
// ...
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // The two instructions must not access the very same address.
  if (CI.Offset0 == CI.Offset1)
    // ...

  // Both offsets must be multiples of the element size.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    // ...

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;

  // For buffer and scalar loads/stores the two accesses simply have to be
  // adjacent and carry matching cache-policy bits.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width0 == EltOffset1 ||
            EltOffset1 + CI.Width1 == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // DS read2/write2: try the ST64 forms first, which encode the offsets in
  // units of 64 elements.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      // ...
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    // ...

    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    // ...

  // Otherwise pull a common base offset out so that the remaining
  // per-instruction offsets fit in the 8-bit offset fields.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
  // ...
  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    // ...

    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    // ...
}
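
// Illustrative sketch (not part of the pass): the offset arithmetic above on
// concrete numbers. The names below (EncodedDsPair, tryEncodeDsPair) are
// invented for the example; only the arithmetic mirrors offsetsCanBeCombined.
#include <algorithm>
#include <cstdint>
#include <cstdio>

namespace {
struct EncodedDsPair {
  bool UseST64;     // use the read2st64/write2st64 form
  uint32_t BaseOff; // byte offset folded into the base address
  uint8_t Offset0;  // 8-bit encoded offset, in (64x) element units
  uint8_t Offset1;
};

// EltSize is 4 for the B32 variants and 8 for the B64 variants.
bool tryEncodeDsPair(uint32_t Off0, uint32_t Off1, uint32_t EltSize,
                     EncodedDsPair &Out) {
  if (Off0 == Off1 || Off0 % EltSize || Off1 % EltSize)
    return false;
  uint32_t Elt0 = Off0 / EltSize, Elt1 = Off1 / EltSize;

  // ST64 form first: element offsets that are multiples of 64 and fit in
  // 8 bits after dividing by 64.
  if (Elt0 % 64 == 0 && Elt1 % 64 == 0 && Elt0 / 64 < 256 && Elt1 / 64 < 256) {
    Out = {true, 0, uint8_t(Elt0 / 64), uint8_t(Elt1 / 64)};
    return true;
  }
  // Plain form: both element offsets already fit in 8 bits.
  if (Elt0 < 256 && Elt1 < 256) {
    Out = {false, 0, uint8_t(Elt0), uint8_t(Elt1)};
    return true;
  }
  // Otherwise fold the smaller byte offset into the base register first.
  uint32_t Base = std::min(Off0, Off1);
  uint32_t R0 = Elt0 - Base / EltSize;
  uint32_t R1 = Elt1 - Base / EltSize;
  uint32_t Diff = R0 > R1 ? R0 - R1 : R1 - R0;
  if (Diff % 64 == 0 && Diff / 64 < 256) {
    Out = {true, Base, uint8_t(R0 / 64), uint8_t(R1 / 64)};
    return true;
  }
  if (Diff < 256) {
    Out = {false, Base, uint8_t(R0), uint8_t(R1)};
    return true;
  }
  return false;
}
} // namespace

int main() {
  EncodedDsPair E;
  // Two ds_read_b64 at byte offsets 8 and 4104: too far apart for the plain
  // 8-bit element offsets, but after subtracting a base of 8 they are 0 and
  // 512 elements apart, which the ST64 form encodes as 0 and 8.
  if (tryEncodeDsPair(8, 4104, /*EltSize=*/8, E))
    std::printf("st64=%d base=%u off0=%u off1=%u\n", E.UseST64 ? 1 : 0,
                E.BaseOff, unsigned(E.Offset0), unsigned(E.Offset1));
  return 0;
}
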
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI) {
  const unsigned Width = (CI.Width0 + CI.Width1);
  switch (CI.InstClass) {
  // ...
  case S_BUFFER_LOAD_IMM:
    // ...
  }
}

unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
  // ...
  if (TII->isMUBUF(MI)) {
    // ...
  }
  // ...
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    // ...
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    // ...
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    // ...
}
InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
  if (TII->isMUBUF(Opc)) {
    // ...
    if (baseOpcode == -1) {
      // ...
    }
    // ...
    switch (baseOpcode) {
    // ...
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      return BUFFER_LOAD_OFFEN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      return BUFFER_LOAD_OFFSET;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      return BUFFER_STORE_OFFEN;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      return BUFFER_STORE_OFFSET;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      return BUFFER_LOAD_OFFEN_exact;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      return BUFFER_LOAD_OFFSET_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      return BUFFER_STORE_OFFEN_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      return BUFFER_STORE_OFFSET_exact;
    }
  }
  // ...
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
    // ...
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    // ...
}
unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
  if (TII->isMUBUF(Opc)) {
    // ...
  }
  // ...
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    // ...
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    // ...
}
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  // ...
  const unsigned Opc = CI.I->getOpcode();
  // ...
  const unsigned Regs = getRegs(Opc);

  // Collect the address operands of the first instruction; a pairing
  // candidate must use identical address registers.
  unsigned AddrOpName[5] = {0};
  // ...
  unsigned NumAddresses = 0;
  // ...
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
  // ...
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
  // ...
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
  // ...
  if (Regs & SOFFSET) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
  }
  // ...
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
  // ...
  for (unsigned i = 0; i < NumAddresses; i++) {
    // ...
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
    // ...
    if (AddrReg[i]->isReg() &&
        // ...
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      // ...
  }

  // Scan forward for a mergeable instruction of the same class.
  // ...
  for (; MBBI != E; ++MBBI) {
    const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
    // ...
    if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
        (IsDS && (MBBI->getOpcode() != Opc))) {
      // ...
    }
    // ...
    if (MBBI->hasUnmodeledSideEffects()) {
      // ...
    }
    // ...
    if (MBBI->mayLoadOrStore() &&
        // ...
      CI.InstsToMove.push_back(&*MBBI);
      // ...
    }
    // ...
    if (MBBI->hasOrderedMemoryRef())
      // ...

    // Check that both instructions use the same address operands.
    for (unsigned i = 0; i < NumAddresses; i++) {
      // ...
      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            // ...
      }
      // ...
    }
    // ...
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Width0 = getOpcodeWidth(*CI.I);
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Width1 = getOpcodeWidth(*MBBI);
      // ...
      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      // ...
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }
      // ...
      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
        // ...
  }
  // ...
}
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  // ...
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  // ...
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}
// ...
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
  // ...
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  // ...
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
  // ...
  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  // ...
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // ...
  }
  // ...
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
  // ...
      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  // ...
  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  // ...
    // If a common base offset was factored out, materialize it and add it to
    // the original address register.
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        // ...
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    // ...
    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        // ...
        .addReg(AddrReg->getReg(), 0, BaseSubReg);
  // ...
  // Emit the merged read2 and copy the two halves into the original
  // destination registers.
  BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
      .addReg(BaseReg, BaseRegFlags, BaseSubReg)
      // ...
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      // ...
      .addReg(DestReg, 0, SubRegIdx0);
  // ...
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  // ...
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  // ...
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  // ...
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
// ...
SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
  // ...
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  // ...
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  // ...
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  // ...
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // ...
  }
  // ...
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
  // ...
  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  // ...
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        // ...
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    // ...
    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        // ...
        .addReg(AddrReg->getReg(), 0, BaseSubReg);
  // ...
  BuildMI(*MBB, CI.Paired, DL, Write2Desc)
      .addReg(BaseReg, BaseRegFlags, BaseSubReg)
      // ...
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  // ...
// ...
SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
  // ...
  const unsigned Opcode = getNewOpcode(CI);
  // ...
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset)
      // ...
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
  // ...
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
  // ...
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      // ...
      .addReg(DestReg, 0, SubRegIdx0);
  // ...
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  // ...
// ...
SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
  // ...
  const unsigned Opcode = getNewOpcode(CI);
  // ...
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
  // ...
  const unsigned Regs = getRegs(Opcode);
  // ...
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
  // ...
  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      // ...
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
  // ...
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
  // ...
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      // ...
      .addReg(DestReg, 0, SubRegIdx0);
  // ...
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  // ...
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
  const unsigned Width = CI.Width0 + CI.Width1;

  switch (CI.InstClass) {
  // ...
  case S_BUFFER_LOAD_IMM:
    // ...
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    // ...
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    // ...
  }
}
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
  // Returns (sub-register for CI.I's value, sub-register for CI.Paired's
  // value) within the merged register; the widths noted below are in dwords.
  if (CI.Offset0 > CI.Offset1) {
    // CI.I is the access at the higher offset, so it takes the high part.
    // ...
      return std::make_pair(0, 0);
      // ...
        return std::make_pair(0, 0);
        // ...
        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);           // 1 + 1
        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);      // 1 + 2
        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2); // 1 + 3
      // ...
        return std::make_pair(0, 0);
        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);      // 2 + 1
        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1); // 2 + 2
      // ...
        return std::make_pair(0, 0);
        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0); // 3 + 1
    // ...
  }
  // Otherwise CI.I is the access at the lower offset and takes the low part.
  // ...
    return std::make_pair(0, 0);
    // ...
      return std::make_pair(0, 0);
      return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);             // 1 + 1
      return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);        // 1 + 2
      return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);   // 1 + 3
    // ...
    switch (CI.Width1) {
    // ...
      return std::make_pair(0, 0);
      return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);        // 2 + 1
      return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);   // 2 + 2
    // ...
    switch (CI.Width1) {
    // ...
      return std::make_pair(0, 0);
      return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);   // 3 + 1
    // ...
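
// Illustrative sketch (not part of the pass): the table above just assigns
// the low dwords of the merged result to whichever input has the smaller
// offset and the remaining high dwords to the other one. The helper below
// rebuilds the sub-register names that way; its names (subRegName, Pairs) are
// invented for the example.
#include <cstdio>
#include <string>

namespace {
// Build a name like "sub1_sub2" covering dwords [First, First + Count).
std::string subRegName(unsigned First, unsigned Count) {
  std::string Name;
  for (unsigned i = 0; i < Count; ++i)
    Name += (i ? "_sub" : "sub") + std::to_string(First + i);
  return Name;
}
} // namespace

int main() {
  // (Width0, Width1) in dwords, with Offset0 < Offset1 so CI.I takes the low
  // part; when Offset0 > Offset1 the two names are simply swapped.
  const unsigned Pairs[][2] = {{1, 1}, {1, 2}, {2, 2}, {3, 1}};
  for (const auto &P : Pairs)
    std::printf("width %u + %u -> (%s, %s)\n", P[0], P[1],
                subRegName(0, P[0]).c_str(), subRegName(P[0], P[1]).c_str());
  // Prints e.g. "width 1 + 2 -> (sub0, sub1_sub2)", matching the case above.
  return 0;
}
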
// ...
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width0 + CI.Width1) {
    // ...
      return &AMDGPU::SReg_64_XEXECRegClass;
    // ...
      return &AMDGPU::SReg_128RegClass;
    // ...
      return &AMDGPU::SReg_256RegClass;
    // ...
      return &AMDGPU::SReg_512RegClass;
    }
  }
  // ...
  switch (CI.Width0 + CI.Width1) {
  // ...
    return &AMDGPU::VReg_64RegClass;
  // ...
    return &AMDGPU::VReg_96RegClass;
  // ...
    return &AMDGPU::VReg_128RegClass;
  }
}
// ...
SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
  // ...
  const unsigned Opcode = getNewOpcode(CI);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
  // ...
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
  // ...
  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      // ...
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
      // ...
  const unsigned Regs = getRegs(Opcode);
  // ...
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
  // ...
  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1))
      // ...
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
  // ...
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  // ...
// ...
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    // ...

  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  // ...
      TII->get(AMDGPU::S_MOV_B32), Reg)
      // ...
unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) {
  // ...
  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  // Materialize the two 32-bit halves of the constant offset and add them to
  // the low/high halves of the base, threading the carry between the adds.
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  unsigned CarryReg =
      MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
  unsigned DeadCarryReg =
      MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  // ...
  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
      // ...
      .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
      // ...
  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
      // ...
      .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
      // ...
  // Reassemble the 64-bit result from the two halves.
  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  // ...
  BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      // ...
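
// Illustrative sketch (not part of the pass): what the V_ADD_I32_e64 /
// V_ADDC_U32_e64 pair built above computes. The 64-bit base is split into two
// 32-bit halves; the low add produces a carry that feeds the high add, and a
// REG_SEQUENCE packs the halves back together. Names here are invented for
// the example.
#include <cassert>
#include <cstdint>

namespace {
uint64_t add64ViaCarry(uint32_t LoReg, uint32_t HiReg, uint64_t Offset) {
  uint32_t OffLo = uint32_t(Offset);       // OffsetLo operand
  uint32_t OffHi = uint32_t(Offset >> 32); // OffsetHi operand

  uint32_t DestSub0 = LoReg + OffLo;         // V_ADD_I32_e64
  uint32_t Carry = DestSub0 < LoReg ? 1 : 0; // written to CarryReg
  uint32_t DestSub1 = HiReg + OffHi + Carry; // V_ADDC_U32_e64

  // REG_SEQUENCE: pack the two halves back into a 64-bit value.
  return (uint64_t(DestSub1) << 32) | DestSub0;
}
} // namespace

int main() {
  uint64_t Base = 0x00000001ffffff00ULL;
  assert(add64ViaCarry(uint32_t(Base), uint32_t(Base >> 32), 0x200) ==
         Base + 0x200); // carries out of the low half into the high half
  return 0;
}
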
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               unsigned NewBase,
                                               int32_t NewOffset) {
  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

// ...
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      // ...

// ...
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      // ...

// ...
  // The base must have been built as a 64-bit add: a V_ADD_I32_e64 for the
  // low half and a V_ADDC_U32_e64 consuming its carry for the high half.
  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    // ...

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  // ...
    if (!(Offset0P = extractConstOffset(*Src1)))
      // ...

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
  // ...
  uint64_t Offset1 = Src1->getImm();
  // ...
  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    // ...
    MemInfoMap &Visited,
    // ...
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
    // ...

  // Skip instructions that were already chosen as an anchor or that already
  // carry an immediate offset.
  if (AnchorList.count(&MI))
    // ...

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    // ...
  }

  // Decompose the address into a 64-bit base and a constant offset.
  // ...
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    // ...
  }
  // ...
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Scan the rest of the block for accesses that share the same base and pick
  // the one farthest away whose distance still fits the immediate field; that
  // instruction becomes the anchor the others are re-based against.
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  // ...
  for ( ; MBBI != E; ++MBBI) {
    // ...
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      // ...
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      // ...

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
    // ...
    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    // ...
      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    // ...
  }
  // ...
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    // ...
                      << AnchorAddr.Offset << "\n\n");

    // Compute the anchor's full address once, then rewrite the current
    // instruction (and any other user of the same base whose new offset also
    // fits) as anchor + immediate.
    unsigned Base = computeBase(MI, AnchorAddr);
    // ...
    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    // ...
    for (auto P : InstsWCommonBase) {
      // ...
      AM.BaseOffs = P.second - AnchorAddr.Offset;
      // ...
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
      // ...
    }
    // ...
    AnchorList.insert(AnchorInst);
  // ...
}
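
// Illustrative sketch (not part of the pass): the anchor selection above on
// plain data. Given several accesses off the same 64-bit base, pick the one
// farthest from the current instruction whose distance still fits the
// immediate offset field, compute its address once, and re-express the others
// relative to it. fitsImmField stands in for the addressing-mode legality
// check the pass performs, and its range here is an assumption; the names are
// invented for the example.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

namespace {
bool fitsImmField(int64_t Off) { return Off >= -4096 && Off < 4096; } // assumed

// Offsets of instructions that share the base; Cur is the one being rewritten.
void promoteOffsets(int64_t Cur, const std::vector<int64_t> &Others) {
  int64_t AnchorOff = 0;
  uint64_t MaxDist = 0;
  bool Found = false;
  for (int64_t Off : Others) {
    uint64_t Dist = std::llabs(Cur - Off);
    if (fitsImmField(Cur - Off) && Dist > MaxDist) {
      MaxDist = Dist;
      AnchorOff = Off;
      Found = true;
    }
  }
  if (!Found)
    return;
  std::printf("anchor at base+%lld, current becomes anchor+%lld\n",
              (long long)AnchorOff, (long long)(Cur - AnchorOff));
  for (int64_t Off : Others)
    if (fitsImmField(Off - AnchorOff))
      std::printf("  base+%lld becomes anchor+%lld\n", (long long)Off,
                  (long long)(Off - AnchorOff));
}
} // namespace

int main() {
  // Offsets too large to fold into the instructions directly, but close
  // enough to each other to share one materialized anchor address.
  promoteOffsets(/*Cur=*/65536, {65544, 65552, 69632});
  return 0;
}
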
// ...
  if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
    // ...

  CI.InstClass = getInstClass(Opc);
  // ...
  switch (CI.InstClass) {
  // ...
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
        // ...
    if (findMatchingInst(CI)) {
      // ...
      I = mergeRead2Pair(CI);
    }
    // ...
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
        // ...
    if (findMatchingInst(CI)) {
      // ...
      I = mergeWrite2Pair(CI);
    }
    // ...
  case S_BUFFER_LOAD_IMM:
    // ...
    if (findMatchingInst(CI)) {
      // ...
      I = mergeSBufferLoadImmPair(CI);
      OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
    }
    // ...
  case BUFFER_LOAD_OFFEN:
  case BUFFER_LOAD_OFFSET:
  case BUFFER_LOAD_OFFEN_exact:
  case BUFFER_LOAD_OFFSET_exact:
    // ...
    if (findMatchingInst(CI)) {
      // ...
      I = mergeBufferLoadPair(CI);
      OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
    }
    // ...
  case BUFFER_STORE_OFFEN:
  case BUFFER_STORE_OFFSET:
  case BUFFER_STORE_OFFEN_exact:
  case BUFFER_STORE_OFFSET_exact:
    // ...
    if (findMatchingInst(CI)) {
      // ...
      I = mergeBufferStorePair(CI);
      OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
    }
    // ...
  }
  // ...
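
// Illustrative sketch (not part of the pass): why OptimizeAgain re-runs the
// block. Each round only merges pairs, so four 1-dword s_buffer loads need
// two rounds to become one 4-dword load. The 16-dword cap from the
// S_BUFFER_LOAD_IMM case above is the only constraint modelled here; the real
// pass also checks offsets, cache bits, and aliasing. Names are invented for
// the example.
#include <cstdio>
#include <vector>

namespace {
// One optimizeBlock-style round: greedily merge adjacent loads pairwise.
bool mergeRound(std::vector<unsigned> &Widths, bool &OptimizeAgain) {
  bool Modified = false;
  OptimizeAgain = false;
  std::vector<unsigned> Out;
  for (size_t i = 0; i < Widths.size();) {
    if (i + 1 < Widths.size() && Widths[i] + Widths[i + 1] <= 16) {
      Out.push_back(Widths[i] + Widths[i + 1]);
      OptimizeAgain |= Out.back() < 16; // a wider merge may still be possible
      Modified = true;
      i += 2;
    } else {
      Out.push_back(Widths[i]);
      i += 1;
    }
  }
  Widths = Out;
  return Modified;
}
} // namespace

int main() {
  std::vector<unsigned> Widths(4, 1); // four adjacent 1-dword loads
  bool OptimizeAgain = false;
  do {
    mergeRound(Widths, OptimizeAgain);
    std::printf("after round:");
    for (unsigned W : Widths)
      std::printf(" %u", W);
    std::printf("\n");
  } while (OptimizeAgain);
  // Prints "2 2", then "4", then "4" (the last round finds nothing to merge).
  return 0;
}
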
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  // ...
  TRI = &TII->getRegisterInfo();
  // ...
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");
  // ...
    do {
      OptimizeAgain = false;
      Modified |= optimizeBlock(MBB);
    } while (OptimizeAgain);
  // ...
}