43 #include "llvm/Config/llvm-config.h" 53 #include <unordered_map> 57 #define DEBUG_TYPE "si-peephole-sdwa" 59 STATISTIC(NumSDWAPatternsFound,
"Number of SDWA patterns found.");
61 "Number of instruction converted to SDWA.");
77 std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
78 std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
96 bool convertToSDWA(
MachineInstr &
MI,
const SDWAOperandsVector &SDWAOperands);
99 StringRef getPassName()
const override {
return "SI Peephole SDWA"; }
114 : Target(TargetOp), Replaced(ReplacedOp) {
119 virtual ~SDWAOperand() =
default;
129 return &getParentInst()->getParent()->getParent()->getRegInfo();
132 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 140 class SDWASrcOperand :
public SDWAOperand {
149 SdwaSel SrcSel_ =
DWORD,
bool Abs_ =
false,
bool Neg_ =
false,
151 : SDWAOperand(TargetOp, ReplacedOp),
152 SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
157 SdwaSel getSrcSel()
const {
return SrcSel; }
158 bool getAbs()
const {
return Abs; }
159 bool getNeg()
const {
return Neg; }
160 bool getSext()
const {
return Sext; }
165 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 170 class SDWADstOperand :
public SDWAOperand {
179 : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
184 SdwaSel getDstSel()
const {
return DstSel; }
185 DstUnused getDstUnused()
const {
return DstUn; }
187 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 192 class SDWADstPreserveOperand :
public SDWADstOperand {
200 Preserve(PreserveOp) {}
206 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 215 char SIPeepholeSDWA::ID = 0;
220 return new SIPeepholeSDWA();
224 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 227 case BYTE_0: OS <<
"BYTE_0";
break;
228 case BYTE_1: OS <<
"BYTE_1";
break;
229 case BYTE_2: OS <<
"BYTE_2";
break;
230 case BYTE_3: OS <<
"BYTE_3";
break;
231 case WORD_0: OS <<
"WORD_0";
break;
232 case WORD_1: OS <<
"WORD_1";
break;
233 case DWORD: OS <<
"DWORD";
break;
254 OS <<
"SDWA src: " << *getTargetOperand()
255 <<
" src_sel:" << getSrcSel()
256 <<
" abs:" << getAbs() <<
" neg:" << getNeg()
257 <<
" sext:" << getSext() <<
'\n';
262 OS <<
"SDWA dst: " << *getTargetOperand()
263 <<
" dst_sel:" << getDstSel()
264 <<
" dst_unused:" << getDstUnused() <<
'\n';
269 OS <<
"SDWA preserve dst: " << *getTargetOperand()
270 <<
" dst_sel:" << getDstSel()
271 <<
" preserve:" << *getPreservedOperand() <<
'\n';
289 return LHS.
isReg() &&
326 for (
auto &DefMO : DefInstr->
defs()) {
327 if (DefMO.isReg() && DefMO.getReg() == Reg->
getReg())
335 uint64_t SDWASrcOperand::getSrcMods(
const SIInstrInfo *TII,
341 Mods =
Mod->getImm();
345 Mods =
Mod->getImm();
350 "Float and integer src modifiers can't be set simulteniously");
373 bool IsPreserveSrc =
false;
379 if (!
isSameReg(*Src, *getReplacedOperand())) {
386 !
isSameReg(*Src, *getReplacedOperand())) {
407 IsPreserveSrc =
true;
409 AMDGPU::OpName::vdst);
422 if ((MI.
getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
423 MI.
getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
424 !
isSameReg(*Src, *getReplacedOperand())) {
431 (IsPreserveSrc || (SrcSel && SrcMods)));
434 if (!IsPreserveSrc) {
435 SrcSel->
setImm(getSrcSel());
436 SrcMods->
setImm(getSrcMods(TII, Src));
438 getTargetOperand()->setIsKill(
false);
454 if (&UseInst != ParentMI)
464 if ((MI.
getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
465 MI.
getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
474 isSameReg(*Operand, *getReplacedOperand()));
478 DstSel->
setImm(getDstSel());
481 DstUnused->
setImm(getDstUnused());
485 getParentInst()->eraseFromParent();
489 bool SDWADstPreserveOperand::convertToSDWA(
MachineInstr &MI,
497 getMRI()->clearKillFlags(MO.getReg());
503 MBB->insert(getParentInst(), &MI);
509 getPreservedOperand()->getSubReg());
516 return SDWADstOperand::convertToSDWA(MI, TII);
546 std::unique_ptr<SDWAOperand>
550 case AMDGPU::V_LSHRREV_B32_e32:
551 case AMDGPU::V_ASHRREV_I32_e32:
552 case AMDGPU::V_LSHLREV_B32_e32:
553 case AMDGPU::V_LSHRREV_B32_e64:
554 case AMDGPU::V_ASHRREV_I32_e64:
555 case AMDGPU::V_LSHLREV_B32_e64: {
565 auto Imm = foldToImm(*Src0);
569 if (*Imm != 16 && *Imm != 24)
574 if (TRI->isPhysicalRegister(Src1->
getReg()) ||
575 TRI->isPhysicalRegister(Dst->
getReg()))
578 if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
579 Opcode == AMDGPU::V_LSHLREV_B32_e64) {
580 return make_unique<SDWADstOperand>(
583 return make_unique<SDWASrcOperand>(
585 Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
586 Opcode != AMDGPU::V_LSHRREV_B32_e64);
591 case AMDGPU::V_LSHRREV_B16_e32:
592 case AMDGPU::V_ASHRREV_I16_e32:
593 case AMDGPU::V_LSHLREV_B16_e32:
594 case AMDGPU::V_LSHRREV_B16_e64:
595 case AMDGPU::V_ASHRREV_I16_e64:
596 case AMDGPU::V_LSHLREV_B16_e64: {
606 auto Imm = foldToImm(*Src0);
607 if (!Imm || *Imm != 8)
613 if (TRI->isPhysicalRegister(Src1->
getReg()) ||
614 TRI->isPhysicalRegister(Dst->
getReg()))
617 if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
618 Opcode == AMDGPU::V_LSHLREV_B16_e64) {
621 return make_unique<SDWASrcOperand>(
622 Src1, Dst,
BYTE_1,
false,
false,
623 Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
624 Opcode != AMDGPU::V_LSHRREV_B16_e64);
629 case AMDGPU::V_BFE_I32:
630 case AMDGPU::V_BFE_U32: {
646 auto Offset = foldToImm(*Src1);
651 auto Width = foldToImm(*Src2);
657 if (*
Offset == 0 && *Width == 8)
659 else if (*
Offset == 0 && *Width == 16)
661 else if (*
Offset == 0 && *Width == 32)
663 else if (*
Offset == 8 && *Width == 8)
665 else if (*
Offset == 16 && *Width == 8)
667 else if (*
Offset == 16 && *Width == 16)
669 else if (*
Offset == 24 && *Width == 8)
677 if (TRI->isPhysicalRegister(Src0->
getReg()) ||
678 TRI->isPhysicalRegister(Dst->
getReg()))
681 return make_unique<SDWASrcOperand>(
682 Src0, Dst, SrcSel,
false,
false, Opcode != AMDGPU::V_BFE_U32);
685 case AMDGPU::V_AND_B32_e32:
686 case AMDGPU::V_AND_B32_e64: {
694 auto Imm = foldToImm(*Src0);
697 Imm = foldToImm(*Src1);
701 if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
706 if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
707 TRI->isPhysicalRegister(Dst->
getReg()))
710 return make_unique<SDWASrcOperand>(
714 case AMDGPU::V_OR_B32_e32:
715 case AMDGPU::V_OR_B32_e64: {
726 auto CheckOROperandsForSDWA =
728 if (!Op1 || !Op1->
isReg() || !Op2 || !Op2->isReg())
729 return CheckRetType(
None);
733 return CheckRetType(
None);
736 if (!TII->
isSDWA(*Op1Inst))
737 return CheckRetType(
None);
741 return CheckRetType(
None);
743 return CheckRetType(std::make_pair(Op1Def, Op2Def));
748 assert(OrSDWA && OrOther);
749 auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
753 assert(OrSDWA && OrOther);
754 Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
761 assert(OrSDWADef && OrOtherDef);
786 if (!TII->
isSDWA(*OtherInst))
794 bool DstSelAgree =
false;
797 (OtherDstSel ==
BYTE_3) ||
801 (OtherDstSel ==
BYTE_1) ||
805 (OtherDstSel ==
BYTE_2) ||
806 (OtherDstSel ==
BYTE_3) ||
810 (OtherDstSel ==
BYTE_2) ||
811 (OtherDstSel ==
BYTE_3) ||
815 (OtherDstSel ==
BYTE_1) ||
816 (OtherDstSel ==
BYTE_3) ||
820 (OtherDstSel ==
BYTE_1) ||
821 (OtherDstSel ==
BYTE_2) ||
824 default: DstSelAgree =
false;
840 return make_unique<SDWADstPreserveOperand>(
841 OrDst, OrSDWADef, OrOtherDef, DstSel);
846 return std::unique_ptr<SDWAOperand>(
nullptr);
851 if (
auto Operand = matchSDWAOperand(MI)) {
852 LLVM_DEBUG(
dbgs() <<
"Match: " << MI <<
"To: " << *Operand <<
'\n');
853 SDWAOperands[&
MI] = std::move(Operand);
854 ++NumSDWAPatternsFound;
877 void SIPeepholeSDWA::pseudoOpConvertToVOP2(
MachineInstr &MI,
880 assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
881 "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");
916 if (
I->modifiesRegister(AMDGPU::VCC, TRI))
934 bool SIPeepholeSDWA::isConvertibleToSDWA(
MachineInstr &MI,
954 if (SDst && SDst->
getReg() != AMDGPU::VCC)
968 if (!ST.
hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
969 Opc == AMDGPU::V_MAC_F32_e32))
973 if (Opc == AMDGPU::V_CNDMASK_B32_e32)
980 const SDWAOperandsVector &SDWAOperands) {
987 if (TII->
isSDWA(Opcode)) {
991 if (SDWAOpcode == -1)
996 const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
1027 SDWAInst.
add(*Src0);
1039 SDWAInst.
add(*Src1);
1042 if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
1043 SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
1047 SDWAInst.
add(*Src2);
1054 SDWAInst.
add(*Clamp);
1063 SDWAInst.
add(*OMod);
1073 SDWAInst.
add(*DstSel);
1083 SDWAInst.
add(*DstUnused);
1093 SDWAInst.
add(*Src0Sel);
1103 SDWAInst.
add(*Src1Sel);
1115 assert(Dst && Dst->isTied());
1116 assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
1119 assert(PreserveDstIdx != -1);
1129 bool Converted =
false;
1130 for (
auto &Operand : SDWAOperands) {
1142 if (PotentialMatches.count(Operand->getParentInst()) == 0)
1143 Converted |= Operand->convertToSDWA(*SDWAInst, TII);
1146 ConvertedInstructions.
push_back(SDWAInst);
1153 ++NumSDWAInstructionsPeepholed;
1161 void SIPeepholeSDWA::legalizeScalarOperands(
MachineInstr &MI,
1164 unsigned ConstantBusCount = 0;
1182 TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
1184 Copy.addImm(Op.
getImm());
1185 else if (Op.
isReg())
1205 bool Changed =
false;
1211 matchSDWAOperands(MBB);
1212 for (
const auto &OperandPair : SDWAOperands) {
1213 const auto &Operand = OperandPair.second;
1214 MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1216 (PotentialMI->
getOpcode() == AMDGPU::V_ADD_I32_e64 ||
1217 PotentialMI->
getOpcode() == AMDGPU::V_SUB_I32_e64))
1218 pseudoOpConvertToVOP2(*PotentialMI, ST);
1220 SDWAOperands.clear();
1223 matchSDWAOperands(MBB);
1225 for (
const auto &OperandPair : SDWAOperands) {
1226 const auto &Operand = OperandPair.second;
1227 MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1228 if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
1229 PotentialMatches[PotentialMI].push_back(Operand.get());
1233 for (
auto &PotentialPair : PotentialMatches) {
1235 convertToSDWA(PotentialMI, PotentialPair.second);
1238 PotentialMatches.clear();
1239 SDWAOperands.clear();
1241 Changed = !ConvertedInstructions.
empty();
1245 while (!ConvertedInstructions.
empty())
1246 legalizeScalarOperands(*ConvertedInstructions.
pop_back_val(),
ST);
const MachineInstrBuilder & add(const MachineOperand &MO) const
Interface definition for SIRegisterInfo.
A common definition of LaneBitmask for use in TableGen and CodeGen.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
AMDGPU specific subclass of TargetSubtarget.
FunctionPass * createSIPeepholeSDWAPass()
This class represents lattice values for constants.
iterator_range< mop_iterator > uses()
Returns a range that includes all operands that are register uses.
void ChangeToRegister(unsigned Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value...
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, unsigned Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before...
void push_back(const T &Elt)
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
iterator_range< use_nodbg_iterator > use_nodbg_operands(unsigned Reg) const
Describe properties that are true of each instruction in the target description file.
unsigned getReg() const
getReg - Returns the register number.
unsigned getOperandNo(const_mop_iterator I) const
Returns the number of the operand iterator I points to.
void setIsUndef(bool Val=true)
unsigned getSubReg() const
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS)
const SIInstrInfo * getInstrInfo() const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
static MachineOperand * findSingleRegDef(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
STATISTIC(NumFunctions, "Total number of functions")
unsigned const TargetRegisterInfo * TRI
void setIsDead(bool Val=true)
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const
unsigned getNumOperands() const
Returns the total number of operands.
static MachineOperand * findSingleRegUse(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
bool hasSDWAOutModsVOPC() const
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
iterator_range< def_iterator > def_operands(unsigned Reg) const
LLVM_READONLY int getSDWAOp(uint16_t Opcode)
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
unsigned const MachineRegisterInfo * MRI
bool isFoldableCopy(const MachineInstr &MI) const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static void copyRegOperand(MachineOperand &To, const MachineOperand &From)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool hasVGPRs(const TargetRegisterClass *RC) const
void initializeSIPeepholeSDWAPass(PassRegistry &)
Register is known to be fully dead.
Represent the analysis usage information of a pass.
iterator_range< mop_iterator > defs()
Returns a range over all explicit operands that are register definitions.
void setImm(int64_t immVal)
FunctionPass class - This class is used to implement most global optimizations.
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
self_iterator getIterator()
iterator_range< mop_iterator > explicit_uses()
void setIsKill(bool Val=true)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
unsigned findTiedOperandIdx(unsigned OpIdx) const
Given the index of a tied register operand, find the operand it is tied to.
BlockVerifier::State From
LLVM_READONLY int getVOPe32(uint16_t Opcode)
MachineOperand class - Representation of each machine instruction operand.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
LLVM_NODISCARD T pop_back_val()
void setPreservesCFG()
This function should be called by the pass, iff they do not:
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
MachineInstr * getUniqueVRegDef(unsigned Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
The access may modify the value stored in memory.
Target - Wrapper for Target specific information.
static unsigned getReg(const void *D, unsigned RC, unsigned RegNo)
bool use_empty(unsigned RegNo) const
use_empty - Return true if there are no instructions using the specified register.
const MachineBasicBlock * getParent() const
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Provides AMDGPU specific target descriptions.
Representation of each machine instruction.
Interface definition for SIInstrInfo.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
bool hasOneUse(unsigned RegNo) const
hasOneUse - Return true if there is exactly one instruction using the specified register.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
LLVM_NODISCARD bool empty() const
static bool isVOPC(const MachineInstr &MI)
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
void setReg(unsigned Reg)
Change the register this operand corresponds to.
void setSubReg(unsigned subReg)
const MachineInstrBuilder & addReg(unsigned RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
raw_ostream & operator<<(raw_ostream &OS, const APInt &I)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool hasSDWAScalar() const
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSDWA(const MachineInstr &MI)
const MCOperandInfo * OpInfo
This class implements an extremely fast bulk output stream that can only output to a stream...
StringRef - Represent a constant reference to a string, i.e.
const MachineOperand & getOperand(unsigned i) const
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(unsigned Reg) const
unsigned createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
const SIRegisterInfo * getRegisterInfo() const override