#include <unordered_map>

#define DEBUG_TYPE "aarch64-simdinstr-opt"

STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 SIMD instructions optimization pass"

namespace {

struct AArch64SIMDInstrOpt : public MachineFunctionPass {
  static char ID;

  const TargetInstrInfo *TII;
  MachineRegisterInfo *MRI;
  TargetSchedModel SchedModel;

  // Caches the replacement decision for an (opcode, subtarget CPU) pair so
  // that it is computed only once within and across functions.
  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
  // Caches, per subtarget CPU, whether the interleaved-store subpass should
  // be skipped entirely.
  std::unordered_map<std::string, bool> InterlEarlyExit;

  typedef enum { VectorElem, Interleave } Subpass;
  // The instruction identified by OrigOpc is replaced by the sequence of
  // instructions in ReplOpc, whose results live in register class RC.
  struct InstReplInfo {
    unsigned OrigOpc;
    std::vector<unsigned> ReplOpc;
    const TargetRegisterClass RC;
  };
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
                OpcR7, OpcR8, OpcR9, RC) \
  {OpcOrg, \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}

  // The instruction replacement table.
  std::vector<InstReplInfo> IRT = {
    // ST2 instructions.
    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::FPR64RegClass),

    // ST4 instructions.
    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
            AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
            AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
            AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
            AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
            AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
            AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
            AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
  };
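  // Reading the first RuleST2 entry above: one "st2 {v0.2d, v1.2d}, [addr]"
  // becomes "zip1 v2.2d, v0.2d, v1.2d", "zip2 v3.2d, v0.2d, v1.2d" and
  // "stp q2, q3, [addr]", with the intermediate values in FPR128 registers.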
  // A costly instruction is replaced by at most MaxNumRepl cheaper
  // instructions.
  static const unsigned MaxNumRepl = 10;

  AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
    initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
  }
  /// Check whether an equivalent DUP instruction has already been produced
  /// by a previous replacement; if so, return its destination register.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;
  /// Extract the source registers of an ST2/ST4 from the REG_SEQUENCE
  /// instruction that defines them.
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned *StReg,
                         unsigned *StRegKill, unsigned NumArg) const;

  bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                         SmallVectorImpl<const MCInstrDesc *> &InstDescRepl);
  bool shouldExitEarly(MachineFunction *MF, Subpass SP);
  bool optimizeVectElement(MachineInstr &MI);
  bool optimizeLdStInterleave(MachineInstr &MI);
  unsigned determineSrcReg(MachineInstr &MI) const;
  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};

char AArch64SIMDInstrOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
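/// Based only on the latency of instructions, determine whether it is cost
/// efficient to replace the instruction InstDesc by the instructions stored
/// in the vector InstDescRepl. Return true if replacement is expected to be
/// faster.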
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc *> &InstDescRepl) {
  // If the replacement decision for this instruction on this subtarget is
  // already cached, return it.
  std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end())
    return SIMDInstrTable[InstID];
  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);

  // If the subtarget does not fully define the scheduling resources of the
  // original instruction, do not replace it.
  if (!SCDesc->isValid() || SCDesc->isVariant()) {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  // Likewise for each replacement instruction.
  for (auto IDesc : InstDescRepl) {
    const MCSchedClassDesc *SCDescRepl =
        SchedModel.getMCSchedModel()->getSchedClassDesc(
            IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }
  // Replacement cost: the sum of the latencies of the replacement
  // instructions.
  unsigned ReplCost = 0;
  for (auto IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());
  // Replace only when the original instruction is strictly slower than the
  // whole replacement sequence, and cache the decision either way.
  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
    SIMDInstrTable[InstID] = true;
    return true;
  } else {
    SIMDInstrTable[InstID] = false;
    return false;
  }
}
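// Worked example with made-up latencies (not taken from any real schedule
// model): if FMLAv4i32_indexed costs 10 cycles on the current CPU while
// DUPv4i32lane (3) plus FMLAv4f32 (4) give ReplCost = 7, the rewrite is
// accepted; if the original costs only 4 cycles, it is rejected.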
/// Determine whether the subpass SP can be skipped entirely for the current
/// subtarget, so that no compile time is spent on targets that do not need
/// these rewrites.
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc *OriginalMCID;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  switch (SP) {
  // For the vector-element subpass, compare one representative instruction
  // against its DUP-plus-vector-form replacement.
  case VectorElem:
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    break;
  // For the interleaved-store subpass, check every rule in the table; the
  // verdict is cached per subtarget CPU.
  case Interleave:
    std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
    if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end())
      return InterlEarlyExit[Subtarget];
    for (auto &I : IRT) {
      OriginalMCID = &TII->get(I.OrigOpc);
      for (auto &Repl : I.ReplOpc)
        ReplInstrMCID.push_back(&TII->get(Repl));
      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
        InterlEarlyExit[Subtarget] = false;
        return false;
      }
      ReplInstrMCID.clear();
    }
    InterlEarlyExit[Subtarget] = true;
    break;
  }

  return true;
}
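/// Check whether an equivalent DUP instruction has already been produced by
/// an earlier replacement; the existing DUP must appear before the current
/// instruction MI in the same basic block so its result can be reused.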
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
  // Walk backwards from MI to the start of the basic block, looking for a
  // DUP of the same source register and lane number.
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    MII--;
    MachineInstr *CurrentMI = &*MII;

    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }

  return false;
}
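/// Certain SIMD instructions with a vector-element operand are not efficient
/// on some cores, so rewrite them into a DUP of the lane followed by the
/// plain vector form of the arithmetic instruction. For example:
///   fmla v0.4s, v1.4s, v2.s[1]
/// is rewritten into:
///   dup  v3.4s, v2.s[1]
///   fmla v0.4s, v1.4s, v3.4s
/// Return true if the SIMD instruction is modified.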
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions.
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions.
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions: the operands live in 64-bit registers.
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }
  // Gather the operands of the arithmetic instruction and its insertion
  // point.
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  unsigned MulDest = MI.getOperand(0).getReg();
  unsigned SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  unsigned SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 or 5 operands.
  if (MI.getNumOperands() == 5) {
    unsigned SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();
    // Create a new DUP instruction unless an equivalent one created by a
    // previous replacement can be reused.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI->createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI->createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  } else {
    return false;
  }

  ++NumModifiedInstr;
  return true;
}
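/// Load/store interleaving instructions are not always beneficial, so replace
/// them by ZIP instructions and a classical store. For example:
///   st2 {v0.4s, v1.4s}, addr
/// is rewritten into:
///   zip1 v2.4s, v0.4s, v1.4s
///   zip2 v3.4s, v0.4s, v1.4s
///   stp  q2, q3, addr
/// Return true if the SIMD instruction is modified.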
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
  unsigned SeqReg, AddrReg;
  unsigned StReg[4], StRegKill[4];
  MachineInstr *DefiningMI;
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  // If the current instruction matches one of the rewriting rules, gather
  // the parameters of the replacement instructions.
  bool Match = false;
  for (auto &I : IRT) {
    if (MI.getOpcode() != I.OrigOpc)
      continue;

    SeqReg = MI.getOperand(0).getReg();
    AddrReg = MI.getOperand(1).getReg();
    DefiningMI = MRI->getUniqueVRegDef(SeqReg);
    unsigned NumReg = determineSrcReg(MI);
    if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
      return false;

    for (auto &Repl : I.ReplOpc) {
      ReplInstrMCID.push_back(&TII->get(Repl));
      // Destination registers are needed for the ZIPs only; the stores do
      // not define a register.
      if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
        ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
    }
    Match = true;
    break;
  }
  if (!Match)
    return false;

  // Rewrite only when the schedule model says the replacement is cheaper.
  if (!shouldReplaceInst(MI.getParent()->getParent(),
                         &TII->get(MI.getOpcode()), ReplInstrMCID))
    return false;

  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode for load/store interleave rewrite");

  // ST2 rewrite: two ZIPs feeding one STP.
  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    // ZIP instructions; kill flags go on the last use of each source.
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0])
        .addReg(StReg[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    // STP instruction.
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
        .addReg(ZipDest[0])
        .addReg(ZipDest[1])
        .addReg(AddrReg)
        .addImm(0);
    break;
  // ST4 rewrite: two layers of ZIPs feeding two STPs.
  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    // First layer of ZIP instructions.
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0])
        .addReg(StReg[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
        .addReg(StReg[1])
        .addReg(StReg[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    // Second layer of ZIP instructions.
    BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    // STP instructions; the second one stores at an offset of two registers.
    BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
        .addReg(ZipDest[4])
        .addReg(ZipDest[5])
        .addReg(AddrReg)
        .addImm(0);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
        .addReg(ZipDest[6])
        .addReg(ZipDest[7])
        .addReg(AddrReg)
        .addImm(2);
    break;
  }

  ++NumModifiedInstr;
  return true;
}
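/// Process the REG_SEQUENCE instruction that defines the source registers of
/// an ST2/ST4 and extract those registers from it. An example of such an
/// instruction:
///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1
/// Return true when the instruction is processed successfully.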
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
    unsigned *StReg, unsigned *StRegKill, unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;

  for (unsigned i = 0; i < NumArg; i++) {
    StReg[i] = DefiningMI->getOperand(2 * i + 1).getReg();
    StRegKill[i] = getKillRegState(DefiningMI->getOperand(2 * i + 1).isKill());
    // (validation of the accompanying subregister-index operands elided)
  }
  return true;
}
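/// Return the number of useful source registers for this instruction:
/// 2 for an ST2 and 4 for an ST4.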
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}
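// Driver: run the two subpasses (vector-element rewriting, then load/store
// interleave rewriting) over the function unless the subtarget lets us exit
// early; rewritten instructions are collected and erased afterwards so that
// iteration over each block stays valid.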
bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = MF.getSubtarget().getInstrInfo();
  MRI = &MF.getRegInfo();
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  bool Changed = false;
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
      SmallVector<MachineInstr *, 8> RemoveMIs;
      for (MachineBasicBlock &MBB : MF) {
        for (MachineInstr &MI : MBB) {
          bool InstRewrite;
          if (OptimizationKind == VectorElem)
            InstRewrite = optimizeVectElement(MI);
          else
            InstRewrite = optimizeLdStInterleave(MI);
          if (InstRewrite) {
            // MI has been replaced; defer its removal so that iteration over
            // the block remains valid.
            RemoveMIs.push_back(&MI);
            Changed = true;
          }
        }
      }
      for (MachineInstr *MI : RemoveMIs)
        MI->eraseFromParent();
    }
  }

  return Changed;
}
/// Returns an instance of the high cost ASIMD instruction replacement
/// optimization pass.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOpt();
}
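// For context (a sketch, not part of this file): the AArch64 target machine
// is expected to add this pass to its pre-RA machine pass pipeline, roughly
// as below; the exact guard (optimization level, command-line flag) lives in
// AArch64TargetMachine.cpp and is an assumption here.
//
//   addPass(createAArch64SIMDInstrOptPass());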