#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<unsigned> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(0), cl::Hidden);
namespace {

template <typename EnumT>
class enum_iterator
    : public iterator_facade_base<enum_iterator<EnumT>,
                                  std::forward_iterator_tag, const EnumT> {
  EnumT Value;

public:
  enum_iterator() = default;
  enum_iterator(EnumT Value) : Value(Value) {}

  enum_iterator &operator++() {
    Value = static_cast<EnumT>(Value + 1);
    return *this;
  }

  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }

  EnumT operator*() const { return Value; }
};
#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
  return make_range(enum_iterator<InstCounterType>(VM_CNT),
                    enum_iterator<InstCounterType>(NUM_INST_CNTS));
}

using RegInterval = std::pair<signed, signed>;
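// The three hardware counters tracked by this pass are vmcnt (vector memory),
// lgkmcnt (LDS/GDS/scalar-memory/message) and expcnt (exports and GPR locks);
// inst_counter_types() simply walks InstCounterType from VM_CNT up to
// NUM_INST_CNTS. A RegInterval is a half-open [first, second) range of
// scoreboard slots, laid out according to the RegisterMapping enum below.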
static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
    (1 << VMEM_ACCESS),
    (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
        (1 << SQ_MESSAGE),
    (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
        (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
};
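// WaitEventMaskForInst is indexed by InstCounterType: each entry collects the
// WaitEventType bits whose completion is reported through that counter
// (vector-memory accesses via vmcnt; scalar-memory, LDS, GDS and message
// events via lgkmcnt; export and GPR-lock events via expcnt).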
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // An artificial register slot to track LDS writes.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPRs start.
};
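// Scoreboard slots are numbered flatly: VGPRs occupy [0, SQ_MAX_PGM_VGPRS),
// one extra pseudo-register (SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) stands in for
// data written to LDS by buffer-load-to-LDS style operations, and SGPRs start
// at NUM_ALL_VGPRS. For example, a score for SGPR4 would roughly live in slot
// NUM_ALL_VGPRS + 4 = 261.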
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  // A smaller count is a stricter wait, so combining requests takes the min.
  switch (T) {
  case VM_CNT:   Wait.VmCnt = std::min(Wait.VmCnt, Count);     break;
  case EXP_CNT:  Wait.ExpCnt = std::min(Wait.ExpCnt, Count);   break;
  case LGKM_CNT: Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count); break;
  default: llvm_unreachable("bad InstCounterType");
  }
}
// This class keeps track of the current "score brackets" (lower/upper bounds
// of outstanding operations) for each counter, plus a per-register scoreboard.
class WaitcntBrackets {
public:
  WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
    for (auto T : inst_counter_types())
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
  }

  static uint32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:   return HardwareLimits.VmcntMax;
    case LGKM_CNT: return HardwareLimits.LgkmcntMax;
    case EXP_CNT:  return HardwareLimits.ExpcntMax;
    default:       return 0;
    }
  }
  uint32_t getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  uint32_t getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }
  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    if (E == VMEM_ACCESS)
      return VM_CNT;
    if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
      return LGKM_CNT;
    assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
    return EXP_CNT;
  }

  uint32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }
  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    PendingEvents = 0;
    memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
    for (auto T : inst_counter_types())
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    memset(SgprScores, 0, sizeof(SgprScores));
  }
  bool merge(const WaitcntBrackets &Other);

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  bool counterOutOfOrder(InstCounterType T) const;
  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void determineWait(InstCounterType T, uint32_t ScoreToWait,
                     AMDGPU::Waitcnt &Wait) const;
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);
  bool hasPending() const { return PendingEvents != 0; }
  bool hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  void print(raw_ostream &OS);
  void dump() { print(dbgs()); }
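// A FLAT access may be serviced by either the VMEM or the LDS path, and the
// two complete through different counters. setPendingFlat() records the
// current upper bounds so that, while such an access is outstanding, a
// dependent wait on vmcnt or lgkmcnt is conservatively forced to zero in
// determineWait() below (unless the subtarget counts them in order).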
private:
  struct MergeInfo {
    uint32_t OldLB;
    uint32_t OtherLB;
    uint32_t MyShift;
    uint32_t OtherShift;
  };
  static bool mergeScore(const MergeInfo &M, uint32_t &Score,
                         uint32_t OtherScore);
  void setScoreLB(InstCounterType T, uint32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, uint32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
      if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
        ScoreLBs[T] = UB;
    }
  }
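// Note on setScoreUB: expcnt can only represent a limited range
// (HardwareLimits.ExpcntMax), so once the upper bound advances past that range
// the lower bound is pulled forward with it; scores at or below the lower
// bound are treated as already retired.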
  void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, uint32_t Val);
  const GCNSubtarget *ST = nullptr;
  uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
  uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
  uint32_t PendingEvents = 0;
  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
  // Remember the last flat memory operation.
  uint32_t LastFlat[NUM_INST_CNTS] = {0};
  // Track the highest VGPR/SGPR slot in use to keep merges cheap.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  // wait_cnt scores for every vgpr (plus the extra LDS slot).
  uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // wait_cnt scores for every sgpr; only lgkmcnt is relevant here.
  uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};
class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  AMDGPU::IsaVersion IV;

  DenseSet<MachineInstr *> TrackedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  struct BlockInfo {
    MachineBasicBlock *MBB;
    std::unique_ptr<WaitcntBrackets> Incoming;
    bool Dirty = true;

    explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
  };

  std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
  DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;

  // Force all waitcnt instrs to be emitted as s_waitcnt 0 (from the
  // amdgpu-waitcnt-forcezero flag).
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }
  bool isForceEmitWaitcnt() const {
    for (auto T : inst_counter_types())
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, get the debug counter info and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }
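// The ForceEmit* machinery above is purely a debugging aid. A sketch of how it
// might be driven from the command line (flag and counter names come from the
// definitions at the top of this file; the exact values are only an
// illustration):
//   llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-forcezero=1 in.ll
//   llc -mtriple=amdgcn -debug-counter=si-insert-waitcnts-forceexp-count=8 in.ll
// The first forces every wait to vmcnt(0) expcnt(0) lgkmcnt(0); the second
// forces expcnt(0) waits for roughly the first eight opportunities only.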
  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  bool generateWaitcntInstBefore(MachineInstr &MI,
                                 WaitcntBrackets &ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               WaitcntBrackets *ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
};

} // end anonymous namespace
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                            const SIInstrInfo *TII,
                                            const MachineRegisterInfo *MRI,
                                            const SIRegisterInfo *TRI,
                                            unsigned OpNo, bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  RegInterval Result;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(*MRI, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  } else {
    // TODO: Handle TTMP registers.
    return {-1, -1};
  }

  const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}
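// getRegInterval reports the scoreboard slots covered by operand OpNo as a
// half-open range: a 32-bit VGPR yields a width-1 interval, a 64-bit register
// pair a width-2 interval, and so on (Size / 32 consecutive slots starting
// from the encoding-based index computed above).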
void WaitcntBrackets::setExpScore(const MachineInstr *MI,
                                  const SIInstrInfo *TII,
                                  const SIRegisterInfo *TRI,
                                  const MachineRegisterInfo *MRI,
                                  unsigned OpNo, uint32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                    const SIRegisterInfo *TRI,
                                    const MachineRegisterInfo *MRI,
                                    WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = eventCounter(E);
  uint32_t CurrScore = getScoreUB(T) + 1;
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not (e.g. vmcnt for a
  // buffer-store, or lgkmcnt for a send-message).
  if (!hasPendingEvent(E)) {
    if (PendingEvents & WaitEventMaskForInst[T])
      MixedPendingEvents[T] = true;
    PendingEvents |= 1 << E;
  }
  setScoreUB(T, CurrScore);
  if (T == EXP_CNT) {
    // Put the score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export).
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data0) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data0),
                      CurrScore);
        }
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI,
                    AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                               AMDGPU::OpName::data),
                    CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(&Inst, TII, TRI, MRI,
                    AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                               AMDGPU::OpName::data),
                    CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(&Inst, TII, TRI, MRI,
                    AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                               AMDGPU::OpName::data),
                    CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(&Inst, TII, TRI, MRI,
                    AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                               AMDGPU::OpName::data),
                    CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export, the destination registers are really temporaries that
        // can be used as sources after export patching, so treat them like
        // sources and give them an EXP_CNT score as well.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(*MRI, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(*MRI, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo; //TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}
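// In short, updateByEvent advances the relevant counter's upper bound by one
// for every event and stamps that new score onto the registers the
// instruction defines (or, for expcnt, onto the sources the hardware still
// holds). A later use of one of those registers can then compute exactly how
// many newer operations it is allowed to leave outstanding.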
void WaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (auto T : inst_counter_types()) {
    uint32_t LB = getScoreLB(T);
    uint32_t UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "  VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "  LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "  EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "  UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        uint32_t RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        uint32_t RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          uint32_t RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}
bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
         simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
}
bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const {
  const uint32_t LB = getScoreLB(T);
  const uint32_t UB = getScoreUB(T);
  if (Count < UB && UB - Count > LB)
    return true;

  Count = ~0u;
  return false;
}
void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
                                    AMDGPU::Waitcnt &Wait) const {
  // If the score of the operand falls within the bracket, an s_waitcnt
  // instruction is required.
  const uint32_t LB = getScoreLB(T);
  const uint32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) && hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // Counters can decrement out-of-order when there are multiple event
      // types in the bracket, so use a conservative value of 0.
      addWait(Wait, T, 0);
    } else {
      addWait(Wait, T, UB - ScoreToWait);
    }
  }
}
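// Worked example, assuming vmcnt: with LB = 1 and UB = 5 there are four
// vector-memory operations outstanding. If the register being read carries
// score 3, only the operations with scores 4 and 5 may still be in flight once
// the dependence has resolved, so waiting for UB - ScoreToWait = 2, i.e.
// s_waitcnt vmcnt(2), is sufficient as long as the counter retires in order.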
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(VM_CNT, Wait.VmCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const uint32_t UB = getScoreUB(T);
  if (Count >= UB)
    return;
  if (Count != 0) {
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
  } else {
    setScoreLB(T, UB);
    MixedPendingEvents[T] = false;
    PendingEvents &= ~WaitEventMaskForInst[T];
  }
}
// Where there are multiple types of event in the bracket of a counter, the
// decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory reads can always return out of order.
  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
    return true;
  return MixedPendingEvents[T];
}
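// counterOutOfOrder captures the cases where the "N newer operations may stay
// outstanding" reasoning breaks down: scalar memory reads can return out of
// order relative to other lgkmcnt events, and a bracket mixing event types
// (for example LDS and GDS traffic) gives no ordering guarantee either, so
// such waits are emitted with a count of zero.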
char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}
/// Generate an s_waitcnt instruction to be placed before MI, if needed.
bool SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
    MachineInstr *OldWaitcntInstr) {
  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  if (MI.isDebugInstr())
    return false;

  AMDGPU::Waitcnt Wait;
  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore().
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    Wait.VmCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or with
  // knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    Wait.VmCnt = Wait.ExpCnt = Wait.LgkmCnt = 0;
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    Wait.VmCnt = 0;
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
              context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS.
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                               ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif
  // Export and GDS instructions do not read the EXEC mask until after the
  // export is granted, so an instruction that overwrites EXEC must first wait
  // for any export or GDS operation that still holds a GPR lock.
  if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
    if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
        ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
        ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
        ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
      Wait.ExpCnt = 0;
    }
  }
#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, the call cannot easily tell whether a register is used in the
    // callee, so force an S_WAITCNT for EXP_CNT before the call instead.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
          ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif
  // Look at the source operands of every instruction to see if any of them
  // results from a previous memory operation that affects its current usage.
  // If so, an s_waitcnt instruction needs to be emitted.
  // FIXME: Should not be relying on memoperands.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      continue;
    unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
    // VM_CNT is only relevant to vgpr or LDS.
    ScoreBrackets.determineWait(
        VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
  }

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    RegInterval Interval =
        ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
      if (TRI->isVGPR(*MRI, Op.getReg())) {
        // VM_CNT is only relevant to vgpr or LDS.
        ScoreBrackets.determineWait(
            VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
      }
      ScoreBrackets.determineWait(
          LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
    }
  }
  // For stores and atomics, also check the memoperands and the defined
  // registers (write-after-read and write-after-write hazards).
  if (MI.mayStore()) {
    // FIXME: Should not be relying on memoperands.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      ScoreBrackets.determineWait(
          VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
      ScoreBrackets.determineWait(
          EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
    }
  }
  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Def = MI.getOperand(I);
    RegInterval Interval =
        ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
    for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
      if (TRI->isVGPR(*MRI, Def.getReg())) {
        ScoreBrackets.determineWait(
            VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
        ScoreBrackets.determineWait(
            EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
      }
      ScoreBrackets.determineWait(
          LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
    }
  }
  // Check to see if this is an S_BARRIER and whether an implicit S_WAITCNT 0
  // occurs before the instruction. If not, conservatively wait on everything.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    Wait.VmCnt = Wait.ExpCnt = Wait.LgkmCnt = 0;
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939 after
  // fixing the scheduler.
  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    if (ScoreBrackets.getScoreLB(LGKM_CNT) <
            ScoreBrackets.getScoreUB(LGKM_CNT) &&
        ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
      Wait.LgkmCnt = 0;
    }
  }
  // Early-out if no wait is indicated.
  if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
    bool Modified = false;
    if (OldWaitcntInstr) {
      if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
        TrackedWaitcntSet.erase(OldWaitcntInstr);
        OldWaitcntInstr->eraseFromParent();
        Modified = true;
      }
    }
    return Modified;
  }

  if (ForceEmitZeroWaitcnts)
    Wait.VmCnt = Wait.ExpCnt = Wait.LgkmCnt = 0;
  if (ForceEmitWaitcnt[VM_CNT])
    Wait.VmCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[LGKM_CNT])
    Wait.LgkmCnt = 0;

  ScoreBrackets.applyWaitcnt(Wait);
  AMDGPU::Waitcnt OldWait;
  if (OldWaitcntInstr) {
    int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
    OldWait = AMDGPU::decodeWaitcnt(IV, Imm);
    if (OldWait.dominates(Wait))
      return false;
  }

  // A pre-existing waitcnt that was not inserted by this pass must be
  // preserved, so fold its requirements into the new one.
  if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
    Wait = Wait.combined(OldWait);

  unsigned Enc =
      AMDGPU::encodeWaitcnt(IV, Wait.VmCnt, Wait.ExpCnt, Wait.LgkmCnt);
  if (OldWaitcntInstr) {
    OldWaitcntInstr->getOperand(0).setImm(Enc);

    LLVM_DEBUG(dbgs() << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *OldWaitcntInstr << '\n');
  } else {
    MachineInstr *SWaitInst =
        BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                TII->get(AMDGPU::S_WAITCNT))
            .addImm(Enc);
    TrackedWaitcntSet.insert(SWaitInst);

    LLVM_DEBUG(dbgs() << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *SWaitInst << '\n');
  }

  return true;
}
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  // If there are no memory operands, conservatively assume the flat operation
  // may access LDS.
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access instruction,
  // update the upper-bound of the appropriate counter's bracket and the
  // destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->isAlwaysGDS(Inst) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // A FLAT operation that may access LDS has to be remembered so that
      // dependent waits on vmcnt/lgkmcnt can be forced to zero.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (TII->isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}
bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
                                 uint32_t OtherScore) {
  uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  uint32_t OtherShifted =
      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted;
}
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  for (auto T : inst_counter_types()) {
    // Merge the pending-event flags for this counter.
    const bool OldOutOfOrder = counterOutOfOrder(T);
    const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
    const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
    if (OtherEvents & ~OldEvents)
      StrictDom = true;
    if (Other.MixedPendingEvents[T] ||
        (OldEvents && OtherEvents && OldEvents != OtherEvents))
      MixedPendingEvents[T] = true;
    PendingEvents |= OtherEvents;

    // Merge the scores for this counter by rebasing both sides onto a common
    // scale.
    const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
    const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    MergeInfo M;
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
    M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;

    const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
    if (NewUB < ScoreUBs[T])
      report_fatal_error("waitcnt score overflow");
    ScoreUBs[T] = NewUB;
    ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);

    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

    bool RegStrictDom = false;
    for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
         J++) {
      RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
    }

    if (T == LGKM_CNT) {
      for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
           J != E; J++) {
        RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
      }
    }

    if (RegStrictDom && !OldOutOfOrder)
      StrictDom = true;
  }

  VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR());
  SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR());

  return StrictDom;
}
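// merge() combines the bracket state of two control-flow predecessors. The
// absolute scores of the two sides are unrelated, so both are first rebased
// onto a common scale (the MyShift/OtherShift arithmetic above): for instance,
// a side with 2 pending vmcnt events merged with one that has 5 ends up with
// an upper bound 5 above the merged lower bound. The return value reports
// whether the incoming state actually weakened the existing one, which is what
// marks successor blocks dirty in the fixed-point loop of
// runOnMachineFunction.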
// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG(ScoreBrackets.dump());

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Remove any previously existing waitcnts.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
      if (OldWaitcntInstr) {
        if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
          // A waitcnt inserted by an earlier run of this pass is superseded by
          // the one we are looking at now.
          TrackedWaitcntSet.erase(OldWaitcntInstr);
          OldWaitcntInstr->eraseFromParent();
          OldWaitcntInstr = nullptr;
        } else if (!TrackedWaitcntSet.count(&Inst)) {
          // Two successive s_waitcnt's, both of which are pre-existing and
          // are therefore preserved.
          int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
          ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
        }
      }

      OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }
    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) &&
        (!VCCZBugHandledSet.count(&Inst))) {
      if (ScoreBrackets.getScoreLB(LGKM_CNT) <
              ScoreBrackets.getScoreUB(LGKM_CNT) &&
          ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
        if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
          VCCZBugWorkAround = true;
      }
    }
    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
    OldWaitcntInstr = nullptr;

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates an S_SETVSKIP because it is an indexed
    // resource, it will also force an S_WAITCNT vmcnt(0).
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });
    // Check to see if this is a GWS instruction. If so, the generated code
    // sequence will include an S_WAITCNT 0, so treat the brackets as if all
    // counters had been waited on here.
    // TODO: Are these the only GWS instructions?
    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
      ScoreBrackets.applyWaitcnt(VM_CNT, 0);
      ScoreBrackets.applyWaitcnt(EXP_CNT, 0);
      ScoreBrackets.applyWaitcnt(LGKM_CNT, 0);
    }

    // TODO: Remove this work-around after fixing the scheduler.
    if (VCCZBugWorkAround) {
      // Restore the vccz bit. Any time a value is written to vcc, the vcc bit
      // is updated, so restore it by reading the value of vcc and writing it
      // back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(AMDGPU::VCC);
      VCCZBugHandledSet.insert(&Inst);
      Modified = true;
    }

    ++Iter;
  }

  return Modified;
}
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  IV = AMDGPU::getIsaVersion(ST->getCPU());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);

  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL =
      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;

  TrackedWaitcntSet.clear();
  VCCZBugHandledSet.clear();
  RpotIdxMap.clear();
  BlockInfos.clear();
  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fixed point is reached.
  for (MachineBasicBlock *MBB :
       ReversePostOrderTraversal<MachineFunction *>(&MF)) {
    RpotIdxMap[MBB] = BlockInfos.size();
    BlockInfos.emplace_back(MBB);
  }

  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Modified = false;
  bool Repeat;
  do {
    Repeat = false;

    for (BlockInfo &BI : BlockInfos) {
      if (!BI.Dirty)
        continue;

      unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets)
          Brackets = llvm::make_unique<WaitcntBrackets>(ST);
        else
          Brackets->clear();
      }

      Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPending()) {
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : BI.MBB->successors()) {
          unsigned SuccIdx = RpotIdxMap[Succ];
          BlockInfo &SuccBI = BlockInfos[SuccIdx];
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccIdx <= Idx)
              Repeat = true;
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else if (SuccBI.Incoming->merge(*Brackets)) {
            SuccBI.Dirty = true;
            if (SuccIdx <= Idx)
              Repeat = true;
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);
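  // The loop above is a straightforward forward dataflow fixed point: blocks
  // are visited in reverse post-order, the outgoing bracket state is
  // propagated into each successor (moved into one successor, copied or merged
  // into the others), and any successor that appears earlier in the order
  // forces another round. Acyclic functions therefore converge in one pass,
  // while loops iterate until the brackets stop changing.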
  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores, but an explicitly requested flush is probably very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
           I != E; ++I) {
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }
  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.
    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
            TII->get(AMDGPU::S_WAITCNT))
        .addImm(0);

    Modified = true;
  }

  return Modified;
}