SIISelLowering.cpp (LLVM 8.0.1)
1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// Custom DAG lowering for SI
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #if defined(_MSC_VER) || defined(__MINGW32__)
16 // Provide M_PI.
17 #define _USE_MATH_DEFINES
18 #endif
19 
20 #include "SIISelLowering.h"
21 #include "AMDGPU.h"
22 #include "AMDGPUIntrinsicInfo.h"
23 #include "AMDGPUSubtarget.h"
24 #include "AMDGPUTargetMachine.h"
25 #include "SIDefines.h"
26 #include "SIInstrInfo.h"
27 #include "SIMachineFunctionInfo.h"
28 #include "SIRegisterInfo.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/APFloat.h"
32 #include "llvm/ADT/APInt.h"
33 #include "llvm/ADT/ArrayRef.h"
34 #include "llvm/ADT/BitVector.h"
35 #include "llvm/ADT/SmallVector.h"
36 #include "llvm/ADT/Statistic.h"
37 #include "llvm/ADT/StringRef.h"
38 #include "llvm/ADT/StringSwitch.h"
39 #include "llvm/ADT/Twine.h"
40 #include "llvm/CodeGen/Analysis.h"
58 #include "llvm/IR/Constants.h"
59 #include "llvm/IR/DataLayout.h"
60 #include "llvm/IR/DebugLoc.h"
61 #include "llvm/IR/DerivedTypes.h"
62 #include "llvm/IR/DiagnosticInfo.h"
63 #include "llvm/IR/Function.h"
64 #include "llvm/IR/GlobalValue.h"
65 #include "llvm/IR/InstrTypes.h"
66 #include "llvm/IR/Instruction.h"
67 #include "llvm/IR/Instructions.h"
68 #include "llvm/IR/IntrinsicInst.h"
69 #include "llvm/IR/Type.h"
70 #include "llvm/Support/Casting.h"
71 #include "llvm/Support/CodeGen.h"
73 #include "llvm/Support/Compiler.h"
75 #include "llvm/Support/KnownBits.h"
79 #include <cassert>
80 #include <cmath>
81 #include <cstdint>
82 #include <iterator>
83 #include <tuple>
84 #include <utility>
85 #include <vector>
86 
87 using namespace llvm;
88 
89 #define DEBUG_TYPE "si-lower"
90 
91 STATISTIC(NumTailCalls, "Number of tail calls");
92 
93 static cl::opt<bool> EnableVGPRIndexMode(
94  "amdgpu-vgpr-index-mode",
95  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
96  cl::init(false));
97 
98 static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
99  "amdgpu-frame-index-zero-bits",
100  cl::desc("High bits of frame index assumed to be zero"),
101  cl::init(5),
102  cl::ReallyHidden);
103 
104 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
105  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
106  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
107  if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
108  return AMDGPU::SGPR0 + Reg;
109  }
110  }
111  llvm_unreachable("Cannot allocate sgpr");
112 }
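// findFirstFreeSGPR walks SGPR0 upward and returns the first SGPR that the
// calling convention has not already allocated; it is used below (in
// allocateSystemSGPRs) as a fallback home for the scratch wave byte offset
// when a shader does not give that value a fixed register. A rough sketch,
// assuming SGPR0 and SGPR1 were already taken by arguments:
//   isAllocated(SGPR0) == true, isAllocated(SGPR1) == true,
//   isAllocated(SGPR2) == false  =>  findFirstFreeSGPR(CCInfo) == SGPR2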
113 
114 SITargetLowering::SITargetLowering(const TargetMachine &TM,
115  const GCNSubtarget &STI)
116  : AMDGPUTargetLowering(TM, STI),
117  Subtarget(&STI) {
118  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
119  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
120 
121  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
122  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
123 
124  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
125  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
126  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
127 
128  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
129  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
130 
131  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
132  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
133 
134  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
135  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
136 
137  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
138  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
139 
140  if (Subtarget->has16BitInsts()) {
141  addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
142  addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
143 
144  // Unless there are also VOP3P operations, no operations are really legal.
145  addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
146  addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
147  addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
148  addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
149  }
150 
152 
153  // We need to custom lower vector stores from local memory
160 
167 
178 
181 
186 
192 
197 
200 
208 
216 
221 
226 
233 
236 
239 
243 
244 #if 0
247 #endif
248 
249  // We only support LOAD/STORE and vector manipulation ops for vectors
250  // with > 4 elements.
253  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
254  switch (Op) {
255  case ISD::LOAD:
256  case ISD::STORE:
257  case ISD::BUILD_VECTOR:
258  case ISD::BITCAST:
264  break;
265  case ISD::CONCAT_VECTORS:
267  break;
268  default:
270  break;
271  }
272  }
273  }
274 
276 
277  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
278  // is expanded to avoid having two separate loops in case the index is a VGPR.
279 
280  // Most operations are naturally 32-bit vector operations. We only support
281  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
282  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
285 
288 
291 
294  }
295 
300 
303 
304  // Avoid stack access for these.
305  // TODO: Generalize to more vector types.
310 
316 
320 
325 
326  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
327  // and output demarshalling
330 
331  // We can't return success/failure, only the old value,
332  // let LLVM add the comparison
335 
336  if (Subtarget->hasFlatAddressSpace()) {
339  }
340 
343 
344  // On SI this is s_memtime; on VI it is s_memrealtime.
348 
349  if (Subtarget->has16BitInsts()) {
353  }
354 
355  // v_mad_f32 does not support denormals according to some sources.
356  if (!Subtarget->hasFP32Denormals())
358 
359  if (!Subtarget->hasBFI()) {
360  // fcopysign can be done in a single instruction with BFI.
363  }
364 
365  if (!Subtarget->hasBCNT(32))
367 
368  if (!Subtarget->hasBCNT(64))
370 
371  if (Subtarget->hasFFBH())
373 
374  if (Subtarget->hasFFBL())
376 
377  // We only really have 32-bit BFE instructions (and 16-bit on VI).
378  //
379  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
380  // effort to match them now. We want this to be false for i64 cases when the
381  // extraction isn't restricted to the upper or lower half. Ideally we would
382  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
383  // span the midpoint are probably relatively rare, so don't worry about them
384  // for now.
385  if (Subtarget->hasBFE())
386  setHasExtractBitsInsn(true);
387 
392 
393 
394  // These are really only legal for ieee_mode functions. We should be avoiding
395  // them for functions that don't have ieee_mode enabled, so just say they are
396  // legal.
401 
402 
403  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
407  } else {
412  }
413 
415 
420 
421  if (Subtarget->has16BitInsts()) {
423 
426 
429 
432 
435 
440 
443 
449 
451 
453 
455 
457 
462 
467 
468  // F16 - Constant Actions.
470 
471  // F16 - Load/Store Actions.
476 
477  // F16 - VOP1 Actions.
486 
487  // F16 - VOP2 Actions.
490 
492 
493  // F16 - VOP3 Actions.
495  if (!Subtarget->hasFP16Denormals())
497 
498  for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
499  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
500  switch (Op) {
501  case ISD::LOAD:
502  case ISD::STORE:
503  case ISD::BUILD_VECTOR:
504  case ISD::BITCAST:
510  break;
511  case ISD::CONCAT_VECTORS:
513  break;
514  default:
516  break;
517  }
518  }
519  }
520 
521  // XXX - Do these do anything? Vector constants turn into build_vector.
524 
527 
532 
537 
544 
549 
554 
559 
563 
564  if (!Subtarget->hasVOP3PInsts()) {
567  }
568 
570  // This isn't really legal, but this avoids the legalizer unrolling it (and
571  // allows matching fneg (fabs x) patterns)
573 
578 
581 
584  }
585 
586  if (Subtarget->hasVOP3PInsts()) {
597 
601 
604 
606 
609 
616 
621 
624 
627 
631 
635  }
636 
639 
640  if (Subtarget->has16BitInsts()) {
645  } else {
646  // Legalization hack.
649 
652  }
653 
656  }
657 
684 
685  // All memory operations. Some folding on the pointer operand is done to help
686  // matching the constant offsets in the addressing modes.
704 
706 
707  // SI at least has hardware support for floating point exceptions, but no way
708  // of using or handling them is implemented. They are also optional in OpenCL
709  // (Section 7.3)
711 }
712 
713 const GCNSubtarget *SITargetLowering::getSubtarget() const {
714  return Subtarget;
715 }
716 
717 //===----------------------------------------------------------------------===//
718 // TargetLowering queries
719 //===----------------------------------------------------------------------===//
720 
721 // v_mad_mix* support a conversion from f16 to f32.
722 //
723 // The only special case where this would otherwise be OK to use, but which we
724 // don't currently handle, is when denormals are enabled.
725 bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
726  EVT DestVT, EVT SrcVT) const {
727  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
728  (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
729  DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
730  SrcVT.getScalarType() == MVT::f16;
731 }
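// In effect this reports that an f16 -> f32 fpext feeding an FMAD/FMA can be
// folded into a v_mad_mix_f32 / v_fma_mix_f32 style instruction, but only when
// the f32 result does not need denormal handling. A rough illustration of the
// intended fold (names are illustrative, not actual DAG syntax):
//   fmad (fpext f16:$a), (fpext f16:$b), f32:$c  ->  v_mad_mix_f32 $a, $b, $c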
732 
733 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
734  // SI has some legal vector types, but no legal vector operations. Say no
735  // shuffles are legal in order to prefer scalarizing some vector operations.
736  return false;
737 }
738 
739 MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
740  CallingConv::ID CC,
741  EVT VT) const {
742  // TODO: Consider splitting all arguments into 32-bit pieces.
743  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
744  EVT ScalarVT = VT.getScalarType();
745  unsigned Size = ScalarVT.getSizeInBits();
746  if (Size == 32)
747  return ScalarVT.getSimpleVT();
748 
749  if (Size == 64)
750  return MVT::i32;
751 
752  if (Size == 16 && Subtarget->has16BitInsts())
753  return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
754  }
755 
756  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
757 }
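// For non-kernel calling conventions this means, for example, that a v4f16
// argument is passed in v2f16 registers (when 16-bit instructions are
// available), a v2i64 argument is passed as i32 pieces, and 32-bit element
// types keep the scalar type itself (f32 for v4f32). These are worked
// examples of the rules above, not an exhaustive list.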
758 
759 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
760  CallingConv::ID CC,
761  EVT VT) const {
762  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
763  unsigned NumElts = VT.getVectorNumElements();
764  EVT ScalarVT = VT.getScalarType();
765  unsigned Size = ScalarVT.getSizeInBits();
766 
767  if (Size == 32)
768  return NumElts;
769 
770  if (Size == 64)
771  return 2 * NumElts;
772 
773  if (Size == 16 && Subtarget->has16BitInsts())
774  return (VT.getVectorNumElements() + 1) / 2;
775  }
776 
777  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
778 }
779 
780 unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
781  LLVMContext &Context, CallingConv::ID CC,
782  EVT VT, EVT &IntermediateVT,
783  unsigned &NumIntermediates, MVT &RegisterVT) const {
784  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
785  unsigned NumElts = VT.getVectorNumElements();
786  EVT ScalarVT = VT.getScalarType();
787  unsigned Size = ScalarVT.getSizeInBits();
788  if (Size == 32) {
789  RegisterVT = ScalarVT.getSimpleVT();
790  IntermediateVT = RegisterVT;
791  NumIntermediates = NumElts;
792  return NumIntermediates;
793  }
794 
795  if (Size == 64) {
796  RegisterVT = MVT::i32;
797  IntermediateVT = RegisterVT;
798  NumIntermediates = 2 * NumElts;
799  return NumIntermediates;
800  }
801 
802  // FIXME: We should fix the ABI to be the same on targets without 16-bit
803  // support, but unless we can properly handle 3-vectors, it will still be
804  // inconsistent.
805  if (Size == 16 && Subtarget->has16BitInsts()) {
806  RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
807  IntermediateVT = RegisterVT;
808  NumIntermediates = (NumElts + 1) / 2;
809  return NumIntermediates;
810  }
811  }
812 
813  return TargetLowering::getVectorTypeBreakdownForCallingConv(
814  Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
815 }
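// Taken together with the two functions above, a v4i16 argument in a
// non-kernel calling convention is broken into (4 + 1) / 2 = 2 intermediate
// v2i16 values, and a v2f64 argument into 2 * 2 = 4 i32 registers. These are
// worked examples derived from the Size == 16 and Size == 64 cases; kernel
// arguments take the generic TargetLowering path instead.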
816 
818  // Only limited forms of aggregate type are currently expected.
819  assert(Ty->isStructTy() && "Expected struct type");
820 
821 
822  Type *ElementType = nullptr;
823  unsigned NumElts;
824  if (Ty->getContainedType(0)->isVectorTy()) {
825  VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
826  ElementType = VecComponent->getElementType();
827  NumElts = VecComponent->getNumElements();
828  } else {
829  ElementType = Ty->getContainedType(0);
830  NumElts = 1;
831  }
832 
833  assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");
834 
835  // Calculate the size of the memVT type from the aggregate
836  unsigned Pow2Elts = 0;
837  unsigned ElementSize;
838  switch (ElementType->getTypeID()) {
839  default:
840  llvm_unreachable("Unknown type!");
841  case Type::IntegerTyID:
842  ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
843  break;
844  case Type::HalfTyID:
845  ElementSize = 16;
846  break;
847  case Type::FloatTyID:
848  ElementSize = 32;
849  break;
850  }
851  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
852  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
853 
854  return MVT::getVectorVT(MVT::getVT(ElementType, false),
855  Pow2Elts);
856 }
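// Worked example of the rounding above: for a { <4 x float>, i32 } return
// type, ElementType is float (32 bits), so AdditionalElts is 1 and
// Pow2Elts = 1 << Log2_32_Ceil(4 + 1) = 8, giving a memVT of v8f32. For a
// { half, i32 } return, AdditionalElts is 2 and the result is v4f16.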
857 
858 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
859  const CallInst &CI,
860  MachineFunction &MF,
861  unsigned IntrID) const {
862  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
863  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
865  (Intrinsic::ID)IntrID);
867  return false;
868 
870 
871  if (RsrcIntr->IsImage) {
872  Info.ptrVal = MFI->getImagePSV(
874  CI.getArgOperand(RsrcIntr->RsrcArg));
875  Info.align = 0;
876  } else {
877  Info.ptrVal = MFI->getBufferPSV(
879  CI.getArgOperand(RsrcIntr->RsrcArg));
880  }
881 
885  Info.memVT = MVT::getVT(CI.getType(), true);
886  if (Info.memVT == MVT::Other) {
887  // Some intrinsics return an aggregate type - special case to work out
888  // the correct memVT
889  Info.memVT = memVTFromAggregate(CI.getType());
890  }
892  } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
893  Info.opc = ISD::INTRINSIC_VOID;
894  Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
896  } else {
897  // Atomic
899  Info.memVT = MVT::getVT(CI.getType());
903 
904  // XXX - Should this be volatile without known ordering?
906  }
907  return true;
908  }
909 
910  switch (IntrID) {
919  Info.memVT = MVT::getVT(CI.getType());
920  Info.ptrVal = CI.getOperand(0);
921  Info.align = 0;
923 
924  const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
925  if (!Vol || !Vol->isZero())
927 
928  return true;
929  }
930 
931  default:
932  return false;
933  }
934 }
935 
936 bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
937  SmallVectorImpl<Value*> &Ops,
938  Type *&AccessTy) const {
939  switch (II->getIntrinsicID()) {
947  Value *Ptr = II->getArgOperand(0);
948  AccessTy = II->getType();
949  Ops.push_back(Ptr);
950  return true;
951  }
952  default:
953  return false;
954  }
955 }
956 
957 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
958  if (!Subtarget->hasFlatInstOffsets()) {
959  // Flat instructions do not have offsets, and only have the register
960  // address.
961  return AM.BaseOffs == 0 && AM.Scale == 0;
962  }
963 
964  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
965  // the sign bit is ignored and is treated as a 12-bit unsigned offset.
966 
967  // Just r + i
968  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
969 }
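// As a concrete reading of the above: without flat instruction offsets only a
// bare register address is accepted (BaseOffs == 0, Scale == 0); with them, an
// immediate offset in the range [0, 4095] (isUInt<12>) is also accepted, and
// any scaled-index form is still rejected.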
970 
971 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
972  if (Subtarget->hasFlatGlobalInsts())
973  return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
974 
975  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
976  // Assume that we will use FLAT for all global memory accesses
977  // on VI.
978  // FIXME: This assumption is currently wrong. On VI we still use
979  // MUBUF instructions for the r + i addressing mode. As currently
980  // implemented, the MUBUF instructions only work on buffer < 4GB.
981  // It may be possible to support > 4GB buffers with MUBUF instructions,
982  // by setting the stride value in the resource descriptor which would
983  // increase the size limit to (stride * 4GB). However, this is risky,
984  // because it has never been validated.
985  return isLegalFlatAddressingMode(AM);
986  }
987 
988  return isLegalMUBUFAddressingMode(AM);
989 }
990 
991 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
992  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
993  // additionally can do r + r + i with addr64. 32-bit has more addressing
994  // mode options. Depending on the resource constant, it can also do
995  // (i64 r0) + (i32 r1) * (i14 i).
996  //
997  // Private arrays end up using a scratch buffer most of the time, so also
998  // assume those use MUBUF instructions. Scratch loads / stores are currently
999  // implemented as mubuf instructions with offen bit set, so they are
1000  // slightly different from the normal addr64.
1001  if (!isUInt<12>(AM.BaseOffs))
1002  return false;
1003 
1004  // FIXME: Since we can split immediate into soffset and immediate offset,
1005  // would it make sense to allow any immediate?
1006 
1007  switch (AM.Scale) {
1008  case 0: // r + i or just i, depending on HasBaseReg.
1009  return true;
1010  case 1:
1011  return true; // We have r + r or r + i.
1012  case 2:
1013  if (AM.HasBaseReg) {
1014  // Reject 2 * r + r.
1015  return false;
1016  }
1017 
1018  // Allow 2 * r as r + r
1019  // Or 2 * r + i is allowed as r + r + i.
1020  return true;
1021  default: // Don't allow n * r
1022  return false;
1023  }
1024 }
1025 
1026 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1027  const AddrMode &AM, Type *Ty,
1028  unsigned AS, Instruction *I) const {
1029  // No global is ever allowed as a base.
1030  if (AM.BaseGV)
1031  return false;
1032 
1033  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1034  return isLegalGlobalAddressingMode(AM);
1035 
1036  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1038  // If the offset isn't a multiple of 4, it probably isn't going to be
1039  // correctly aligned.
1040  // FIXME: Can we get the real alignment here?
1041  if (AM.BaseOffs % 4 != 0)
1042  return isLegalMUBUFAddressingMode(AM);
1043 
1044  // There are no SMRD extloads, so if we have to do a small type access we
1045  // will use a MUBUF load.
1046  // FIXME?: We also need to do this if unaligned, but we don't know the
1047  // alignment here.
1048  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1049  return isLegalGlobalAddressingMode(AM);
1050 
1051  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1052  // SMRD instructions have an 8-bit, dword offset on SI.
1053  if (!isUInt<8>(AM.BaseOffs / 4))
1054  return false;
1055  } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1056  // On CI+, this can also be a 32-bit literal constant offset. If it fits
1057  // in 8-bits, it can use a smaller encoding.
1058  if (!isUInt<32>(AM.BaseOffs / 4))
1059  return false;
1060  } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1061  // On VI, these use the SMEM format and the offset is a 20-bit byte offset.
1062  if (!isUInt<20>(AM.BaseOffs))
1063  return false;
1064  } else
1065  llvm_unreachable("unhandled generation");
1066 
1067  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1068  return true;
1069 
1070  if (AM.Scale == 1 && AM.HasBaseReg)
1071  return true;
1072 
1073  return false;
1074 
1075  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1076  return isLegalMUBUFAddressingMode(AM);
1077  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1078  AS == AMDGPUAS::REGION_ADDRESS) {
1079  // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1080  // field.
1081  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1082  // an 8-bit dword offset but we don't know the alignment here.
1083  if (!isUInt<16>(AM.BaseOffs))
1084  return false;
1085 
1086  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1087  return true;
1088 
1089  if (AM.Scale == 1 && AM.HasBaseReg)
1090  return true;
1091 
1092  return false;
1093  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1095  // For an unknown address space, this usually means that this is for some
1096  // reason being used for pure arithmetic, and not based on some addressing
1097  // computation. We don't have instructions that compute pointers with any
1098  // addressing modes, so treat them as having no offset like flat
1099  // instructions.
1100  return isLegalFlatAddressingMode(AM);
1101  } else {
1102  llvm_unreachable("unhandled address space");
1103  }
1104 }
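// Putting the generation checks above into numbers: for a constant-address
// SMRD access, SI accepts an immediate of up to 255 dwords (BaseOffs / 4 must
// fit in 8 bits, i.e. byte offsets up to 1020), CI accepts a 32-bit literal
// dword offset, and VI accepts a 20-bit byte offset. Offsets that are not
// dword aligned, or accesses smaller than a dword, were already routed to the
// MUBUF / global addressing checks earlier in this function.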
1105 
1106 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1107  const SelectionDAG &DAG) const {
1108  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1109  return (MemVT.getSizeInBits() <= 4 * 32);
1110  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1111  unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1112  return (MemVT.getSizeInBits() <= MaxPrivateBits);
1113  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
1114  return (MemVT.getSizeInBits() <= 2 * 32);
1115  }
1116  return true;
1117 }
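// For example, adjacent dword stores to global or flat memory may be merged
// into types of up to 128 bits (4 dwords), private stores up to
// 8 * MaxPrivateElementSize bits, and LDS stores up to 64 bits, per the
// limits checked above.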
1118 
1119 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1120  unsigned AddrSpace,
1121  unsigned Align,
1122  bool *IsFast) const {
1123  if (IsFast)
1124  *IsFast = false;
1125 
1126  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1127  // which isn't a simple VT.
1128  // Until MVT is extended to handle this, simply check for the size and
1129  // rely on the condition below: allow accesses if the size is a multiple of 4.
1130  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1131  VT.getStoreSize() > 16)) {
1132  return false;
1133  }
1134 
1135  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1136  AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1137  // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1138  // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1139  // with adjacent offsets.
1140  bool AlignedBy4 = (Align % 4 == 0);
1141  if (IsFast)
1142  *IsFast = AlignedBy4;
1143 
1144  return AlignedBy4;
1145  }
1146 
1147  // FIXME: We have to be conservative here and assume that flat operations
1148  // will access scratch. If we had access to the IR function, then we
1149  // could determine if any private memory was used in the function.
1150  if (!Subtarget->hasUnalignedScratchAccess() &&
1151  (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1152  AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
1153  bool AlignedBy4 = Align >= 4;
1154  if (IsFast)
1155  *IsFast = AlignedBy4;
1156 
1157  return AlignedBy4;
1158  }
1159 
1160  if (Subtarget->hasUnalignedBufferAccess()) {
1161  // If we have a uniform constant load, it still requires using a slow
1162  // buffer instruction if unaligned.
1163  if (IsFast) {
1164  *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1165  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1166  (Align % 4 == 0) : true;
1167  }
1168 
1169  return true;
1170  }
1171 
1172  // Values smaller than a dword must be aligned.
1173  if (VT.bitsLT(MVT::i32))
1174  return false;
1175 
1176  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1177  // byte-address are ignored, thus forcing Dword alignment.
1178  // This applies to private, global, and constant memory.
1179  if (IsFast)
1180  *IsFast = true;
1181 
1182  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1183 }
1184 
1185 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
1186  unsigned SrcAlign, bool IsMemset,
1187  bool ZeroMemset,
1188  bool MemcpyStrSrc,
1189  MachineFunction &MF) const {
1190  // FIXME: Should account for address space here.
1191 
1192  // The default fallback uses the private pointer size as a guess for a type to
1193  // use. Make sure we switch these to 64-bit accesses.
1194 
1195  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1196  return MVT::v4i32;
1197 
1198  if (Size >= 8 && DstAlign >= 4)
1199  return MVT::v2i32;
1200 
1201  // Use the default.
1202  return MVT::Other;
1203 }
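// For instance, a 16-byte (or larger) memcpy/memset whose destination is at
// least dword aligned is expanded with v4i32 accesses, an 8-to-15 byte one
// with the same alignment with v2i32, and anything else falls back to the
// target-independent choice.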
1204 
1205 static bool isFlatGlobalAddrSpace(unsigned AS) {
1206  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1207  AS == AMDGPUAS::FLAT_ADDRESS ||
1209 }
1210 
1211 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1212  unsigned DestAS) const {
1213  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
1214 }
1215 
1217  const MemSDNode *MemNode = cast<MemSDNode>(N);
1218  const Value *Ptr = MemNode->getMemOperand()->getValue();
1219  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
1220  return I && I->getMetadata("amdgpu.noclobber");
1221 }
1222 
1223 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
1224  unsigned DestAS) const {
1225  // Flat -> private/local is a simple truncate.
1226  // Flat -> global is a no-op.
1227  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1228  return true;
1229 
1230  return isNoopAddrSpaceCast(SrcAS, DestAS);
1231 }
1232 
1234  const MemSDNode *MemNode = cast<MemSDNode>(N);
1235 
1236  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1237 }
1238 
1241  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1242  return TypeSplitVector;
1243 
1245 }
1246 
1247 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1248  Type *Ty) const {
1249  // FIXME: Could be smarter if called for vector constants.
1250  return true;
1251 }
1252 
1253 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1254  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1255  switch (Op) {
1256  case ISD::LOAD:
1257  case ISD::STORE:
1258 
1259  // These operations are done with 32-bit instructions anyway.
1260  case ISD::AND:
1261  case ISD::OR:
1262  case ISD::XOR:
1263  case ISD::SELECT:
1264  // TODO: Extensions?
1265  return true;
1266  default:
1267  return false;
1268  }
1269  }
1270 
1271  // SimplifySetCC uses this function to determine whether or not it should
1272  // create setcc with i1 operands. We don't have instructions for i1 setcc.
1273  if (VT == MVT::i1 && Op == ISD::SETCC)
1274  return false;
1275 
1276  return TargetLowering::isTypeDesirableForOp(Op, VT);
1277 }
1278 
1279 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1280  const SDLoc &SL,
1281  SDValue Chain,
1282  uint64_t Offset) const {
1283  const DataLayout &DL = DAG.getDataLayout();
1284  MachineFunction &MF = DAG.getMachineFunction();
1286 
1287  const ArgDescriptor *InputPtrReg;
1288  const TargetRegisterClass *RC;
1289 
1290  std::tie(InputPtrReg, RC)
1292 
1295  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1296  MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1297 
1298  return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
1299 }
1300 
1301 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1302  const SDLoc &SL) const {
1303  uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1304  FIRST_IMPLICIT);
1305  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1306 }
1307 
1308 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1309  const SDLoc &SL, SDValue Val,
1310  bool Signed,
1311  const ISD::InputArg *Arg) const {
1312  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1313  VT.bitsLT(MemVT)) {
1314  unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1315  Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1316  }
1317 
1318  if (MemVT.isFloatingPoint())
1319  Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
1320  else if (Signed)
1321  Val = DAG.getSExtOrTrunc(Val, SL, VT);
1322  else
1323  Val = DAG.getZExtOrTrunc(Val, SL, VT);
1324 
1325  return Val;
1326 }
1327 
1328 SDValue SITargetLowering::lowerKernargMemParameter(
1329  SelectionDAG &DAG, EVT VT, EVT MemVT,
1330  const SDLoc &SL, SDValue Chain,
1331  uint64_t Offset, unsigned Align, bool Signed,
1332  const ISD::InputArg *Arg) const {
1333  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
1335  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1336 
1337  // Try to avoid using an extload by loading earlier than the argument address,
1338  // and extracting the relevant bits. The load should hopefully be merged with
1339  // the previous argument.
1340  if (MemVT.getStoreSize() < 4 && Align < 4) {
1341  // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1342  int64_t AlignDownOffset = alignDown(Offset, 4);
1343  int64_t OffsetDiff = Offset - AlignDownOffset;
1344 
1345  EVT IntVT = MemVT.changeTypeToInteger();
1346 
1347  // TODO: If we passed in the base kernel offset we could have a better
1348  // alignment than 4, but we don't really need it.
1349  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1350  SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1353 
1354  SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1355  SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1356 
1357  SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1358  ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1359  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1360 
1361 
1362  return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1363  }
1364 
1365  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1366  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
1369 
1370  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1371  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1372 }
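// Worked example of the sub-dword path above: an i16 kernel argument at byte
// offset 2 with alignment 2 is loaded as the i32 at offset 0 (AlignDownOffset),
// shifted right by OffsetDiff * 8 = 16 bits, truncated back to i16, and then
// extended according to the argument's sign/zero-extension flags.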
1373 
1374 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1375  const SDLoc &SL, SDValue Chain,
1376  const ISD::InputArg &Arg) const {
1377  MachineFunction &MF = DAG.getMachineFunction();
1378  MachineFrameInfo &MFI = MF.getFrameInfo();
1379 
1380  if (Arg.Flags.isByVal()) {
1381  unsigned Size = Arg.Flags.getByValSize();
1382  int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1383  return DAG.getFrameIndex(FrameIdx, MVT::i32);
1384  }
1385 
1386  unsigned ArgOffset = VA.getLocMemOffset();
1387  unsigned ArgSize = VA.getValVT().getStoreSize();
1388 
1389  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1390 
1391  // Create load nodes to retrieve arguments from the stack.
1392  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1393  SDValue ArgValue;
1394 
1395  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
1396  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1397  MVT MemVT = VA.getValVT();
1398 
1399  switch (VA.getLocInfo()) {
1400  default:
1401  break;
1402  case CCValAssign::BCvt:
1403  MemVT = VA.getLocVT();
1404  break;
1405  case CCValAssign::SExt:
1406  ExtType = ISD::SEXTLOAD;
1407  break;
1408  case CCValAssign::ZExt:
1409  ExtType = ISD::ZEXTLOAD;
1410  break;
1411  case CCValAssign::AExt:
1412  ExtType = ISD::EXTLOAD;
1413  break;
1414  }
1415 
1416  ArgValue = DAG.getExtLoad(
1417  ExtType, SL, VA.getLocVT(), Chain, FIN,
1419  MemVT);
1420  return ArgValue;
1421 }
1422 
1423 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1424  const SIMachineFunctionInfo &MFI,
1425  EVT VT,
1426  AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1427  const ArgDescriptor *Reg;
1428  const TargetRegisterClass *RC;
1429 
1430  std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1431  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1432 }
1433 
1434 static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1435  CallingConv::ID CallConv,
1436  const SmallVectorImpl<ISD::InputArg> &Ins,
1437  BitVector &Skipped,
1438  FunctionType *FType,
1439  SIMachineFunctionInfo *Info) {
1440  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1441  const ISD::InputArg *Arg = &Ins[I];
1442 
1443  assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1444  "vector type argument should have been split");
1445 
1446  // First check if it's a PS input addr.
1447  if (CallConv == CallingConv::AMDGPU_PS &&
1448  !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
1449 
1450  bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1451 
1452  // Inconveniently only the first part of the split is marked as isSplit,
1453  // so skip to the end. We only want to increment PSInputNum once for the
1454  // entire split argument.
1455  if (Arg->Flags.isSplit()) {
1456  while (!Arg->Flags.isSplitEnd()) {
1457  assert(!Arg->VT.isVector() &&
1458  "unexpected vector split in ps argument type");
1459  if (!SkipArg)
1460  Splits.push_back(*Arg);
1461  Arg = &Ins[++I];
1462  }
1463  }
1464 
1465  if (SkipArg) {
1466  // We can safely skip PS inputs.
1467  Skipped.set(Arg->getOrigArgIndex());
1468  ++PSInputNum;
1469  continue;
1470  }
1471 
1472  Info->markPSInputAllocated(PSInputNum);
1473  if (Arg->Used)
1474  Info->markPSInputEnabled(PSInputNum);
1475 
1476  ++PSInputNum;
1477  }
1478 
1479  Splits.push_back(*Arg);
1480  }
1481 }
1482 
1483 // Allocate special inputs passed in VGPRs.
1484 static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1485  MachineFunction &MF,
1486  const SIRegisterInfo &TRI,
1487  SIMachineFunctionInfo &Info) {
1488  if (Info.hasWorkItemIDX()) {
1489  unsigned Reg = AMDGPU::VGPR0;
1490  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1491 
1492  CCInfo.AllocateReg(Reg);
1494  }
1495 
1496  if (Info.hasWorkItemIDY()) {
1497  unsigned Reg = AMDGPU::VGPR1;
1498  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1499 
1500  CCInfo.AllocateReg(Reg);
1502  }
1503 
1504  if (Info.hasWorkItemIDZ()) {
1505  unsigned Reg = AMDGPU::VGPR2;
1506  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1507 
1508  CCInfo.AllocateReg(Reg);
1510  }
1511 }
1512 
1513 // Try to allocate a VGPR at the end of the argument list, or if no argument
1514 // VGPRs are left, allocate a stack slot.
1515 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1516  ArrayRef<MCPhysReg> ArgVGPRs
1517  = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1518  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1519  if (RegIdx == ArgVGPRs.size()) {
1520  // Spill to stack required.
1521  int64_t Offset = CCInfo.AllocateStack(4, 4);
1522 
1523  return ArgDescriptor::createStack(Offset);
1524  }
1525 
1526  unsigned Reg = ArgVGPRs[RegIdx];
1527  Reg = CCInfo.AllocateReg(Reg);
1528  assert(Reg != AMDGPU::NoRegister);
1529 
1530  MachineFunction &MF = CCInfo.getMachineFunction();
1531  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1532  return ArgDescriptor::createRegister(Reg);
1533 }
1534 
1535 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1536  const TargetRegisterClass *RC,
1537  unsigned NumArgRegs) {
1538  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1539  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1540  if (RegIdx == ArgSGPRs.size())
1541  report_fatal_error("ran out of SGPRs for arguments");
1542 
1543  unsigned Reg = ArgSGPRs[RegIdx];
1544  Reg = CCInfo.AllocateReg(Reg);
1545  assert(Reg != AMDGPU::NoRegister);
1546 
1547  MachineFunction &MF = CCInfo.getMachineFunction();
1548  MF.addLiveIn(Reg, RC);
1549  return ArgDescriptor::createRegister(Reg);
1550 }
1551 
1552 static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1553  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1554 }
1555 
1556 static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1557  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1558 }
1559 
1560 static void allocateSpecialInputVGPRs(CCState &CCInfo,
1561  MachineFunction &MF,
1562  const SIRegisterInfo &TRI,
1563  SIMachineFunctionInfo &Info) {
1564  if (Info.hasWorkItemIDX())
1565  Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1566 
1567  if (Info.hasWorkItemIDY())
1568  Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1569 
1570  if (Info.hasWorkItemIDZ())
1571  Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1572 }
1573 
1574 static void allocateSpecialInputSGPRs(CCState &CCInfo,
1575  MachineFunction &MF,
1576  const SIRegisterInfo &TRI,
1577  SIMachineFunctionInfo &Info) {
1578  auto &ArgInfo = Info.getArgInfo();
1579 
1580  // TODO: Unify handling with private memory pointers.
1581 
1582  if (Info.hasDispatchPtr())
1583  ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1584 
1585  if (Info.hasQueuePtr())
1586  ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1587 
1588  if (Info.hasKernargSegmentPtr())
1589  ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1590 
1591  if (Info.hasDispatchID())
1592  ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1593 
1594  // flat_scratch_init is not applicable for non-kernel functions.
1595 
1596  if (Info.hasWorkGroupIDX())
1597  ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1598 
1599  if (Info.hasWorkGroupIDY())
1600  ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1601 
1602  if (Info.hasWorkGroupIDZ())
1603  ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1604 
1605  if (Info.hasImplicitArgPtr())
1606  ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1607 }
1608 
1609 // Allocate special inputs passed in user SGPRs.
1610 static void allocateHSAUserSGPRs(CCState &CCInfo,
1611  MachineFunction &MF,
1612  const SIRegisterInfo &TRI,
1613  SIMachineFunctionInfo &Info) {
1614  if (Info.hasImplicitBufferPtr()) {
1615  unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1616  MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1617  CCInfo.AllocateReg(ImplicitBufferPtrReg);
1618  }
1619 
1620  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1621  if (Info.hasPrivateSegmentBuffer()) {
1622  unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1623  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1624  CCInfo.AllocateReg(PrivateSegmentBufferReg);
1625  }
1626 
1627  if (Info.hasDispatchPtr()) {
1628  unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1629  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1630  CCInfo.AllocateReg(DispatchPtrReg);
1631  }
1632 
1633  if (Info.hasQueuePtr()) {
1634  unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1635  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1636  CCInfo.AllocateReg(QueuePtrReg);
1637  }
1638 
1639  if (Info.hasKernargSegmentPtr()) {
1640  unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1641  MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1642  CCInfo.AllocateReg(InputPtrReg);
1643  }
1644 
1645  if (Info.hasDispatchID()) {
1646  unsigned DispatchIDReg = Info.addDispatchID(TRI);
1647  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1648  CCInfo.AllocateReg(DispatchIDReg);
1649  }
1650 
1651  if (Info.hasFlatScratchInit()) {
1652  unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1653  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1654  CCInfo.AllocateReg(FlatScratchInitReg);
1655  }
1656 
1657  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1658  // these from the dispatch pointer.
1659 }
1660 
1661 // Allocate special input registers that are initialized per-wave.
1662 static void allocateSystemSGPRs(CCState &CCInfo,
1663  MachineFunction &MF,
1664  SIMachineFunctionInfo &Info,
1665  CallingConv::ID CallConv,
1666  bool IsShader) {
1667  if (Info.hasWorkGroupIDX()) {
1668  unsigned Reg = Info.addWorkGroupIDX();
1669  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1670  CCInfo.AllocateReg(Reg);
1671  }
1672 
1673  if (Info.hasWorkGroupIDY()) {
1674  unsigned Reg = Info.addWorkGroupIDY();
1675  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1676  CCInfo.AllocateReg(Reg);
1677  }
1678 
1679  if (Info.hasWorkGroupIDZ()) {
1680  unsigned Reg = Info.addWorkGroupIDZ();
1681  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1682  CCInfo.AllocateReg(Reg);
1683  }
1684 
1685  if (Info.hasWorkGroupInfo()) {
1686  unsigned Reg = Info.addWorkGroupInfo();
1687  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1688  CCInfo.AllocateReg(Reg);
1689  }
1690 
1691  if (Info.hasPrivateSegmentWaveByteOffset()) {
1692  // Scratch wave offset passed in system SGPR.
1693  unsigned PrivateSegmentWaveByteOffsetReg;
1694 
1695  if (IsShader) {
1696  PrivateSegmentWaveByteOffsetReg =
1698 
1699  // This is true if the scratch wave byte offset doesn't have a fixed
1700  // location.
1701  if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1702  PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1703  Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1704  }
1705  } else
1706  PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1707 
1708  MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1709  CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1710  }
1711 }
1712 
1714  MachineFunction &MF,
1715  const SIRegisterInfo &TRI,
1717  // Now that we've figured out where the scratch register inputs are, see if
1718  // we should reserve the arguments and use them directly.
1719  MachineFrameInfo &MFI = MF.getFrameInfo();
1720  bool HasStackObjects = MFI.hasStackObjects();
1721 
1722  // Record that we know we have non-spill stack objects so we don't need to
1723  // check all stack objects later.
1724  if (HasStackObjects)
1725  Info.setHasNonSpillStackObjects(true);
1726 
1727  // Everything live out of a block is spilled with fast regalloc, so it's
1728  // almost certain that spilling will be required.
1729  if (TM.getOptLevel() == CodeGenOpt::None)
1730  HasStackObjects = true;
1731 
1732  // For now assume stack access is needed in any callee functions, so we need
1733  // the scratch registers to pass in.
1734  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1735 
1736  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1737  if (ST.isAmdHsaOrMesa(MF.getFunction())) {
1738  if (RequiresStackAccess) {
1739  // If we have stack objects, we unquestionably need the private buffer
1740  // resource. For the Code Object V2 ABI, this will be the first 4 user
1741  // SGPR inputs. We can reserve those and use them directly.
1742 
1743  unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1745  Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1746 
1747  if (MFI.hasCalls()) {
1748  // If we have calls, we need to keep the frame register in a register
1749  // that won't be clobbered by a call, so ensure it is copied somewhere.
1750 
1751  // This is not a problem for the scratch wave offset, because the same
1752  // registers are reserved in all functions.
1753 
1754  // FIXME: Nothing is really ensuring this is a call preserved register,
1755  // it's just selected from the end so it happens to be.
1756  unsigned ReservedOffsetReg
1758  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1759  } else {
1760  unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1762  Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1763  }
1764  } else {
1765  unsigned ReservedBufferReg
1767  unsigned ReservedOffsetReg
1769 
1770  // We tentatively reserve the last registers (skipping the last two
1771  // which may contain VCC). After register allocation, we'll replace
1772  // these with the ones immediately after those which were really
1773  // allocated. In the prologue copies will be inserted from the argument
1774  // to these reserved registers.
1775  Info.setScratchRSrcReg(ReservedBufferReg);
1776  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1777  }
1778  } else {
1779  unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1780 
1781  // Without HSA, relocations are used for the scratch pointer and the
1782  // buffer resource setup is always inserted in the prologue. Scratch wave
1783  // offset is still in an input SGPR.
1784  Info.setScratchRSrcReg(ReservedBufferReg);
1785 
1786  if (HasStackObjects && !MFI.hasCalls()) {
1787  unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1789  Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1790  } else {
1791  unsigned ReservedOffsetReg
1793  Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1794  }
1795  }
1796 }
1797 
1800  return !Info->isEntryFunction();
1801 }
1802 
1804 
1805 }
1806 
1807 void SITargetLowering::insertCopiesSplitCSR(
1808  MachineBasicBlock *Entry,
1809  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1810  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1811 
1812  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1813  if (!IStart)
1814  return;
1815 
1816  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1817  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1818  MachineBasicBlock::iterator MBBI = Entry->begin();
1819  for (const MCPhysReg *I = IStart; *I; ++I) {
1820  const TargetRegisterClass *RC = nullptr;
1821  if (AMDGPU::SReg_64RegClass.contains(*I))
1822  RC = &AMDGPU::SGPR_64RegClass;
1823  else if (AMDGPU::SReg_32RegClass.contains(*I))
1824  RC = &AMDGPU::SGPR_32RegClass;
1825  else
1826  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1827 
1828  unsigned NewVR = MRI->createVirtualRegister(RC);
1829  // Create copy from CSR to a virtual register.
1830  Entry->addLiveIn(*I);
1831  BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1832  .addReg(*I);
1833 
1834  // Insert the copy-back instructions right before the terminator.
1835  for (auto *Exit : Exits)
1836  BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1837  TII->get(TargetOpcode::COPY), *I)
1838  .addReg(NewVR);
1839  }
1840 }
1841 
1842 SDValue SITargetLowering::LowerFormalArguments(
1843  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1844  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1845  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1847 
1848  MachineFunction &MF = DAG.getMachineFunction();
1849  const Function &Fn = MF.getFunction();
1850  FunctionType *FType = MF.getFunction().getFunctionType();
1852  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1853 
1854  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1855  DiagnosticInfoUnsupported NoGraphicsHSA(
1856  Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1857  DAG.getContext()->diagnose(NoGraphicsHSA);
1858  return DAG.getEntryNode();
1859  }
1860 
1861  // Create stack objects that are used for emitting debugger prologue if
1862  // "amdgpu-debugger-emit-prologue" attribute was specified.
1863  if (ST.debuggerEmitPrologue())
1864  createDebuggerPrologueStackObjects(MF);
1865 
1868  BitVector Skipped(Ins.size());
1869  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1870  *DAG.getContext());
1871 
1872  bool IsShader = AMDGPU::isShader(CallConv);
1873  bool IsKernel = AMDGPU::isKernel(CallConv);
1874  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1875 
1876  if (!IsEntryFunc) {
1877  // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1878  // this when allocating argument fixed offsets.
1879  CCInfo.AllocateStack(4, 4);
1880  }
1881 
1882  if (IsShader) {
1883  processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1884 
1885  // At least one interpolation mode must be enabled or else the GPU will
1886  // hang.
1887  //
1888  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1889  // set PSInputAddr, the user wants to enable some bits after the compilation
1890  // based on run-time states. Since we can't know what the final PSInputEna
1891  // will look like, we shouldn't do anything here and the user should take
1892  // responsibility for the correct programming.
1893  //
1894  // Otherwise, the following restrictions apply:
1895  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1896  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1897  // enabled too.
1898  if (CallConv == CallingConv::AMDGPU_PS) {
1899  if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1900  ((Info->getPSInputAddr() & 0xF) == 0 &&
1901  Info->isPSInputAllocated(11))) {
1902  CCInfo.AllocateReg(AMDGPU::VGPR0);
1903  CCInfo.AllocateReg(AMDGPU::VGPR1);
1904  Info->markPSInputAllocated(0);
1905  Info->markPSInputEnabled(0);
1906  }
1907  if (Subtarget->isAmdPalOS()) {
1908  // For isAmdPalOS, the user does not enable some bits after compilation
1909  // based on run-time states; the register values being generated here are
1910  // the final ones set in hardware. Therefore we need to apply the
1911  // workaround to PSInputAddr and PSInputEnable together. (The case where
1912  // a bit is set in PSInputAddr but not PSInputEnable is where the
1913  // frontend set up an input arg for a particular interpolation mode, but
1914  // nothing uses that input arg. Really we should have an earlier pass
1915  // that removes such an arg.)
1916  unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1917  if ((PsInputBits & 0x7F) == 0 ||
1918  ((PsInputBits & 0xF) == 0 &&
1919  (PsInputBits >> 11 & 1)))
1920  Info->markPSInputEnabled(
1922  }
1923  }
1924 
1925  assert(!Info->hasDispatchPtr() &&
1926  !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1927  !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1928  !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1929  !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1930  !Info->hasWorkItemIDZ());
1931  } else if (IsKernel) {
1932  assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
1933  } else {
1934  Splits.append(Ins.begin(), Ins.end());
1935  }
1936 
1937  if (IsEntryFunc) {
1938  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1939  allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1940  }
1941 
1942  if (IsKernel) {
1943  analyzeFormalArgumentsCompute(CCInfo, Ins);
1944  } else {
1945  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1946  CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1947  }
1948 
1949  SmallVector<SDValue, 16> Chains;
1950 
1951  // FIXME: This is the minimum kernel argument alignment. We should improve
1952  // this to the maximum alignment of the arguments.
1953  //
1954  // FIXME: Alignment of explicit arguments is totally broken with non-0 explicit
1955  // kern arg offset.
1956  const unsigned KernelArgBaseAlign = 16;
1957 
1958  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
1959  const ISD::InputArg &Arg = Ins[i];
1960  if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
1961  InVals.push_back(DAG.getUNDEF(Arg.VT));
1962  continue;
1963  }
1964 
1965  CCValAssign &VA = ArgLocs[ArgIdx++];
1966  MVT VT = VA.getLocVT();
1967 
1968  if (IsEntryFunc && VA.isMemLoc()) {
1969  VT = Ins[i].VT;
1970  EVT MemVT = VA.getLocVT();
1971 
1972  const uint64_t Offset = VA.getLocMemOffset();
1973  unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
1974 
1975  SDValue Arg = lowerKernargMemParameter(
1976  DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
1977  Chains.push_back(Arg.getValue(1));
1978 
1979  auto *ParamTy =
1980  dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
1981  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
1982  ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1983  // On SI local pointers are just offsets into LDS, so they are always
1984  // less than 16-bits. On CI and newer they could potentially be
1985  // real pointers, so we can't guarantee their size.
1986  Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1987  DAG.getValueType(MVT::i16));
1988  }
1989 
1990  InVals.push_back(Arg);
1991  continue;
1992  } else if (!IsEntryFunc && VA.isMemLoc()) {
1993  SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
1994  InVals.push_back(Val);
1995  if (!Arg.Flags.isByVal())
1996  Chains.push_back(Val.getValue(1));
1997  continue;
1998  }
1999 
2000  assert(VA.isRegLoc() && "Parameter must be in a register!");
2001 
2002  unsigned Reg = VA.getLocReg();
2003  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
2004  EVT ValVT = VA.getValVT();
2005 
2006  Reg = MF.addLiveIn(Reg, RC);
2007  SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2008 
2009  if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
2010  // The return object should be reasonably addressable.
2011 
2012  // FIXME: This helps when the return is a real sret. If it is an
2013  // automatically inserted sret (i.e. CanLowerReturn returns false), an
2014  // extra copy is inserted in SelectionDAGBuilder which obscures this.
2015  unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
2016  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2017  DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2018  }
2019 
2020  // If this is an 8 or 16-bit value, it is really passed promoted
2021  // to 32 bits. Insert an assert[sz]ext to capture this, then
2022  // truncate to the right size.
2023  switch (VA.getLocInfo()) {
2024  case CCValAssign::Full:
2025  break;
2026  case CCValAssign::BCvt:
2027  Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2028  break;
2029  case CCValAssign::SExt:
2030  Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2031  DAG.getValueType(ValVT));
2032  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2033  break;
2034  case CCValAssign::ZExt:
2035  Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2036  DAG.getValueType(ValVT));
2037  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2038  break;
2039  case CCValAssign::AExt:
2040  Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2041  break;
2042  default:
2043  llvm_unreachable("Unknown loc info!");
2044  }
2045 
2046  InVals.push_back(Val);
2047  }
2048 
2049  if (!IsEntryFunc) {
2050  // Special inputs come after user arguments.
2051  allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2052  }
2053 
2054  // Start adding system SGPRs.
2055  if (IsEntryFunc) {
2056  allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
2057  } else {
2058  CCInfo.AllocateReg(Info->getScratchRSrcReg());
2059  CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
2060  CCInfo.AllocateReg(Info->getFrameOffsetReg());
2061  allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2062  }
2063 
2064  auto &ArgUsageInfo =
2066  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2067 
2068  unsigned StackArgSize = CCInfo.getNextStackOffset();
2069  Info->setBytesInStackArgArea(StackArgSize);
2070 
2071  return Chains.empty() ? Chain :
2072  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2073 }
2074 
2075 // TODO: If return values can't fit in registers, we should return as many as
2076 // possible in registers before passing on stack.
2077 bool SITargetLowering::CanLowerReturn(
2078  CallingConv::ID CallConv,
2079  MachineFunction &MF, bool IsVarArg,
2080  const SmallVectorImpl<ISD::OutputArg> &Outs,
2081  LLVMContext &Context) const {
2082  // Replacing returns with sret/stack usage doesn't make sense for shaders.
2083  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2084  // for shaders. Vector types should be explicitly handled by CC.
2085  if (AMDGPU::isEntryFunctionCC(CallConv))
2086  return true;
2087 
2089  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2090  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2091 }
2092 
2093 SDValue
2094 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2095  bool isVarArg,
2096  const SmallVectorImpl<ISD::OutputArg> &Outs,
2097  const SmallVectorImpl<SDValue> &OutVals,
2098  const SDLoc &DL, SelectionDAG &DAG) const {
2099  MachineFunction &MF = DAG.getMachineFunction();
2101 
2102  if (AMDGPU::isKernel(CallConv)) {
2103  return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2104  OutVals, DL, DAG);
2105  }
2106 
2107  bool IsShader = AMDGPU::isShader(CallConv);
2108 
2109  Info->setIfReturnsVoid(Outs.empty());
2110  bool IsWaveEnd = Info->returnsVoid() && IsShader;
2111 
2112  // CCValAssign - represents the assignment of the return value to a location.
2115 
2116  // CCState - Info about the registers and stack slots.
2117  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2118  *DAG.getContext());
2119 
2120  // Analyze outgoing return values.
2121  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2122 
2123  SDValue Flag;
2124  SmallVector<SDValue, 48> RetOps;
2125  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2126 
2127  // Add return address for callable functions.
2128  if (!Info->isEntryFunction()) {
2130  SDValue ReturnAddrReg = CreateLiveInRegister(
2131  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2132 
2133  // FIXME: Should be able to use a vreg here, but need a way to prevent it
2134  // from being allocated to a CSR.
2135 
2136  SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2137  MVT::i64);
2138 
2139  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2140  Flag = Chain.getValue(1);
2141 
2142  RetOps.push_back(PhysReturnAddrReg);
2143  }
2144 
2145  // Copy the result values into the output registers.
2146  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2147  ++I, ++RealRVLocIdx) {
2148  CCValAssign &VA = RVLocs[I];
2149  assert(VA.isRegLoc() && "Can only return in registers!");
2150  // TODO: Partially return in registers if return values don't fit.
2151  SDValue Arg = OutVals[RealRVLocIdx];
2152 
2153  // Copied from other backends.
2154  switch (VA.getLocInfo()) {
2155  case CCValAssign::Full:
2156  break;
2157  case CCValAssign::BCvt:
2158  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2159  break;
2160  case CCValAssign::SExt:
2161  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2162  break;
2163  case CCValAssign::ZExt:
2164  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2165  break;
2166  case CCValAssign::AExt:
2167  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2168  break;
2169  default:
2170  llvm_unreachable("Unknown loc info!");
2171  }
2172 
2173  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2174  Flag = Chain.getValue(1);
2175  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2176  }
2177 
2178  // FIXME: Does sret work properly?
2179  if (!Info->isEntryFunction()) {
2180  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2181  const MCPhysReg *I =
2182  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2183  if (I) {
2184  for (; *I; ++I) {
2185  if (AMDGPU::SReg_64RegClass.contains(*I))
2186  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2187  else if (AMDGPU::SReg_32RegClass.contains(*I))
2188  RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2189  else
2190  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2191  }
2192  }
2193  }
2194 
2195  // Update chain and glue.
2196  RetOps[0] = Chain;
2197  if (Flag.getNode())
2198  RetOps.push_back(Flag);
2199 
2200  unsigned Opc = AMDGPUISD::ENDPGM;
2201  if (!IsWaveEnd)
2202  Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
2203  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2204 }
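// Net effect of LowerReturn, as a rough sketch: a shader that returns void
// ends the wave with ENDPGM; a shader returning values uses RETURN_TO_EPILOG;
// a callable function uses RET_FLAG, with the return address copied back into
// the physical SGPR pair the epilog expects (s[30:31] on this target).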
2205 
2207  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2208  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2209  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2210  SDValue ThisVal) const {
2211  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2212 
2213  // Assign locations to each value returned by this call.
2215  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2216  *DAG.getContext());
2217  CCInfo.AnalyzeCallResult(Ins, RetCC);
2218 
2219  // Copy all of the result registers out of their specified physreg.
2220  for (unsigned i = 0; i != RVLocs.size(); ++i) {
2221  CCValAssign VA = RVLocs[i];
2222  SDValue Val;
2223 
2224  if (VA.isRegLoc()) {
2225  Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2226  Chain = Val.getValue(1);
2227  InFlag = Val.getValue(2);
2228  } else if (VA.isMemLoc()) {
2229  report_fatal_error("TODO: return values in memory");
2230  } else
2231  llvm_unreachable("unknown argument location type");
2232 
2233  switch (VA.getLocInfo()) {
2234  case CCValAssign::Full:
2235  break;
2236  case CCValAssign::BCvt:
2237  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2238  break;
2239  case CCValAssign::ZExt:
2240  Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2241  DAG.getValueType(VA.getValVT()));
2242  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2243  break;
2244  case CCValAssign::SExt:
2245  Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2246  DAG.getValueType(VA.getValVT()));
2247  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2248  break;
2249  case CCValAssign::AExt:
2250  Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2251  break;
2252  default:
2253  llvm_unreachable("Unknown loc info!");
2254  }
2255 
2256  InVals.push_back(Val);
2257  }
2258 
2259  return Chain;
2260 }
2261 
2262 // Add code to pass the special inputs required depending on the used features,
2263 // separate from the explicit user arguments present in the IR.
2265  CallLoweringInfo &CLI,
2266  CCState &CCInfo,
2267  const SIMachineFunctionInfo &Info,
2268  SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2269  SmallVectorImpl<SDValue> &MemOpChains,
2270  SDValue Chain) const {
2271  // If we don't have a call site, this was a call inserted by
2272  // legalization. These can never use special inputs.
2273  if (!CLI.CS)
2274  return;
2275 
2276  const Function *CalleeFunc = CLI.CS.getCalledFunction();
2277  assert(CalleeFunc);
2278 
2279  SelectionDAG &DAG = CLI.DAG;
2280  const SDLoc &DL = CLI.DL;
2281 
2282  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2283 
2284  auto &ArgUsageInfo =
2286  const AMDGPUFunctionArgInfo &CalleeArgInfo
2287  = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2288 
2289  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2290 
2291  // TODO: Unify with private memory register handling. This is complicated by
2292  // the fact that at least in kernels, the input argument is not necessarily
2293  // in the same location as the input.
2306  };
2307 
2308  for (auto InputID : InputRegs) {
2309  const ArgDescriptor *OutgoingArg;
2310  const TargetRegisterClass *ArgRC;
2311 
2312  std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2313  if (!OutgoingArg)
2314  continue;
2315 
2316  const ArgDescriptor *IncomingArg;
2317  const TargetRegisterClass *IncomingArgRC;
2318  std::tie(IncomingArg, IncomingArgRC)
2319  = CallerArgInfo.getPreloadedValue(InputID);
2320  assert(IncomingArgRC == ArgRC);
2321 
2322  // All special arguments are ints for now.
2323  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2324  SDValue InputReg;
2325 
2326  if (IncomingArg) {
2327  InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2328  } else {
2329  // The implicit arg ptr is special because it doesn't have a corresponding
2330  // input for kernels, and is computed from the kernarg segment pointer.
2331  assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2332  InputReg = getImplicitArgPtr(DAG, DL);
2333  }
2334 
2335  if (OutgoingArg->isRegister()) {
2336  RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2337  } else {
2338  unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2339  SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2340  SpecialArgOffset);
2341  MemOpChains.push_back(ArgStore);
2342  }
2343  }
2344 }
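// For example, if a callee uses llvm.amdgcn.workgroup.id.y, the caller's own
// incoming workgroup-ID-Y SGPR is re-copied into whichever register (or stack
// slot) the callee expects it in. The implicit argument pointer is the one
// special case with no incoming value; it is recomputed from the kernarg
// segment pointer instead.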
2345 
2347  return CC == CallingConv::Fast;
2348 }
2349 
2350 /// Return true if we might ever do TCO for calls with this calling convention.
2352  switch (CC) {
2353  case CallingConv::C:
2354  return true;
2355  default:
2356  return canGuaranteeTCO(CC);
2357  }
2358 }
2359 
2361  SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2362  const SmallVectorImpl<ISD::OutputArg> &Outs,
2363  const SmallVectorImpl<SDValue> &OutVals,
2364  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2365  if (!mayTailCallThisCC(CalleeCC))
2366  return false;
2367 
2368  MachineFunction &MF = DAG.getMachineFunction();
2369  const Function &CallerF = MF.getFunction();
2370  CallingConv::ID CallerCC = CallerF.getCallingConv();
2372  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2373 
2374  // Kernels aren't callable and don't have a live-in return address, so it
2375  // doesn't make sense to do a tail call from an entry function.
2376  if (!CallerPreserved)
2377  return false;
2378 
2379  bool CCMatch = CallerCC == CalleeCC;
2380 
2382  if (canGuaranteeTCO(CalleeCC) && CCMatch)
2383  return true;
2384  return false;
2385  }
2386 
2387  // TODO: Can we handle var args?
2388  if (IsVarArg)
2389  return false;
2390 
2391  for (const Argument &Arg : CallerF.args()) {
2392  if (Arg.hasByValAttr())
2393  return false;
2394  }
2395 
2396  LLVMContext &Ctx = *DAG.getContext();
2397 
2398  // Check that the call results are passed in the same way.
2399  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2400  CCAssignFnForCall(CalleeCC, IsVarArg),
2401  CCAssignFnForCall(CallerCC, IsVarArg)))
2402  return false;
2403 
2404  // The callee has to preserve all registers the caller needs to preserve.
2405  if (!CCMatch) {
2406  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2407  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2408  return false;
2409  }
2410 
2411  // Nothing more to check if the callee is taking no arguments.
2412  if (Outs.empty())
2413  return true;
2414 
2416  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2417 
2418  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2419 
2420  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2421  // If the stack arguments for this call do not fit into our own save area,
2422  // then the call cannot be made a tail call.
2423  // TODO: Is this really necessary?
2424  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2425  return false;
2426 
2427  const MachineRegisterInfo &MRI = MF.getRegInfo();
2428  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2429 }
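// Putting the checks above together: a tail call is only attempted from a
// callable (non-entry) caller, with no varargs, no byval caller arguments,
// compatible result conventions, a callee that preserves at least the
// caller's preserved registers, and outgoing stack arguments that fit in the
// caller's own incoming-argument area.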
2430 
2432  if (!CI->isTailCall())
2433  return false;
2434 
2435  const Function *ParentFn = CI->getParent()->getParent();
2436  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2437  return false;
2438 
2439  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2440  return (Attr.getValueAsString() != "true");
2441 }
2442 
2443 // The wave scratch offset register is used as the global base pointer.
2445  SmallVectorImpl<SDValue> &InVals) const {
2446  SelectionDAG &DAG = CLI.DAG;
2447  const SDLoc &DL = CLI.DL;
2449  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2451  SDValue Chain = CLI.Chain;
2452  SDValue Callee = CLI.Callee;
2453  bool &IsTailCall = CLI.IsTailCall;
2454  CallingConv::ID CallConv = CLI.CallConv;
2455  bool IsVarArg = CLI.IsVarArg;
2456  bool IsSibCall = false;
2457  bool IsThisReturn = false;
2458  MachineFunction &MF = DAG.getMachineFunction();
2459 
2460  if (IsVarArg) {
2461  return lowerUnhandledCall(CLI, InVals,
2462  "unsupported call to variadic function ");
2463  }
2464 
2465  if (!CLI.CS.getInstruction())
2466  report_fatal_error("unsupported libcall legalization");
2467 
2468  if (!CLI.CS.getCalledFunction()) {
2469  return lowerUnhandledCall(CLI, InVals,
2470  "unsupported indirect call to function ");
2471  }
2472 
2473  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2474  return lowerUnhandledCall(CLI, InVals,
2475  "unsupported required tail call to function ");
2476  }
2477 
2479  // Note the issue is with the CC of the calling function, not of the call
2480  // itself.
2481  return lowerUnhandledCall(CLI, InVals,
2482  "unsupported call from graphics shader of function ");
2483  }
2484 
2485  // The first 4 bytes are reserved for the callee's emergency stack slot.
2486  if (IsTailCall) {
2487  IsTailCall = isEligibleForTailCallOptimization(
2488  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2489  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2490  report_fatal_error("failed to perform tail call elimination on a call "
2491  "site marked musttail");
2492  }
2493 
2494  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2495 
2496  // A sibling call is one where we're under the usual C ABI and not planning
2497  // to change that but can still do a tail call:
2498  if (!TailCallOpt && IsTailCall)
2499  IsSibCall = true;
2500 
2501  if (IsTailCall)
2502  ++NumTailCalls;
2503  }
2504 
2505  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2506 
2507  // Analyze operands of the call, assigning locations to each operand.
2509  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2510  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2511 
2512  // The first 4 bytes are reserved for the callee's emergency stack slot.
2513  CCInfo.AllocateStack(4, 4);
2514 
2515  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2516 
2517  // Get a count of how many bytes are to be pushed on the stack.
2518  unsigned NumBytes = CCInfo.getNextStackOffset();
2519 
2520  if (IsSibCall) {
2521  // Since we're not changing the ABI to make this a tail call, the memory
2522  // operands are already available in the caller's incoming argument space.
2523  NumBytes = 0;
2524  }
2525 
2526  // FPDiff is the byte offset of the call's argument area from the callee's.
2527  // Stores to callee stack arguments will be placed in FixedStackSlots offset
2528  // by this amount for a tail call. In a sibling call it must be 0 because the
2529  // caller will deallocate the entire stack and the callee still expects its
2530  // arguments to begin at SP+0. Completely unused for non-tail calls.
2531  int32_t FPDiff = 0;
2532  MachineFrameInfo &MFI = MF.getFrameInfo();
2533  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2534 
2535  SDValue CallerSavedFP;
2536 
2537  // Adjust the stack pointer for the new arguments...
2538  // These operations are automatically eliminated by the prolog/epilog pass
2539  if (!IsSibCall) {
2540  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2541 
2542  unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2543 
2544  // In the HSA case, this should be an identity copy.
2545  SDValue ScratchRSrcReg
2546  = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2547  RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2548 
2549  // TODO: Don't hardcode these registers; get them from the callee function.
2550  SDValue ScratchWaveOffsetReg
2551  = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2552  RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2553 
2554  if (!Info->isEntryFunction()) {
2555  // Avoid clobbering this function's FP value. In the current convention the
2556  // callee will overwrite it, so save and restore it around the call site.
2557  CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2558  Info->getFrameOffsetReg(), MVT::i32);
2559  }
2560  }
2561 
2562  SmallVector<SDValue, 8> MemOpChains;
2563  MVT PtrVT = MVT::i32;
2564 
2565  // Walk the register/memloc assignments, inserting copies/loads.
2566  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2567  ++i, ++realArgIdx) {
2568  CCValAssign &VA = ArgLocs[i];
2569  SDValue Arg = OutVals[realArgIdx];
2570 
2571  // Promote the value if needed.
2572  switch (VA.getLocInfo()) {
2573  case CCValAssign::Full:
2574  break;
2575  case CCValAssign::BCvt:
2576  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2577  break;
2578  case CCValAssign::ZExt:
2579  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2580  break;
2581  case CCValAssign::SExt:
2582  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2583  break;
2584  case CCValAssign::AExt:
2585  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2586  break;
2587  case CCValAssign::FPExt:
2588  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2589  break;
2590  default:
2591  llvm_unreachable("Unknown loc info!");
2592  }
2593 
2594  if (VA.isRegLoc()) {
2595  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2596  } else {
2597  assert(VA.isMemLoc());
2598 
2599  SDValue DstAddr;
2600  MachinePointerInfo DstInfo;
2601 
2602  unsigned LocMemOffset = VA.getLocMemOffset();
2603  int32_t Offset = LocMemOffset;
2604 
2605  SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
2606  unsigned Align = 0;
2607 
2608  if (IsTailCall) {
2609  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2610  unsigned OpSize = Flags.isByVal() ?
2611  Flags.getByValSize() : VA.getValVT().getStoreSize();
2612 
2613  // FIXME: We can have better than the minimum byval required alignment.
2614  Align = Flags.isByVal() ? Flags.getByValAlign() :
2615  MinAlign(Subtarget->getStackAlignment(), Offset);
2616 
2617  Offset = Offset + FPDiff;
2618  int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2619 
2620  DstAddr = DAG.getFrameIndex(FI, PtrVT);
2621  DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2622 
2623  // Make sure any stack arguments overlapping with where we're storing
2624  // are loaded before this eventual operation. Otherwise they'll be
2625  // clobbered.
2626 
2627  // FIXME: Why is this really necessary? It seems to just produce a lot of
2628  // code that copies the stack arguments and writes them back to the same
2629  // locations, which are supposed to be immutable?
2630  Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2631  } else {
2632  DstAddr = PtrOff;
2633  DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2634  Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
2635  }
2636 
2637  if (Outs[i].Flags.isByVal()) {
2638  SDValue SizeNode =
2639  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2640  SDValue Cpy = DAG.getMemcpy(
2641  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2642  /*isVol = */ false, /*AlwaysInline = */ true,
2643  /*isTailCall = */ false, DstInfo,
2646 
2647  MemOpChains.push_back(Cpy);
2648  } else {
2649  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
2650  MemOpChains.push_back(Store);
2651  }
2652  }
2653  }
2654 
2655  // Copy special input registers after user input arguments.
2656  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
2657 
2658  if (!MemOpChains.empty())
2659  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2660 
2661  // Build a sequence of copy-to-reg nodes chained together with token chain
2662  // and flag operands which copy the outgoing args into the appropriate regs.
2663  SDValue InFlag;
2664  for (auto &RegToPass : RegsToPass) {
2665  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2666  RegToPass.second, InFlag);
2667  InFlag = Chain.getValue(1);
2668  }
2669 
2670 
2671  SDValue PhysReturnAddrReg;
2672  if (IsTailCall) {
2673  // Since the return is being combined with the call, we need to pass on the
2674  // return address.
2675 
2677  SDValue ReturnAddrReg = CreateLiveInRegister(
2678  DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2679 
2680  PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2681  MVT::i64);
2682  Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2683  InFlag = Chain.getValue(1);
2684  }
2685 
2686  // We don't usually want to end the call-sequence here because we would tidy
2687  // the frame up *after* the call; however, in the ABI-changing tail-call case
2688  // we've carefully laid out the parameters so that when SP is reset they'll be
2689  // in the correct location.
2690  if (IsTailCall && !IsSibCall) {
2691  Chain = DAG.getCALLSEQ_END(Chain,
2692  DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2693  DAG.getTargetConstant(0, DL, MVT::i32),
2694  InFlag, DL);
2695  InFlag = Chain.getValue(1);
2696  }
2697 
2698  std::vector<SDValue> Ops;
2699  Ops.push_back(Chain);
2700  Ops.push_back(Callee);
2701 
2702  if (IsTailCall) {
2703  // Each tail call may have to adjust the stack by a different amount, so
2704  // this information must travel along with the operation for eventual
2705  // consumption by emitEpilogue.
2706  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2707 
2708  Ops.push_back(PhysReturnAddrReg);
2709  }
2710 
2711  // Add argument registers to the end of the list so that they are known live
2712  // into the call.
2713  for (auto &RegToPass : RegsToPass) {
2714  Ops.push_back(DAG.getRegister(RegToPass.first,
2715  RegToPass.second.getValueType()));
2716  }
2717 
2718  // Add a register mask operand representing the call-preserved registers.
2719 
2720  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
2721  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2722  assert(Mask && "Missing call preserved mask for calling convention");
2723  Ops.push_back(DAG.getRegisterMask(Mask));
2724 
2725  if (InFlag.getNode())
2726  Ops.push_back(InFlag);
2727 
2728  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2729 
2730  // If we're doing a tail call, use a TC_RETURN here rather than an
2731  // actual call instruction.
2732  if (IsTailCall) {
2733  MFI.setHasTailCall();
2734  return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2735  }
2736 
2737  // Returns a chain and a flag for retval copy to use.
2738  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2739  Chain = Call.getValue(0);
2740  InFlag = Call.getValue(1);
2741 
2742  if (CallerSavedFP) {
2743  SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2744  Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2745  InFlag = Chain.getValue(1);
2746  }
2747 
2748  uint64_t CalleePopBytes = NumBytes;
2749  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2750  DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2751  InFlag, DL);
2752  if (!Ins.empty())
2753  InFlag = Chain.getValue(1);
2754 
2755  // Handle result values, copying them out of physregs into vregs that we
2756  // return.
2757  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2758  InVals, IsThisReturn,
2759  IsThisReturn ? OutVals[0] : SDValue());
2760 }
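// Rough shape of the non-tail-call lowering produced above: callseq_start;
// copies of the scratch rsrc / wave-offset registers (and a save of the
// caller's FP for non-entry callers); stores and register copies for user
// arguments plus the special inputs; an AMDGPUISD::CALL node glued to those
// copies; a restore of the saved FP; callseq_end; then LowerCallResult copies
// the results out of physregs. A tail call instead ends in AMDGPUISD::TC_RETURN.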
2761 
2762 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2763  SelectionDAG &DAG) const {
2764  unsigned Reg = StringSwitch<unsigned>(RegName)
2765  .Case("m0", AMDGPU::M0)
2766  .Case("exec", AMDGPU::EXEC)
2767  .Case("exec_lo", AMDGPU::EXEC_LO)
2768  .Case("exec_hi", AMDGPU::EXEC_HI)
2769  .Case("flat_scratch", AMDGPU::FLAT_SCR)
2770  .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2771  .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2772  .Default(AMDGPU::NoRegister);
2773 
2774  if (Reg == AMDGPU::NoRegister) {
2775  report_fatal_error(Twine("invalid register name \""
2776  + StringRef(RegName) + "\"."));
2777 
2778  }
2779 
2780  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2781  Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2782  report_fatal_error(Twine("invalid register \""
2783  + StringRef(RegName) + "\" for subtarget."));
2784  }
2785 
2786  switch (Reg) {
2787  case AMDGPU::M0:
2788  case AMDGPU::EXEC_LO:
2789  case AMDGPU::EXEC_HI:
2790  case AMDGPU::FLAT_SCR_LO:
2791  case AMDGPU::FLAT_SCR_HI:
2792  if (VT.getSizeInBits() == 32)
2793  return Reg;
2794  break;
2795  case AMDGPU::EXEC:
2796  case AMDGPU::FLAT_SCR:
2797  if (VT.getSizeInBits() == 64)
2798  return Reg;
2799  break;
2800  default:
2801  llvm_unreachable("missing register type checking");
2802  }
2803 
2804  report_fatal_error(Twine("invalid type for register \""
2805  + StringRef(RegName) + "\"."));
2806 }
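// This hook backs the llvm.read_register / llvm.write_register intrinsics.
// An illustrative use from IR (names here are examples, not from this file):
//   %exec = call i64 @llvm.read_register.i64(metadata !0)
//   !0 = !{!"exec"}
// This is accepted because "exec" is a 64-bit register and the requested type
// is i64; mismatched widths fall through to the report_fatal_error above.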
2807 
2808 // If kill is not the last instruction, split the block so kill is always a
2809 // proper terminator.
2811  MachineBasicBlock *BB) const {
2812  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2813 
2814  MachineBasicBlock::iterator SplitPoint(&MI);
2815  ++SplitPoint;
2816 
2817  if (SplitPoint == BB->end()) {
2818  // Don't bother with a new block.
2820  return BB;
2821  }
2822 
2823  MachineFunction *MF = BB->getParent();
2824  MachineBasicBlock *SplitBB
2826 
2827  MF->insert(++MachineFunction::iterator(BB), SplitBB);
2828  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2829 
2830  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2831  BB->addSuccessor(SplitBB);
2832 
2834  return SplitBB;
2835 }
2836 
2837 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2838 // wavefront. If the value is uniform and just happens to be in a VGPR, this
2839 // will only do one iteration. In the worst case, this will loop 64 times.
2840 //
2841 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
2843  const SIInstrInfo *TII,
2845  MachineBasicBlock &OrigBB,
2846  MachineBasicBlock &LoopBB,
2847  const DebugLoc &DL,
2848  const MachineOperand &IdxReg,
2849  unsigned InitReg,
2850  unsigned ResultReg,
2851  unsigned PhiReg,
2852  unsigned InitSaveExecReg,
2853  int Offset,
2854  bool UseGPRIdxMode,
2855  bool IsIndirectSrc) {
2856  MachineBasicBlock::iterator I = LoopBB.begin();
2857 
2858  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2859  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2860  unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2861  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2862 
2863  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2864  .addReg(InitReg)
2865  .addMBB(&OrigBB)
2866  .addReg(ResultReg)
2867  .addMBB(&LoopBB);
2868 
2869  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2870  .addReg(InitSaveExecReg)
2871  .addMBB(&OrigBB)
2872  .addReg(NewExec)
2873  .addMBB(&LoopBB);
2874 
2875  // Read the next variant <- also loop target.
2876  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2877  .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2878 
2879  // Compare the just-read index value to all possible Idx values.
2880  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2881  .addReg(CurrentIdxReg)
2882  .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
2883 
2884  // Update EXEC, save the original EXEC value to VCC.
2885  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2886  .addReg(CondReg, RegState::Kill);
2887 
2888  MRI.setSimpleHint(NewExec, CondReg);
2889 
2890  if (UseGPRIdxMode) {
2891  unsigned IdxReg;
2892  if (Offset == 0) {
2893  IdxReg = CurrentIdxReg;
2894  } else {
2895  IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2896  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2897  .addReg(CurrentIdxReg, RegState::Kill)
2898  .addImm(Offset);
2899  }
2900  unsigned IdxMode = IsIndirectSrc ?
2901  AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
2902  MachineInstr *SetOn =
2903  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2904  .addReg(IdxReg, RegState::Kill)
2905  .addImm(IdxMode);
2906  SetOn->getOperand(3).setIsUndef();
2907  } else {
2908  // Move index from VCC into M0
2909  if (Offset == 0) {
2910  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2911  .addReg(CurrentIdxReg, RegState::Kill);
2912  } else {
2913  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2914  .addReg(CurrentIdxReg, RegState::Kill)
2915  .addImm(Offset);
2916  }
2917  }
2918 
2919  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
2920  MachineInstr *InsertPt =
2921  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
2922  .addReg(AMDGPU::EXEC)
2923  .addReg(NewExec);
2924 
2925  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2926  // s_cbranch_scc0?
2927 
2928  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2929  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2930  .addMBB(&LoopBB);
2931 
2932  return InsertPt->getIterator();
2933 }
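// The loop built above corresponds roughly to this waterfall sequence:
//   loop:
//     v_readfirstlane_b32 s_idx, v_idx
//     v_cmp_eq_u32_e64    s[cond], s_idx, v_idx
//     s_and_saveexec_b64  s[saved], s[cond]
//     ; the indirect move is inserted here, using M0 or GPR-index mode
//     s_xor_b64           exec, exec, s[saved]
//     s_cbranch_execnz    loop
// Each iteration handles every lane that shares the current index value.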
2934 
2935 // This has slightly sub-optimal register allocation when the source vector is
2936 // killed by the read. The register allocator does not understand that the kill
2937 // is per-workitem, so the vector is kept alive for the whole loop and we end up
2938 // not re-using a subregister from it, using one more VGPR than necessary. That
2939 // VGPR was saved back when this was expanded after register allocation.
2941  MachineBasicBlock &MBB,
2942  MachineInstr &MI,
2943  unsigned InitResultReg,
2944  unsigned PhiReg,
2945  int Offset,
2946  bool UseGPRIdxMode,
2947  bool IsIndirectSrc) {
2948  MachineFunction *MF = MBB.getParent();
2950  const DebugLoc &DL = MI.getDebugLoc();
2952 
2953  unsigned DstReg = MI.getOperand(0).getReg();
2954  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2955  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2956 
2957  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
2958 
2959  // Save the EXEC mask
2960  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
2961  .addReg(AMDGPU::EXEC);
2962 
2963  // To insert the loop we need to split the block. Move everything after this
2964  // point to a new block, and insert a new empty block between the two.
2966  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2967  MachineFunction::iterator MBBI(MBB);
2968  ++MBBI;
2969 
2970  MF->insert(MBBI, LoopBB);
2971  MF->insert(MBBI, RemainderBB);
2972 
2973  LoopBB->addSuccessor(LoopBB);
2974  LoopBB->addSuccessor(RemainderBB);
2975 
2976  // Move the rest of the block into a new block.
2977  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
2978  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2979 
2980  MBB.addSuccessor(LoopBB);
2981 
2982  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2983 
2984  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
2985  InitResultReg, DstReg, PhiReg, TmpExec,
2986  Offset, UseGPRIdxMode, IsIndirectSrc);
2987 
2988  MachineBasicBlock::iterator First = RemainderBB->begin();
2989  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
2990  .addReg(SaveExec);
2991 
2992  return InsPt;
2993 }
2994 
2995 // Returns subreg index, offset
2996 static std::pair<unsigned, int>
2998  const TargetRegisterClass *SuperRC,
2999  unsigned VecReg,
3000  int Offset) {
3001  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3002 
3003  // Skip out of bounds offsets, or else we would end up using an undefined
3004  // register.
3005  if (Offset >= NumElts || Offset < 0)
3006  return std::make_pair(AMDGPU::sub0, Offset);
3007 
3008  return std::make_pair(AMDGPU::sub0 + Offset, 0);
3009 }
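// For example, with a 128-bit register class (four 32-bit lanes) a constant
// offset of 2 yields (sub2, 0): the subregister is chosen statically and no
// runtime add is needed. An offset of 7 is out of range for that class, so
// (sub0, 7) is returned and the add is folded into the M0 / GPR-index setup.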
3010 
3011 // Return true if the index is an SGPR and was set.
3014  MachineInstr &MI,
3015  int Offset,
3016  bool UseGPRIdxMode,
3017  bool IsIndirectSrc) {
3018  MachineBasicBlock *MBB = MI.getParent();
3019  const DebugLoc &DL = MI.getDebugLoc();
3021 
3022  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3023  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3024 
3025  assert(Idx->getReg() != AMDGPU::NoRegister);
3026 
3027  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
3028  return false;
3029 
3030  if (UseGPRIdxMode) {
3031  unsigned IdxMode = IsIndirectSrc ?
3032  AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
3033  if (Offset == 0) {
3034  MachineInstr *SetOn =
3035  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3036  .add(*Idx)
3037  .addImm(IdxMode);
3038 
3039  SetOn->getOperand(3).setIsUndef();
3040  } else {
3041  unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3042  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3043  .add(*Idx)
3044  .addImm(Offset);
3045  MachineInstr *SetOn =
3046  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3047  .addReg(Tmp, RegState::Kill)
3048  .addImm(IdxMode);
3049 
3050  SetOn->getOperand(3).setIsUndef();
3051  }
3052 
3053  return true;
3054  }
3055 
3056  if (Offset == 0) {
3057  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3058  .add(*Idx);
3059  } else {
3060  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3061  .add(*Idx)
3062  .addImm(Offset);
3063  }
3064 
3065  return true;
3066 }
3067 
3068 // Control flow needs to be inserted if indexing with a VGPR.
3070  MachineBasicBlock &MBB,
3071  const GCNSubtarget &ST) {
3072  const SIInstrInfo *TII = ST.getInstrInfo();
3073  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3074  MachineFunction *MF = MBB.getParent();
3076 
3077  unsigned Dst = MI.getOperand(0).getReg();
3078  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3079  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3080 
3081  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3082 
3083  unsigned SubReg;
3084  std::tie(SubReg, Offset)
3085  = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3086 
3087  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3088 
3089  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
3091  const DebugLoc &DL = MI.getDebugLoc();
3092 
3093  if (UseGPRIdxMode) {
3094  // TODO: Look at the uses to avoid the copy. This may require rescheduling
3095  // to avoid interfering with other uses, so probably requires a new
3096  // optimization pass.
3097  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3098  .addReg(SrcReg, RegState::Undef, SubReg)
3099  .addReg(SrcReg, RegState::Implicit)
3100  .addReg(AMDGPU::M0, RegState::Implicit);
3101  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3102  } else {
3103  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3104  .addReg(SrcReg, RegState::Undef, SubReg)
3105  .addReg(SrcReg, RegState::Implicit);
3106  }
3107 
3108  MI.eraseFromParent();
3109 
3110  return &MBB;
3111  }
3112 
3113  const DebugLoc &DL = MI.getDebugLoc();
3115 
3116  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3117  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3118 
3119  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3120 
3121  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3122  Offset, UseGPRIdxMode, true);
3123  MachineBasicBlock *LoopBB = InsPt->getParent();
3124 
3125  if (UseGPRIdxMode) {
3126  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3127  .addReg(SrcReg, RegState::Undef, SubReg)
3128  .addReg(SrcReg, RegState::Implicit)
3129  .addReg(AMDGPU::M0, RegState::Implicit);
3130  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3131  } else {
3132  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3133  .addReg(SrcReg, RegState::Undef, SubReg)
3134  .addReg(SrcReg, RegState::Implicit);
3135  }
3136 
3137  MI.eraseFromParent();
3138 
3139  return LoopBB;
3140 }
3141 
3142 static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3143  const TargetRegisterClass *VecRC) {
3144  switch (TRI.getRegSizeInBits(*VecRC)) {
3145  case 32: // 4 bytes
3146  return AMDGPU::V_MOVRELD_B32_V1;
3147  case 64: // 8 bytes
3148  return AMDGPU::V_MOVRELD_B32_V2;
3149  case 128: // 16 bytes
3150  return AMDGPU::V_MOVRELD_B32_V4;
3151  case 256: // 32 bytes
3152  return AMDGPU::V_MOVRELD_B32_V8;
3153  case 512: // 64 bytes
3154  return AMDGPU::V_MOVRELD_B32_V16;
3155  default:
3156  llvm_unreachable("unsupported size for MOVRELD pseudos");
3157  }
3158 }
3159 
3161  MachineBasicBlock &MBB,
3162  const GCNSubtarget &ST) {
3163  const SIInstrInfo *TII = ST.getInstrInfo();
3164  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3165  MachineFunction *MF = MBB.getParent();
3167 
3168  unsigned Dst = MI.getOperand(0).getReg();
3169  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3170  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3171  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3172  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3173  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3174 
3175  // This can be an immediate, but will be folded later.
3176  assert(Val->getReg());
3177 
3178  unsigned SubReg;
3179  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3180  SrcVec->getReg(),
3181  Offset);
3182  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3183 
3184  if (Idx->getReg() == AMDGPU::NoRegister) {
3186  const DebugLoc &DL = MI.getDebugLoc();
3187 
3188  assert(Offset == 0);
3189 
3190  BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3191  .add(*SrcVec)
3192  .add(*Val)
3193  .addImm(SubReg);
3194 
3195  MI.eraseFromParent();
3196  return &MBB;
3197  }
3198 
3199  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
3201  const DebugLoc &DL = MI.getDebugLoc();
3202 
3203  if (UseGPRIdxMode) {
3204  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3205  .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3206  .add(*Val)
3207  .addReg(Dst, RegState::ImplicitDefine)
3208  .addReg(SrcVec->getReg(), RegState::Implicit)
3209  .addReg(AMDGPU::M0, RegState::Implicit);
3210 
3211  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3212  } else {
3213  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3214 
3215  BuildMI(MBB, I, DL, MovRelDesc)
3216  .addReg(Dst, RegState::Define)
3217  .addReg(SrcVec->getReg())
3218  .add(*Val)
3219  .addImm(SubReg - AMDGPU::sub0);
3220  }
3221 
3222  MI.eraseFromParent();
3223  return &MBB;
3224  }
3225 
3226  if (Val->isReg())
3227  MRI.clearKillFlags(Val->getReg());
3228 
3229  const DebugLoc &DL = MI.getDebugLoc();
3230 
3231  unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3232 
3233  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
3234  Offset, UseGPRIdxMode, false);
3235  MachineBasicBlock *LoopBB = InsPt->getParent();
3236 
3237  if (UseGPRIdxMode) {
3238  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3239  .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3240  .add(*Val) // src0
3242  .addReg(PhiReg, RegState::Implicit)
3243  .addReg(AMDGPU::M0, RegState::Implicit);
3244  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3245  } else {
3246  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3247 
3248  BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3249  .addReg(Dst, RegState::Define)
3250  .addReg(PhiReg)
3251  .add(*Val)
3252  .addImm(SubReg - AMDGPU::sub0);
3253  }
3254 
3255  MI.eraseFromParent();
3256 
3257  return LoopBB;
3258 }
3259 
3261  MachineInstr &MI, MachineBasicBlock *BB) const {
3262 
3263  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3264  MachineFunction *MF = BB->getParent();
3266 
3267  if (TII->isMIMG(MI)) {
3268  if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3269  report_fatal_error("missing mem operand from MIMG instruction");
3270  }
3271  // Add a memoperand for mimg instructions so that they aren't assumed to
3272  // be ordered memory instructions.
3273 
3274  return BB;
3275  }
3276 
3277  switch (MI.getOpcode()) {
3278  case AMDGPU::S_ADD_U64_PSEUDO:
3279  case AMDGPU::S_SUB_U64_PSEUDO: {
3281  const DebugLoc &DL = MI.getDebugLoc();
3282 
3283  MachineOperand &Dest = MI.getOperand(0);
3284  MachineOperand &Src0 = MI.getOperand(1);
3285  MachineOperand &Src1 = MI.getOperand(2);
3286 
3287  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3288  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3289 
3290  MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3291  Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3292  &AMDGPU::SReg_32_XM0RegClass);
3293  MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3294  Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3295  &AMDGPU::SReg_32_XM0RegClass);
3296 
3297  MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3298  Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3299  &AMDGPU::SReg_32_XM0RegClass);
3300  MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3301  Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3302  &AMDGPU::SReg_32_XM0RegClass);
3303 
3304  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3305 
3306  unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3307  unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3308  BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3309  .add(Src0Sub0)
3310  .add(Src1Sub0);
3311  BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3312  .add(Src0Sub1)
3313  .add(Src1Sub1);
3314  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3315  .addReg(DestSub0)
3316  .addImm(AMDGPU::sub0)
3317  .addReg(DestSub1)
3318  .addImm(AMDGPU::sub1);
3319  MI.eraseFromParent();
3320  return BB;
3321  }
3322  case AMDGPU::SI_INIT_M0: {
3323  BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
3324  TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3325  .add(MI.getOperand(0));
3326  MI.eraseFromParent();
3327  return BB;
3328  }
3329  case AMDGPU::SI_INIT_EXEC:
3330  // This should be before all vector instructions.
3331  BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3332  AMDGPU::EXEC)
3333  .addImm(MI.getOperand(0).getImm());
3334  MI.eraseFromParent();
3335  return BB;
3336 
3337  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3338  // Extract the thread count from an SGPR input and set EXEC accordingly.
3339  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3340  //
3341  // S_BFE_U32 count, input, {shift, 7}
3342  // S_BFM_B64 exec, count, 0
3343  // S_CMP_EQ_U32 count, 64
3344  // S_CMOV_B64 exec, -1
3345  MachineInstr *FirstMI = &*BB->begin();
3347  unsigned InputReg = MI.getOperand(0).getReg();
3348  unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3349  bool Found = false;
3350 
3351  // Move the COPY of the input reg to the beginning, so that we can use it.
3352  for (auto I = BB->begin(); I != &MI; I++) {
3353  if (I->getOpcode() != TargetOpcode::COPY ||
3354  I->getOperand(0).getReg() != InputReg)
3355  continue;
3356 
3357  if (I == FirstMI) {
3358  FirstMI = &*++BB->begin();
3359  } else {
3360  I->removeFromParent();
3361  BB->insert(FirstMI, &*I);
3362  }
3363  Found = true;
3364  break;
3365  }
3366  assert(Found);
3367  (void)Found;
3368 
3369  // This should be before all vector instructions.
3370  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3371  .addReg(InputReg)
3372  .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3373  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3374  AMDGPU::EXEC)
3375  .addReg(CountReg)
3376  .addImm(0);
3377  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3378  .addReg(CountReg, RegState::Kill)
3379  .addImm(64);
3380  BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3381  AMDGPU::EXEC)
3382  .addImm(-1);
3383  MI.eraseFromParent();
3384  return BB;
3385  }
3386 
3387  case AMDGPU::GET_GROUPSTATICSIZE: {
3388  DebugLoc DL = MI.getDebugLoc();
3389  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
3390  .add(MI.getOperand(0))
3391  .addImm(MFI->getLDSSize());
3392  MI.eraseFromParent();
3393  return BB;
3394  }
3395  case AMDGPU::SI_INDIRECT_SRC_V1:
3396  case AMDGPU::SI_INDIRECT_SRC_V2:
3397  case AMDGPU::SI_INDIRECT_SRC_V4:
3398  case AMDGPU::SI_INDIRECT_SRC_V8:
3399  case AMDGPU::SI_INDIRECT_SRC_V16:
3400  return emitIndirectSrc(MI, *BB, *getSubtarget());
3401  case AMDGPU::SI_INDIRECT_DST_V1:
3402  case AMDGPU::SI_INDIRECT_DST_V2:
3403  case AMDGPU::SI_INDIRECT_DST_V4:
3404  case AMDGPU::SI_INDIRECT_DST_V8:
3405  case AMDGPU::SI_INDIRECT_DST_V16:
3406  return emitIndirectDst(MI, *BB, *getSubtarget());
3407  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3408  case AMDGPU::SI_KILL_I1_PSEUDO:
3409  return splitKillBlock(MI, BB);
3410  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3412 
3413  unsigned Dst = MI.getOperand(0).getReg();
3414  unsigned Src0 = MI.getOperand(1).getReg();
3415  unsigned Src1 = MI.getOperand(2).getReg();
3416  const DebugLoc &DL = MI.getDebugLoc();
3417  unsigned SrcCond = MI.getOperand(3).getReg();
3418 
3419  unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3420  unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3421  unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3422 
3423  BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3424  .addReg(SrcCond);
3425  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3426  .addReg(Src0, 0, AMDGPU::sub0)
3427  .addReg(Src1, 0, AMDGPU::sub0)
3428  .addReg(SrcCondCopy);
3429  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3430  .addReg(Src0, 0, AMDGPU::sub1)
3431  .addReg(Src1, 0, AMDGPU::sub1)
3432  .addReg(SrcCondCopy);
3433 
3434  BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3435  .addReg(DstLo)
3436  .addImm(AMDGPU::sub0)
3437  .addReg(DstHi)
3438  .addImm(AMDGPU::sub1);
3439  MI.eraseFromParent();
3440  return BB;
3441  }
3442  case AMDGPU::SI_BR_UNDEF: {
3443  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3444  const DebugLoc &DL = MI.getDebugLoc();
3445  MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3446  .add(MI.getOperand(0));
3447  Br->getOperand(1).setIsUndef(true); // read undef SCC
3448  MI.eraseFromParent();
3449  return BB;
3450  }
3451  case AMDGPU::ADJCALLSTACKUP:
3452  case AMDGPU::ADJCALLSTACKDOWN: {
3454  MachineInstrBuilder MIB(*MF, &MI);
3455 
3456  // Add an implicit use of the frame offset reg to prevent the restore copy
3457  // inserted after the call from being reordered after stack operations in
3458  // the caller's frame.
3459  MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
3460  .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3461  .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
3462  return BB;
3463  }
3464  case AMDGPU::SI_CALL_ISEL:
3465  case AMDGPU::SI_TCRETURN_ISEL: {
3466  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3467  const DebugLoc &DL = MI.getDebugLoc();
3468  unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
3469 
3471  unsigned GlobalAddrReg = MI.getOperand(0).getReg();
3472  MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
3473  assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
3474 
3475  const GlobalValue *G = PCRel->getOperand(1).getGlobal();
3476 
3477  MachineInstrBuilder MIB;
3478  if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
3479  MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
3480  .add(MI.getOperand(0))
3481  .addGlobalAddress(G);
3482  } else {
3483  MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
3484  .add(MI.getOperand(0))
3485  .addGlobalAddress(G);
3486 
3487  // There is an additional imm operand for tcreturn, but it should be in the
3488  // right place already.
3489  }
3490 
3491  for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3492  MIB.add(MI.getOperand(I));
3493 
3494  MIB.cloneMemRefs(MI);
3495  MI.eraseFromParent();
3496  return BB;
3497  }
3498  default:
3500  }
3501 }
3502 
3504  return isTypeLegal(VT.getScalarType());
3505 }
3506 
3508  // This currently forces unfolding various combinations of fsub into fma with
3509  // free fneg'd operands. As long as we have fast FMA (controlled by
3510  // isFMAFasterThanFMulAndFAdd), we should perform these.
3511 
3512  // When fma is quarter rate, for f64 where add / sub are at best half rate,
3513  // most of these combines appear to be cycle neutral but save on instruction
3514  // count / code size.
3515  return true;
3516 }
3517 
3519  EVT VT) const {
3520  if (!VT.isVector()) {
3521  return MVT::i1;
3522  }
3523  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3524 }
3525 
3527  // TODO: Should i16 be used always if legal? For now it would force VALU
3528  // shifts.
3529  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
3530 }
3531 
3532 // Answering this is somewhat tricky and depends on the specific device, since
3533 // different devices have different rates for fma and for f64 operations.
3534 //
3535 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3536 // regardless of which device (although the number of cycles differs between
3537 // devices), so it is always profitable for f64.
3538 //
3539 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3540 // only on full-rate devices. Normally we should prefer selecting v_mad_f32,
3541 // which we can always do even without fused FP ops, since it returns the same
3542 // result as the separate operations and is always full rate. Therefore, we lie
3543 // and report that fma is not faster for f32. However, v_mad_f32 does not
3544 // support denormals, so we do report fma as faster if we have a fast-fma
3545 // device and denormals are required.
3546 //
3548  VT = VT.getScalarType();
3549 
3550  switch (VT.getSimpleVT().SimpleTy) {
3551  case MVT::f32: {
3552  // This is as fast on some subtargets. However, we always have full rate f32
3553  // mad available which returns the same result as the separate operations
3554  // which we should prefer over fma. We can't use this if we want to support
3555  // denormals, so only report this in these cases.
3556  if (Subtarget->hasFP32Denormals())
3557  return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3558 
3559  // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3560  return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3561  }
3562  case MVT::f64:
3563  return true;
3564  case MVT::f16:
3565  return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
3566  default:
3567  break;
3568  }
3569 
3570  return false;
3571 }
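// Summarizing the switch above: f64 is always profitable; f16 is profitable
// when the subtarget has 16-bit instructions and f16 denormals are enabled;
// f32 is only reported as profitable when the fast-FMA / DL-instruction and
// denormal conditions line up, since otherwise full-rate v_mad_f32 is at
// least as good.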
3572 
3573 //===----------------------------------------------------------------------===//
3574 // Custom DAG Lowering Operations
3575 //===----------------------------------------------------------------------===//
3576 
3577 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3578 // wider vector type is legal.
3580  SelectionDAG &DAG) const {
3581  unsigned Opc = Op.getOpcode();
3582  EVT VT = Op.getValueType();
3583  assert(VT == MVT::v4f16);
3584 
3585  SDValue Lo, Hi;
3586  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3587 
3588  SDLoc SL(Op);
3589  SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3590  Op->getFlags());
3591  SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3592  Op->getFlags());
3593 
3594  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3595 }
3596 
3597 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3598 // wider vector type is legal.
3600  SelectionDAG &DAG) const {
3601  unsigned Opc = Op.getOpcode();
3602  EVT VT = Op.getValueType();
3603  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3604 
3605  SDValue Lo0, Hi0;
3606  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3607  SDValue Lo1, Hi1;
3608  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3609 
3610  SDLoc SL(Op);
3611 
3612  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3613  Op->getFlags());
3614  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3615  Op->getFlags());
3616 
3617  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3618 }
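// For example, an fadd of two v4f16 operands is split here into two v2f16
// fadds whose results are rejoined with CONCAT_VECTORS, rather than letting
// LegalizeDAG scalarize the operation into four f16 adds.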
3619 
3621  switch (Op.getOpcode()) {
3622  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3623  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3624  case ISD::LOAD: {
3625  SDValue Result = LowerLOAD(Op, DAG);
3626  assert((!Result.getNode() ||
3627  Result.getNode()->getNumValues() == 2) &&
3628  "Load should return a value and a chain");
3629  return Result;
3630  }
3631 
3632  case ISD::FSIN:
3633  case ISD::FCOS:
3634  return LowerTrig(Op, DAG);
3635  case ISD::SELECT: return LowerSELECT(Op, DAG);
3636  case ISD::FDIV: return LowerFDIV(Op, DAG);
3637  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3638  case ISD::STORE: return LowerSTORE(Op, DAG);
3639  case ISD::GlobalAddress: {
3640  MachineFunction &MF = DAG.getMachineFunction();
3642  return LowerGlobalAddress(MFI, Op, DAG);
3643  }
3644  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3645  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3646  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3647  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3649  return lowerINSERT_VECTOR_ELT(Op, DAG);
3651  return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3652  case ISD::BUILD_VECTOR:
3653  return lowerBUILD_VECTOR(Op, DAG);
3654  case ISD::FP_ROUND:
3655  return lowerFP_ROUND(Op, DAG);
3656  case ISD::TRAP:
3657  return lowerTRAP(Op, DAG);
3658  case ISD::DEBUGTRAP:
3659  return lowerDEBUGTRAP(Op, DAG);
3660  case ISD::FABS:
3661  case ISD::FNEG:
3662  case ISD::FCANONICALIZE:
3663  return splitUnaryVectorOp(Op, DAG);
3664  case ISD::FMINNUM:
3665  case ISD::FMAXNUM:
3666  return lowerFMINNUM_FMAXNUM(Op, DAG);
3667  case ISD::SHL:
3668  case ISD::SRA:
3669  case ISD::SRL:
3670  case ISD::ADD:
3671  case ISD::SUB:
3672  case ISD::MUL:
3673  case ISD::SMIN:
3674  case ISD::SMAX:
3675  case ISD::UMIN:
3676  case ISD::UMAX:
3677  case ISD::FADD:
3678  case ISD::FMUL:
3679  case ISD::FMINNUM_IEEE:
3680  case ISD::FMAXNUM_IEEE:
3681  return splitBinaryVectorOp(Op, DAG);
3682  }
3683  return SDValue();
3684 }
3685 
3687  const SDLoc &DL,
3688  SelectionDAG &DAG, bool Unpacked) {
3689  if (!LoadVT.isVector())
3690  return Result;
3691 
3692  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3693  // Truncate to v2i16/v4i16.
3694  EVT IntLoadVT = LoadVT.changeTypeToInteger();
3695 
3696  // Work around the legalizer not scalarizing the truncate after vector op
3697  // legalization, by not creating an intermediate vector truncate.
3699  DAG.ExtractVectorElements(Result, Elts);
3700  for (SDValue &Elt : Elts)
3701  Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3702 
3703  Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3704 
3705  // Bitcast to original type (v2f16/v4f16).
3706  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3707  }
3708 
3709  // Cast back to the original packed type.
3710  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3711 }
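// Example of the unpacked-D16 case: a d16 load producing v2f16 is actually
// issued as v2i32 (one half per dword); each element is truncated to i16, the
// pair is rebuilt as v2i16, and the result is bitcast back to v2f16 here.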
3712 
3713 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3714  MemSDNode *M,
3715  SelectionDAG &DAG,
3716  ArrayRef<SDValue> Ops,
3717  bool IsIntrinsic) const {
3718  SDLoc DL(M);
3719 
3720  bool Unpacked = Subtarget->hasUnpackedD16VMem();
3721  EVT LoadVT = M->getValueType(0);
3722 
3723  EVT EquivLoadVT = LoadVT;
3724  if (Unpacked && LoadVT.isVector()) {
3725  EquivLoadVT = LoadVT.isVector() ?
3726  EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3727  LoadVT.getVectorNumElements()) : LoadVT;
3728  }
3729 
3730  // Change from v4f16/v2f16 to EquivLoadVT.
3731  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3732 
3733  SDValue Load
3734  = DAG.getMemIntrinsicNode(
3735  IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3736  VTList, Ops, M->getMemoryVT(),
3737  M->getMemOperand());
3738  if (!Unpacked) // Just adjusted the opcode.
3739  return Load;
3740 
3741  SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
3742 
3743  return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
3744 }
3745 
3747  SDNode *N, SelectionDAG &DAG) {
3748  EVT VT = N->getValueType(0);
3749  const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3750  if (!CD)
3751  return DAG.getUNDEF(VT);
3752 
3753  int CondCode = CD->getSExtValue();
3754  if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3755  CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3756  return DAG.getUNDEF(VT);
3757 
3758  ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3759 
3760 
3761  SDValue LHS = N->getOperand(1);
3762  SDValue RHS = N->getOperand(2);
3763 
3764  SDLoc DL(N);
3765 
3766  EVT CmpVT = LHS.getValueType();
3767  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3768  unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
3769  ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3770  LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
3771  RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
3772  }
3773 
3774  ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
3775 
3776  return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
3777  DAG.getCondCode(CCOpcode));
3778 }
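// For example, llvm.amdgcn.icmp.i32(%a, %b, 32), where 32 is ICmpInst::ICMP_EQ,
// becomes an AMDGPUISD::SETCC with condition SETEQ producing the intrinsic's
// lane-mask result type (i64 for wave64).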
3779 
3781  SDNode *N, SelectionDAG &DAG) {
3782  EVT VT = N->getValueType(0);
3783  const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3784  if (!CD)
3785  return DAG.getUNDEF(VT);
3786 
3787  int CondCode = CD->getSExtValue();
3788  if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
3789  CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
3790  return DAG.getUNDEF(VT);
3791  }
3792 
3793  SDValue Src0 = N->getOperand(1);
3794  SDValue Src1 = N->getOperand(2);
3795  EVT CmpVT = Src0.getValueType();
3796  SDLoc SL(N);
3797 
3798  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
3799  Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3800  Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3801  }
3802 
3803  FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
3804  ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
3805  return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
3806  Src1, DAG.getCondCode(CCOpcode));
3807 }
3808 
3811  SelectionDAG &DAG) const {
3812  switch (N->getOpcode()) {
3813  case ISD::INSERT_VECTOR_ELT: {
3814  if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3815  Results.push_back(Res);
3816  return;
3817  }
3818  case ISD::EXTRACT_VECTOR_ELT: {
3819  if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3820  Results.push_back(Res);
3821  return;
3822  }
3823  case ISD::INTRINSIC_WO_CHAIN: {
3824  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3825  switch (IID) {
3826  case Intrinsic::amdgcn_cvt_pkrtz: {
3827  SDValue Src0 = N->getOperand(1);
3828  SDValue Src1 = N->getOperand(2);
3829  SDLoc SL(N);
3830  SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3831  Src0, Src1);
3832  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3833  return;
3834  }
3835  case Intrinsic::amdgcn_cvt_pknorm_i16:
3836  case Intrinsic::amdgcn_cvt_pknorm_u16:
3837  case Intrinsic::amdgcn_cvt_pk_i16:
3838  case Intrinsic::amdgcn_cvt_pk_u16: {
3839  SDValue Src0 = N->getOperand(1);
3840  SDValue Src1 = N->getOperand(2);
3841  SDLoc SL(N);
3842  unsigned Opcode;
3843 
3844  if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3845  Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3846  else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3847  Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3848  else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3849  Opcode = AMDGPUISD::CVT_PK_I16_I32;
3850  else
3851  Opcode = AMDGPUISD::CVT_PK_U16_U32;
3852 
3853  EVT VT = N->getValueType(0);
3854  if (isTypeLegal(VT))
3855  Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
3856  else {
3857  SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3858  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3859  }
3860  return;
3861  }
3862  }
3863  break;
3864  }
3865  case ISD::INTRINSIC_W_CHAIN: {
3866  if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
3867  Results.push_back(Res);
3868  Results.push_back(Res.getValue(1));
3869  return;
3870  }
3871 
3872  break;
3873  }
3874  case ISD::SELECT: {
3875  SDLoc SL(N);
3876  EVT VT = N->getValueType(0);
3877  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3878  SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3879  SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3880 
3881  EVT SelectVT = NewVT;
3882  if (NewVT.bitsLT(MVT::i32)) {
3883  LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3884  RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3885  SelectVT = MVT::i32;
3886  }
3887 
3888  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3889  N->getOperand(0), LHS, RHS);
3890 
3891  if (NewVT != SelectVT)
3892  NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3893  Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3894  return;
3895  }
3896  case ISD::FNEG: {
3897  if (N->getValueType(0) != MVT::v2f16)
3898  break;
3899 
3900  SDLoc SL(N);
3901  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3902 
3903  SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3904  BC,
3905  DAG.getConstant(0x80008000, SL, MVT::i32));
3906  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3907  return;
3908  }
3909  case ISD::FABS: {
3910  if (N->getValueType(0) != MVT::v2f16)
3911  break;
3912 
3913  SDLoc SL(N);
3914  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3915 
3916  SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3917  BC,
3918  DAG.getConstant(0x7fff7fff, SL, MVT::i32));
3919  Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3920  return;
3921  }
3922  default:
3923  break;
3924  }
3925 }
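// A worked example of the packed-half constants used just above (hypothetical
// values, not from this file): XOR with 0x80008000 flips the sign bit of both
// f16 halves in one i32 operation, AND with 0x7fff7fff clears both sign bits.
namespace {
constexpr unsigned PackedHalves = 0xbc003c00u; // raw bits of {-1.0h, 1.0h}
static_assert((PackedHalves ^ 0x80008000u) == 0x3c00bc00u, "FNEG flips both sign bits");
static_assert((PackedHalves & 0x7fff7fffu) == 0x3c003c00u, "FABS clears both sign bits");
}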
3926 
3927 /// Helper function for LowerBRCOND
3928 static SDNode *findUser(SDValue Value, unsigned Opcode) {
3929 
3930  SDNode *Parent = Value.getNode();
3931  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3932  I != E; ++I) {
3933 
3934  if (I.getUse().get() != Value)
3935  continue;
3936 
3937  if (I->getOpcode() == Opcode)
3938  return *I;
3939  }
3940  return nullptr;
3941 }
3942 
3943 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
3944  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3945  switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
3946  case Intrinsic::amdgcn_if:
3947  return AMDGPUISD::IF;
3948  case Intrinsic::amdgcn_else:
3949  return AMDGPUISD::ELSE;
3950  case Intrinsic::amdgcn_loop:
3951  return AMDGPUISD::LOOP;
3952  case Intrinsic::amdgcn_end_cf:
3953  llvm_unreachable("should not occur");
3954  default:
3955  return 0;
3956  }
3957  }
3958 
3959  // break, if_break, else_break are all only used as inputs to loop, not
3960  // directly as branch conditions.
3961  return 0;
3962 }
3963 
3964 void SITargetLowering::createDebuggerPrologueStackObjects(
3965  MachineFunction &MF) const {
3966  // Create stack objects that are used for emitting debugger prologue.
3967  //
3968  // Debugger prologue writes work group IDs and work item IDs to scratch memory
3969  // at fixed location in the following format:
3970  // offset 0: work group ID x
3971  // offset 4: work group ID y
3972  // offset 8: work group ID z
3973  // offset 16: work item ID x
3974  // offset 20: work item ID y
3975  // offset 24: work item ID z
3976  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3977  int ObjectIdx = 0;
3978 
3979  // For each dimension:
3980  for (unsigned i = 0; i < 3; ++i) {
3981  // Create fixed stack object for work group ID.
3982  ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
3983  Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
3984  // Create fixed stack object for work item ID.
3985  ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
3986  Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
3987  }
3988 }
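// A minimal sketch of the fixed scratch layout documented above, expressed as
// offset formulas (hypothetical helpers, not part of this file).
static constexpr unsigned debuggerWorkGroupIDOffset(unsigned Dim) { return Dim * 4; }
static constexpr unsigned debuggerWorkItemIDOffset(unsigned Dim) { return 16 + Dim * 4; }
static_assert(debuggerWorkGroupIDOffset(2) == 8, "work group ID z lives at offset 8");
static_assert(debuggerWorkItemIDOffset(2) == 24, "work item ID z lives at offset 24");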
3989 
3990 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3991  const Triple &TT = getTargetMachine().getTargetTriple();
3992  return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3993  GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
3994  AMDGPU::shouldEmitConstantsToTextSection(TT);
3995 }
3996 
3997 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
3998  return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
3999  GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4000  GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
4001  !shouldEmitFixup(GV) &&
4002  !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
4003 }
4004 
4005 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
4006  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
4007 }
4008 
4009 /// This transforms the control flow intrinsics to get the branch destination as
4010 /// the last parameter, and also switches the branch target with BR if the need arises.
4011 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
4012  SelectionDAG &DAG) const {
4013  SDLoc DL(BRCOND);
4014 
4015  SDNode *Intr = BRCOND.getOperand(1).getNode();
4016  SDValue Target = BRCOND.getOperand(2);
4017  SDNode *BR = nullptr;
4018  SDNode *SetCC = nullptr;
4019 
4020  if (Intr->getOpcode() == ISD::SETCC) {
4021  // As long as we negate the condition everything is fine
4022  SetCC = Intr;
4023  Intr = SetCC->getOperand(0).getNode();
4024 
4025  } else {
4026  // Get the target from BR if we don't negate the condition
4027  BR = findUser(BRCOND, ISD::BR);
4028  Target = BR->getOperand(1);
4029  }
4030 
4031  // FIXME: This changes the types of the intrinsics instead of introducing new
4032  // nodes with the correct types.
4033  // e.g. llvm.amdgcn.loop
4034 
4035  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
4036  // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
4037 
4038  unsigned CFNode = isCFIntrinsic(Intr);
4039  if (CFNode == 0) {
4040  // This is a uniform branch so we don't need to legalize.
4041  return BRCOND;
4042  }
4043 
4044  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
4045  Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
4046 
4047  assert(!SetCC ||
4048  (SetCC->getConstantOperandVal(1) == 1 &&
4049  cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
4050  ISD::SETNE));
4051 
4052  // operands of the new intrinsic call
4054  if (HaveChain)
4055  Ops.push_back(BRCOND.getOperand(0));
4056 
4057  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
4058  Ops.push_back(Target);
4059 
4060  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
4061 
4062  // build the new intrinsic call
4063  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
4064 
4065  if (!HaveChain) {
4066  SDValue Ops[] = {
4067  SDValue(Result, 0),
4068  BRCOND.getOperand(0)
4069  };
4070 
4071  Result = DAG.getMergeValues(Ops, DL).getNode();
4072  }
4073 
4074  if (BR) {
4075  // Give the branch instruction our target
4076  SDValue Ops[] = {
4077  BR->getOperand(0),
4078  BRCOND.getOperand(2)
4079  };
4080  SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
4081  DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
4082  BR = NewBR.getNode();
4083  }
4084 
4085  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4086 
4087  // Copy the intrinsic results to registers
4088  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4089  SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4090  if (!CopyToReg)
4091  continue;
4092 
4093  Chain = DAG.getCopyToReg(
4094  Chain, DL,
4095  CopyToReg->getOperand(1),
4096  SDValue(Result, i - 1),
4097  SDValue());
4098 
4099  DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4100  }
4101 
4102  // Remove the old intrinsic from the chain
4103  DAG.ReplaceAllUsesOfValueWith(
4104  SDValue(Intr, Intr->getNumValues() - 1),
4105  Intr->getOperand(0));
4106 
4107  return Chain;
4108 }
4109 
4110 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4111  SDValue Op,
4112  const SDLoc &DL,
4113  EVT VT) const {
4114  return Op.getValueType().bitsLE(VT) ?
4115  DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4116  DAG.getNode(ISD::FTRUNC, DL, VT, Op);
4117 }
4118 
4119 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
4120  assert(Op.getValueType() == MVT::f16 &&
4121  "Do not know how to custom lower FP_ROUND for non-f16 type");
4122 
4123  SDValue Src = Op.getOperand(0);
4124  EVT SrcVT = Src.getValueType();
4125  if (SrcVT != MVT::f64)
4126  return Op;
4127 
4128  SDLoc DL(Op);
4129 
4130  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4131  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
4132  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
4133 }
4134 
4135 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
4136  SelectionDAG &DAG) const {
4137  EVT VT = Op.getValueType();
4138  bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
4139 
4140  // FIXME: Assert during selection that this is only selected for
4141  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
4142  // mode functions, but this happens to be OK since it's only done in cases
4143  // where it is known there are no sNaNs.
4144  if (IsIEEEMode)
4145  return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
4146 
4147  if (VT == MVT::v4f16)
4148  return splitBinaryVectorOp(Op, DAG);
4149  return Op;
4150 }
4151 
4152 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4153  SDLoc SL(Op);
4154  SDValue Chain = Op.getOperand(0);
4155 
4156  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4157  !Subtarget->isTrapHandlerEnabled())
4158  return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
4159 
4160  MachineFunction &MF = DAG.getMachineFunction();
4161  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4162  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4163  assert(UserSGPR != AMDGPU::NoRegister);
4164  SDValue QueuePtr = CreateLiveInRegister(
4165  DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4166  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4167  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4168  QueuePtr, SDValue());
4169  SDValue Ops[] = {
4170  ToReg,
4171  DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
4172  SGPR01,
4173  ToReg.getValue(1)
4174  };
4175  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4176 }
4177 
4178 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4179  SDLoc SL(Op);
4180  SDValue Chain = Op.getOperand(0);
4181  MachineFunction &MF = DAG.getMachineFunction();
4182 
4183  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4184  !Subtarget->isTrapHandlerEnabled()) {
4185  DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
4186  "debugtrap handler not supported",
4187  Op.getDebugLoc(),
4188  DS_Warning);
4189  LLVMContext &Ctx = MF.getFunction().getContext();
4190  Ctx.diagnose(NoTrap);
4191  return Chain;
4192  }
4193 
4194  SDValue Ops[] = {
4195  Chain,
4196  DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
4197  };
4198  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4199 }
4200 
4201 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
4202  SelectionDAG &DAG) const {
4203  // FIXME: Use inline constants (src_{shared, private}_base) instead.
4204  if (Subtarget->hasApertureRegs()) {
4205  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
4206  AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4207  AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
4208  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
4209  AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4210  AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
4211  unsigned Encoding =
4212  AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4213  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4214  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
4215 
4216  SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4217  SDValue ApertureReg = SDValue(
4218  DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4219  SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4220  return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
4221  }
4222 
4223  MachineFunction &MF = DAG.getMachineFunction();
4224  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4225  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4226  assert(UserSGPR != AMDGPU::NoRegister);
4227 
4228  SDValue QueuePtr = CreateLiveInRegister(
4229  DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4230 
4231  // Offset into amd_queue_t for group_segment_aperture_base_hi /
4232  // private_segment_aperture_base_hi.
4233  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
4234 
4235  SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
4236 
4237  // TODO: Use custom target PseudoSourceValue.
4238  // TODO: We should use the value from the IR intrinsic call, but it might not
4239  // be available and how do we get it?
4240  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
4241  AMDGPUAS::CONSTANT_ADDRESS));
4242 
4243  MachinePointerInfo PtrInfo(V, StructOffset);
4244  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
4245  MinAlign(64, StructOffset),
4246  MachineMemOperand::MODereferenceable |
4247  MachineMemOperand::MOInvariant);
4248 }
4249 
4250 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4251  SelectionDAG &DAG) const {
4252  SDLoc SL(Op);
4253  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4254 
4255  SDValue Src = ASC->getOperand(0);
4256  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4257 
4258  const AMDGPUTargetMachine &TM =
4259  static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4260 
4261  // flat -> local/private
4262  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4263  unsigned DestAS = ASC->getDestAddressSpace();
4264 
4265  if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4266  DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
4267  unsigned NullVal = TM.getNullPointerValue(DestAS);
4268  SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4269  SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4270  SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4271 
4272  return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4273  NonNull, Ptr, SegmentNullPtr);
4274  }
4275  }
4276 
4277  // local/private -> flat
4278  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4279  unsigned SrcAS = ASC->getSrcAddressSpace();
4280 
4281  if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4282  SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
4283  unsigned NullVal = TM.getNullPointerValue(SrcAS);
4284  SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4285 
4286  SDValue NonNull
4287  = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4288 
4289  SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
4290  SDValue CvtPtr
4291  = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4292 
4293  return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4294  DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4295  FlatNullPtr);
4296  }
4297  }
4298 
4299  // global <-> flat are no-ops and never emitted.
4300 
4301  const MachineFunction &MF = DAG.getMachineFunction();
4302  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
4303  MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
4304  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4305 
4306  return DAG.getUNDEF(ASC->getValueType(0));
4307 }
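// A worked example of the local/private -> flat widening above (hypothetical
// values, not from this file): a non-null segment offset is paired with the
// 32-bit aperture as the high half, while the segment null value is mapped to
// the flat null pointer by the select.
namespace {
constexpr unsigned long long ApertureHi = 0x00001234ull; // high 32 bits of the segment
constexpr unsigned long long SegOffset = 0x1000ull;      // non-null segment pointer
constexpr unsigned long long FlatAddr = (ApertureHi << 32) | SegOffset;
static_assert((FlatAddr & 0xffffffffull) == SegOffset, "low half keeps the segment offset");
static_assert((FlatAddr >> 32) == ApertureHi, "high half carries the aperture base");
}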
4308 
4309 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4310  SelectionDAG &DAG) const {
4311  SDValue Vec = Op.getOperand(0);
4312  SDValue InsVal = Op.getOperand(1);
4313  SDValue Idx = Op.getOperand(2);
4314  EVT VecVT = Vec.getValueType();
4315  EVT EltVT = VecVT.getVectorElementType();
4316  unsigned VecSize = VecVT.getSizeInBits();
4317  unsigned EltSize = EltVT.getSizeInBits();
4318 
4319 
4320  assert(VecSize <= 64);
4321 
4322  unsigned NumElts = VecVT.getVectorNumElements();
4323  SDLoc SL(Op);
4324  auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4325 
4326  if (NumElts == 4 && EltSize == 16 && KIdx) {
4327  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4328 
4329  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4330  DAG.getConstant(0, SL, MVT::i32));
4331  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4332  DAG.getConstant(1, SL, MVT::i32));
4333 
4334  SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4335  SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4336 
4337  unsigned Idx = KIdx->getZExtValue();
4338  bool InsertLo = Idx < 2;
4339  SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4340  InsertLo ? LoVec : HiVec,
4341  DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4342  DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4343 
4344  InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4345 
4346  SDValue Concat = InsertLo ?
4347  DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4348  DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4349 
4350  return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4351  }
4352 
4353  if (isa<ConstantSDNode>(Idx))
4354  return SDValue();
4355 
4356  MVT IntVT = MVT::getIntegerVT(VecSize);
4357 
4358  // Avoid stack access for dynamic indexing.
4359  SDValue Val = InsVal;
4360  if (InsVal.getValueType() == MVT::f16)
4361  Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
4362 
4363  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4364  SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
4365 
4366  assert(isPowerOf2_32(EltSize));
4367  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4368 
4369  // Convert vector index to bit-index.
4370  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4371 
4372  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4373  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4374  DAG.getConstant(0xffff, SL, IntVT),
4375  ScaledIdx);
4376 
4377  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4378  SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4379  DAG.getNOT(SL, BFM, IntVT), BCVec);
4380 
4381  SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4382  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
4383 }
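// A worked example of the v_bfm/v_bfi pattern built above (hypothetical
// constants, not from this file): the mask selects the target lane's bits from
// the inserted value and every other bit from the original vector,
// result = (Mask & New) | (~Mask & Old).
namespace {
constexpr unsigned long long OldVec = 0x4444333322221111ull;  // a v4i16 viewed as bits
constexpr unsigned long long LaneMask = 0xffffull << 32;      // 16-bit lane 2
constexpr unsigned long long NewBits = 0xabcdull << 32;       // new value at lane 2's position
static_assert(((LaneMask & NewBits) | (~LaneMask & OldVec)) == 0x4444abcd22221111ull,
              "only the selected lane changes");
}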
4384 
4385 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4386  SelectionDAG &DAG) const {
4387  SDLoc SL(Op);
4388 
4389  EVT ResultVT = Op.getValueType();
4390  SDValue Vec = Op.getOperand(0);
4391  SDValue Idx = Op.getOperand(1);
4392  EVT VecVT = Vec.getValueType();
4393  unsigned VecSize = VecVT.getSizeInBits();
4394  EVT EltVT = VecVT.getVectorElementType();
4395  assert(VecSize <= 64);
4396 
4397  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4398 
4399  // Make sure we do any optimizations that will make it easier to fold
4400  // source modifiers before obscuring it with bit operations.
4401 
4402  // XXX - Why doesn't this get called when vector_shuffle is expanded?
4403  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4404  return Combined;
4405 
4406  unsigned EltSize = EltVT.getSizeInBits();
4407  assert(isPowerOf2_32(EltSize));
4408 
4409  MVT IntVT = MVT::getIntegerVT(VecSize);
4410  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4411 
4412  // Convert vector index to bit-index (* EltSize)
4413  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4414 
4415  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4416  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
4417 
4418  if (ResultVT == MVT::f16) {
4419  SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4420  return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4421  }
4422 
4423  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4424 }
4425 
4426 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4427  SelectionDAG &DAG) const {
4428  SDLoc SL(Op);
4429  EVT VT = Op.getValueType();
4430 
4431  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4432  EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4433 
4434  // Turn into pair of packed build_vectors.
4435  // TODO: Special case for constants that can be materialized with s_mov_b64.
4436  SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4437  { Op.getOperand(0), Op.getOperand(1) });
4438  SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4439  { Op.getOperand(2), Op.getOperand(3) });
4440 
4441  SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4442  SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4443 
4444  SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4445  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4446  }
4447 
4448  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
4449  assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
4450 
4451  SDValue Lo = Op.getOperand(0);
4452  SDValue Hi = Op.getOperand(1);
4453 
4454  // Avoid adding defined bits with the zero_extend.
4455  if (Hi.isUndef()) {
4456  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4457  SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
4458  return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
4459  }
4460 
4461  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
4462  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4463 
4464  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4465  DAG.getConstant(16, SL, MVT::i32));
4466  if (Lo.isUndef())
4467  return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
4468 
4469  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4470  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
4471 
4472  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
4473  return DAG.getNode(ISD::BITCAST, SL, VT, Or);
4474 }
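// A worked example of the pack built above (hypothetical constants, not from
// this file): two 16-bit lanes become one i32 as lo | (hi << 16).
namespace {
constexpr unsigned PackLo = 0x1234u, PackHi = 0xabcdu;
static_assert((PackLo | (PackHi << 16)) == 0xabcd1234u, "the hi lane lands in the upper half");
}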
4475 
4476 bool
4477 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4478  // We can fold offsets for anything that doesn't require a GOT relocation.
4479  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4480  GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4481  GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
4482  !shouldEmitGOTReloc(GA->getGlobal());
4483 }
4484 
4485 static SDValue
4486 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4487  const SDLoc &DL, unsigned Offset, EVT PtrVT,
4488  unsigned GAFlags = SIInstrInfo::MO_NONE) {
4489  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4490  // lowered to the following code sequence:
4491  //
4492  // For constant address space:
4493  // s_getpc_b64 s[0:1]
4494  // s_add_u32 s0, s0, $symbol
4495  // s_addc_u32 s1, s1, 0
4496  //
4497  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4498  // a fixup or relocation is emitted to replace $symbol with a literal
4499  // constant, which is a pc-relative offset from the encoding of the $symbol
4500  // operand to the global variable.
4501  //
4502  // For global address space:
4503  // s_getpc_b64 s[0:1]
4504  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4505  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4506  //
4507  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4508  // fixups or relocations are emitted to replace $symbol@*@lo and
4509  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4510  // which is a 64-bit pc-relative offset from the encoding of the $symbol
4511  // operand to the global variable.
4512  //
4513  // What we want here is an offset from the value returned by s_getpc
4514  // (which is the address of the s_add_u32 instruction) to the global
4515  // variable, but since the encoding of $symbol starts 4 bytes after the start
4516  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4517  // small. This requires us to add 4 to the global variable offset in order to
4518  // compute the correct address.
4519  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4520  GAFlags);
4521  SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4522  GAFlags == SIInstrInfo::MO_NONE ?
4523  GAFlags : GAFlags + 1);
4524  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
4525 }
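// A worked example of the +4 adjustment above (hypothetical addresses, not
// from this file): the relocation is measured from the encoding of $symbol,
// which starts 4 bytes after the address returned by s_getpc_b64, so adding 4
// to the global's offset makes the final sum land on the variable itself.
namespace {
constexpr long long PCRelBase = 0x1000;                        // value returned by s_getpc_b64
constexpr long long GlobalAddr = 0x2000;                       // where the variable lives
constexpr long long FixupValue = GlobalAddr - (PCRelBase + 4); // what the relocation fills in
static_assert(PCRelBase + (FixupValue + 4) == GlobalAddr, "the extra 4 cancels the skew");
}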
4526 
4527 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4528  SDValue Op,
4529  SelectionDAG &DAG) const {
4530  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
4531  const GlobalValue *GV = GSD->getGlobal();
4532  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
4533  GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
4534  GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
4535  return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4536 
4537  SDLoc DL(GSD);
4538  EVT PtrVT = Op.getValueType();
4539 
4540  // FIXME: Should not make address space based decisions here.
4541  if (shouldEmitFixup(GV))
4542  return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
4543  else if (shouldEmitPCReloc(GV))
4544  return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4545  SIInstrInfo::MO_REL32);
4546 
4547  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
4548  SIInstrInfo::MO_GOTPCREL32);
4549 
4550  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
4551  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
4552  const DataLayout &DataLayout = DAG.getDataLayout();
4553  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
4554  MachinePointerInfo PtrInfo
4555  = MachinePointerInfo::getGOT(DAG.getMachineFunction());
4556 
4557  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
4558  MachineMemOperand::MODereferenceable |
4559  MachineMemOperand::MOInvariant);
4560 }
4561 
4562 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
4563  const SDLoc &DL, SDValue V) const {
4564  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
4565  // the destination register.
4566  //
4567  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
4568  // so we will end up with redundant moves to m0.
4569  //
4570  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
4571 
4572  // A Null SDValue creates a glue result.
4573  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
4574  V, Chain);
4575  return SDValue(M0, 0);
4576 }
4577 
4578 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
4579  SDValue Op,
4580  MVT VT,
4581  unsigned Offset) const {
4582  SDLoc SL(Op);
4583  SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
4584  DAG.getEntryNode(), Offset, 4, false);
4585  // The local size values will have the hi 16-bits as zero.
4586  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
4587  DAG.getValueType(VT));
4588 }
4589 
4590 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4591  EVT VT) {
4592  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4593  "non-hsa intrinsic with hsa target",
4594  DL.getDebugLoc());
4595  DAG.getContext()->diagnose(BadIntrin);
4596  return DAG.getUNDEF(VT);
4597 }
4598 
4599 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4600  EVT VT) {
4601  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4602  "intrinsic not supported on subtarget",
4603  DL.getDebugLoc());
4604  DAG.getContext()->diagnose(BadIntrin);
4605  return DAG.getUNDEF(VT);
4606 }
4607 
4608 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
4609  ArrayRef<SDValue> Elts) {
4610  assert(!Elts.empty());
4611  MVT Type;
4612  unsigned NumElts;
4613 
4614  if (Elts.size() == 1) {
4615  Type = MVT::f32;
4616  NumElts = 1;
4617  } else if (Elts.size() == 2) {
4618  Type = MVT::v2f32;
4619  NumElts = 2;
4620  } else if (Elts.size() <= 4) {
4621  Type = MVT::v4f32;
4622  NumElts = 4;
4623  } else if (Elts.size() <= 8) {
4624  Type = MVT::v8f32;
4625  NumElts = 8;
4626  } else {
4627  assert(Elts.size() <= 16);
4628  Type = MVT::v16f32;
4629  NumElts = 16;
4630  }
4631 
4632  SmallVector<SDValue, 16> VecElts(NumElts);
4633  for (unsigned i = 0; i < Elts.size(); ++i) {
4634  SDValue Elt = Elts[i];
4635  if (Elt.getValueType() != MVT::f32)
4636  Elt = DAG.getBitcast(MVT::f32, Elt);
4637  VecElts[i] = Elt;
4638  }
4639  for (unsigned i = Elts.size(); i < NumElts; ++i)
4640  VecElts[i] = DAG.getUNDEF(MVT::f32);
4641 
4642  if (NumElts == 1)
4643  return VecElts[0];
4644  return DAG.getBuildVector(Type, DL, VecElts);
4645 }
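// A minimal sketch of the padding rule above (hypothetical helper, not part of
// this file): element counts round up to the next supported dword vector width
// and the extra lanes are filled with undef.
static constexpr unsigned paddedDwordCount(unsigned N) {
  return N <= 1 ? 1 : N <= 2 ? 2 : N <= 4 ? 4 : N <= 8 ? 8 : 16;
}
static_assert(paddedDwordCount(3) == 4, "3 address dwords widen to v4f32");
static_assert(paddedDwordCount(9) == 16, "9 address dwords widen to v16f32");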
4646 
4647 static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
4648  SDValue *GLC, SDValue *SLC) {
4649  auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
4650  if (!CachePolicyConst)
4651  return false;
4652 
4653  uint64_t Value = CachePolicyConst->getZExtValue();
4654  SDLoc DL(CachePolicy);
4655  if (GLC) {
4656  *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4657  Value &= ~(uint64_t)0x1;
4658  }
4659  if (SLC) {
4660  *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4661  Value &= ~(uint64_t)0x2;
4662  }
4663 
4664  return Value == 0;
4665 }
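// A worked example of the cachepolicy encoding parsed above (hypothetical
// constants, not from this file): bit 0 is glc, bit 1 is slc, and the helper
// only succeeds when no other bits remain set.
namespace {
constexpr unsigned long long GlcBit = 0x1, SlcBit = 0x2;
static_assert(((GlcBit | SlcBit) & ~(GlcBit | SlcBit)) == 0, "glc|slc parses cleanly");
static_assert((0x4ull & ~(GlcBit | SlcBit)) != 0, "unknown policy bits cause a failure");
}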
4666 
4667 // Re-construct the required return value for an image load intrinsic.
4668 // This is more complicated due to the optional use of TexFailCtrl, which means the
4669 // required return type is an aggregate.
4670 static SDValue constructRetValue(SelectionDAG &DAG,
4671  MachineSDNode *Result,
4672  ArrayRef<EVT> ResultTypes,
4673  bool IsTexFail, bool Unpacked, bool IsD16,
4674  int DMaskPop, int NumVDataDwords,
4675  const SDLoc &DL, LLVMContext &Context) {
4676  // Determine the required return type. This is the same regardless of the IsTexFail flag.
4677  EVT ReqRetVT = ResultTypes[0];
4678  EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
4679  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
4680  EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
4681  EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
4682  : AdjEltVT
4683  : ReqRetVT;
4684 
4685  // Extract data part of the result
4686  // Bitcast the result to the same type as the required return type
4687  int NumElts;
4688  if (IsD16 && !Unpacked)
4689  NumElts = NumVDataDwords << 1;
4690  else
4691  NumElts = NumVDataDwords;
4692 
4693  EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
4694  : AdjEltVT;
4695 
4696  // Special case for v8f16. Rather than add support for this, use v4i32 to
4697  // extract the data elements
4698  bool V8F16Special = false;
4699  if (CastVT == MVT::v8f16) {
4700  CastVT = MVT::v4i32;
4701  DMaskPop >>= 1;
4702  ReqRetNumElts >>= 1;
4703  V8F16Special = true;
4704  AdjVT = MVT::v2i32;
4705  }
4706 
4707  SDValue N = SDValue(Result, 0);
4708  SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
4709 
4710  // Iterate over the result
4711  SmallVector<SDValue, 4> BVElts;
4712 
4713  if (CastVT.isVector()) {
4714  DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
4715  } else {
4716  BVElts.push_back(CastRes);
4717  }
4718  int ExtraElts = ReqRetNumElts - DMaskPop;
4719  while(ExtraElts--)
4720  BVElts.push_back(DAG.getUNDEF(AdjEltVT));
4721 
4722  SDValue PreTFCRes;
4723  if (ReqRetNumElts > 1) {
4724  SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
4725  if (IsD16 && Unpacked)
4726  PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
4727  else
4728  PreTFCRes = NewVec;
4729  } else {
4730  PreTFCRes = BVElts[0];
4731  }
4732 
4733  if (V8F16Special)
4734  PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
4735 
4736  if (!IsTexFail) {
4737  if (Result->getNumValues() > 1)
4738  return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
4739  else
4740  return PreTFCRes;
4741  }
4742 
4743  // Extract the TexFail result and insert into aggregate return
4744  SmallVector<SDValue, 1> TFCElt;
4745  DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
4746  SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
4747  return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
4748 }
4749 
4750 static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
4751  SDValue *LWE, bool &IsTexFail) {
4752  auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode());
4753  if (!TexFailCtrlConst)
4754  return false;
4755 
4756  uint64_t Value = TexFailCtrlConst->getZExtValue();
4757  if (Value) {
4758  IsTexFail = true;
4759  }
4760 
4761  SDLoc DL(TexFailCtrlConst);
4762  *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4763  Value &= ~(uint64_t)0x1;
4764  *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4765  Value &= ~(uint64_t)0x2;
4766 
4767  return Value == 0;
4768 }
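// The same shape for texfailctrl (hypothetical constants, not from this file):
// bit 0 maps to tfe, bit 1 to lwe, any nonzero value marks a potential texture
// fail, and leftover bits make the parse fail.
namespace {
constexpr unsigned long long TfeBit = 0x1, LweBit = 0x2;
static_assert(((TfeBit | LweBit) & ~(TfeBit | LweBit)) == 0, "tfe|lwe is accepted");
static_assert((0x8ull & ~(TfeBit | LweBit)) != 0, "extra control bits are rejected");
}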
4769 
4770 SDValue SITargetLowering::lowerImage(SDValue Op,
4771  const AMDGPU::ImageDimIntrinsicInfo *Intr,
4772  SelectionDAG &DAG) const {
4773  SDLoc DL(Op);
4774  MachineFunction &MF = DAG.getMachineFunction();
4775  const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
4776  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4777  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4778  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
4779  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4780  AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
4781  unsigned IntrOpcode = Intr->BaseOpcode;
4782 
4783  SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
4784  SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
4785  bool IsD16 = false;
4786  bool IsA16 = false;
4787  SDValue VData;
4788  int NumVDataDwords;
4789  bool AdjustRetType = false;
4790 
4791  unsigned AddrIdx; // Index of first address argument
4792  unsigned DMask;
4793  unsigned DMaskLanes = 0;
4794 
4795  if (BaseOpcode->Atomic) {
4796  VData = Op.getOperand(2);
4797 
4798  bool Is64Bit = VData.getValueType() == MVT::i64;
4799  if (BaseOpcode->AtomicX2) {
4800  SDValue VData2 = Op.getOperand(3);
4801  VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
4802  {VData, VData2});
4803  if (Is64Bit)
4804  VData = DAG.getBitcast(MVT::v4i32, VData);
4805 
4806  ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
4807  DMask = Is64Bit ? 0xf : 0x3;
4808  NumVDataDwords = Is64Bit ? 4 : 2;
4809  AddrIdx = 4;
4810  } else {
4811  DMask = Is64Bit ? 0x3 : 0x1;
4812  NumVDataDwords = Is64Bit ? 2 : 1;
4813  AddrIdx = 3;
4814  }
4815  } else {
4816  unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
4817  auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
4818  if (!DMaskConst)
4819  return Op;
4820  DMask = DMaskConst->getZExtValue();
4821  DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
4822 
4823  if (BaseOpcode->Store) {
4824  VData = Op.getOperand(2);
4825 
4826  MVT StoreVT = VData.getSimpleValueType();
4827  if (StoreVT.getScalarType() == MVT::f16) {
4828  if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
4829  !BaseOpcode->HasD16)
4830  return Op; // D16 is unsupported for this instruction
4831 
4832  IsD16 = true;
4833  VData = handleD16VData(VData, DAG);
4834  }
4835 
4836  NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
4837  } else {
4838  // Work out the num dwords based on the dmask popcount and underlying type
4839  // and whether packing is supported.
4840  MVT LoadVT = ResultTypes[0].getSimpleVT();
4841  if (LoadVT.getScalarType() == MVT::f16) {
4842  if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
4843  !BaseOpcode->HasD16)
4844  return Op; // D16 is unsupported for this instruction
4845 
4846  IsD16 = true;
4847  }
4848 
4849  // Confirm that the return type is large enough for the dmask specified
4850  if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
4851  (!LoadVT.isVector() && DMaskLanes > 1))
4852  return Op;
4853 
4854  if (IsD16 && !Subtarget->hasUnpackedD16VMem())
4855  NumVDataDwords = (DMaskLanes + 1) / 2;
4856  else
4857  NumVDataDwords = DMaskLanes;
4858 
4859  AdjustRetType = true;
4860  }
4861 
4862  AddrIdx = DMaskIdx + 1;
4863  }
4864 
4865  unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
4866  unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
4867  unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
4868  unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
4869  NumCoords + NumLCM;
4870  unsigned NumMIVAddrs = NumVAddrs;
4871 
4872  SmallVector<SDValue, 4> VAddrs;
4873 
4874  // Optimize _L to _LZ when _L is zero
4875  if (LZMappingInfo) {
4876  if (auto ConstantLod =
4877  dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
4878  if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4879  IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
4880  NumMIVAddrs--; // remove 'lod'
4881  }
4882  }
4883  }
4884 
4885  // Check for 16 bit addresses and pack if true.
4886  unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4887  MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
4888  const MVT VAddrScalarVT = VAddrVT.getScalarType();
4889  if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
4890  ST->hasFeature(AMDGPU::FeatureR128A16)) {
4891  IsA16 = true;
4892  const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
4893  for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
4894  SDValue AddrLo, AddrHi;
4895  // Push back extra arguments.
4896  if (i < DimIdx) {
4897  AddrLo = Op.getOperand(i);
4898  } else {
4899  AddrLo = Op.getOperand(i);
4900  // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
4901  // in 1D, derivatives dx/dh and dx/dv are packed with undef.
4902  if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
4903  ((NumGradients / 2) % 2 == 1 &&
4904  (i == DimIdx + (NumGradients / 2) - 1 ||
4905  i == DimIdx + NumGradients - 1))) {
4906  AddrHi = DAG.getUNDEF(MVT::f16);
4907  } else {
4908  AddrHi = Op.getOperand(i + 1);
4909  i++;
4910  }
4911  AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
4912  {AddrLo, AddrHi});
4913  AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
4914  }
4915  VAddrs.push_back(AddrLo);
4916  }
4917  } else {
4918  for (unsigned i = 0; i < NumMIVAddrs; ++i)
4919  VAddrs.push_back(Op.getOperand(AddrIdx + i));
4920  }
4921 
4922  SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
4923 
4924  SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
4925  SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
4926  unsigned CtrlIdx; // Index of texfailctrl argument
4927  SDValue Unorm;
4928  if (!BaseOpcode->Sampler) {
4929  Unorm = True;
4930  CtrlIdx = AddrIdx + NumVAddrs + 1;
4931  } else {
4932  auto UnormConst =
4933  dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
4934  if (!UnormConst)
4935  return Op;
4936 
4937  Unorm = UnormConst->getZExtValue() ? True : False;
4938  CtrlIdx = AddrIdx + NumVAddrs + 3;
4939  }
4940 
4941  SDValue TFE;
4942  SDValue LWE;
4943  SDValue TexFail = Op.getOperand(CtrlIdx);
4944  bool IsTexFail = false;
4945  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
4946  return Op;
4947 
4948  if (IsTexFail) {
4949  if (!DMaskLanes) {
4950  // Expecting to get an error flag since TFC is on - and dmask is 0
4951  // Force dmask to be at least 1 otherwise the instruction will fail
4952  DMask = 0x1;
4953  DMaskLanes = 1;
4954  NumVDataDwords = 1;
4955  }
4956  NumVDataDwords += 1;
4957  AdjustRetType = true;
4958  }
4959 
4960  // Something earlier has tagged the return type as needing adjustment.
4961  // This happens if the instruction is a load or has set TexFailCtrl flags.
4962  if (AdjustRetType) {
4963  // NumVDataDwords reflects the true number of dwords required in the return type
4964  if (DMaskLanes == 0 && !BaseOpcode->Store) {
4965  // This is a no-op load. This can be eliminated
4966  SDValue Undef = DAG.getUNDEF(Op.getValueType());
4967  if (isa<MemSDNode>(Op))
4968  return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
4969  return Undef;
4970  }
4971 
4972  // Have to use a power of 2 number of dwords
4973  NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
4974 
4975  EVT NewVT = NumVDataDwords > 1 ?
4976  EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
4977  : MVT::f32;
4978 
4979  ResultTypes[0] = NewVT;
4980  if (ResultTypes.size() == 3) {
4981  // Original result was aggregate type used for TexFailCtrl results
4982  // The actual instruction returns as a vector type which has now been
4983  // created. Remove the aggregate result.
4984  ResultTypes.erase(&ResultTypes[1]);
4985  }
4986  }
4987 
4988  SDValue GLC;
4989  SDValue SLC;
4990  if (BaseOpcode->Atomic) {
4991  GLC = True; // TODO no-return optimization
4992  if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
4993  return Op;
4994  } else {
4995  if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
4996  return Op;
4997  }
4998 
5000  if (BaseOpcode->Store || BaseOpcode->Atomic)
5001  Ops.push_back(VData); // vdata
5002  Ops.push_back(VAddr);
5003  Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
5004  if (BaseOpcode->Sampler)
5005  Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
5006  Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
5007  Ops.push_back(Unorm);
5008  Ops.push_back(GLC);
5009  Ops.push_back(SLC);
5010  Ops.push_back(IsA16 && // a16 or r128
5011  ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
5012  Ops.push_back(TFE); // tfe
5013  Ops.push_back(LWE); // lwe
5014  Ops.push_back(DimInfo->DA ? True : False);
5015  if (BaseOpcode->HasD16)
5016  Ops.push_back(IsD16 ? True : False);
5017  if (isa<MemSDNode>(Op))
5018  Ops.push_back(Op.getOperand(0)); // chain
5019 
5020  int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
5021  int Opcode = -1;
5022 
5023  if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5024  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
5025  NumVDataDwords, NumVAddrDwords);
5026  if (Opcode == -1)
5027  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
5028  NumVDataDwords, NumVAddrDwords);
5029  assert(Opcode != -1);
5030 
5031  MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
5032  if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
5033  MachineMemOperand *MemRef = MemOp->getMemOperand();
5034  DAG.setNodeMemRefs(NewNode, {MemRef});
5035  }
5036 
5037  if (BaseOpcode->AtomicX2) {
5038  SmallVector<SDValue, 1> Elt;
5039  DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
5040  return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
5041  } else if (!BaseOpcode->Store) {
5042  return constructRetValue(DAG, NewNode,
5043  OrigResultTypes, IsTexFail,
5044  Subtarget->hasUnpackedD16VMem(), IsD16,
5045  DMaskLanes, NumVDataDwords, DL,
5046  *DAG.getContext());
5047  }
5048 
5049  return SDValue(NewNode, 0);
5050 }
5051 
5052 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
5053  SDValue Offset, SDValue GLC,
5054  SelectionDAG &DAG) const {
5055  MachineFunction &MF = DAG.getMachineFunction();
5056  MachineMemOperand *MMO = MF.getMachineMemOperand(
5057  MachinePointerInfo(),
5058  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5059  MachineMemOperand::MOInvariant,
5060  VT.getStoreSize(), VT.getStoreSize());
5061 
5062  if (!Offset->isDivergent()) {
5063  SDValue Ops[] = {
5064  Rsrc,
5065  Offset, // Offset
5066  GLC // glc
5067  };
5068  return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
5069  DAG.getVTList(VT), Ops, VT, MMO);
5070  }
5071 
5072  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
5073  // assume that the buffer is unswizzled.
5074  SmallVector<SDValue, 4> Loads;
5075  unsigned NumLoads = 1;
5076  MVT LoadVT = VT.getSimpleVT();
5077  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
5078  assert((LoadVT.getScalarType() == MVT::i32 ||
5079  LoadVT.getScalarType() == MVT::f32) &&
5080  isPowerOf2_32(NumElts));
5081 
5082  if (NumElts == 8 || NumElts == 16) {
5083  NumLoads = NumElts == 16 ? 4 : 2;
5084  LoadVT = MVT::v4i32;
5085  }
5086 
5087  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
5088  unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
5089  SDValue Ops[] = {
5090  DAG.getEntryNode(), // Chain
5091  Rsrc, // rsrc
5092  DAG.getConstant(0, DL, MVT::i32), // vindex
5093  {}, // voffset
5094  {}, // soffset
5095  {}, // offset
5096  DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
5097  DAG.getConstant(0, DL, MVT::i1), // idxen
5098  };
5099 
5100  // Use the alignment to ensure that the required offsets will fit into the
5101  // immediate offsets.
5102  setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
5103 
5104  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
5105  for (unsigned i = 0; i < NumLoads; ++i) {
5106  Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
5107  Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
5108  Ops, LoadVT, MMO));
5109  }
5110 
5111  if (VT == MVT::v8i32 || VT == MVT::v16i32)
5112  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
5113 
5114  return Loads[0];
5115 }
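// A worked example of the divergent-offset split above (hypothetical sizes,
// not from this file): a wide result is assembled from several v4i32 buffer
// loads whose immediate offsets advance by 16 bytes per load.
namespace {
constexpr unsigned DwordsPerBufferLoad = 4, BytesPerBufferLoad = DwordsPerBufferLoad * 4;
static_assert(16 / DwordsPerBufferLoad == 4 && 3 * BytesPerBufferLoad == 48,
              "v16i32 takes four loads; the fourth starts 48 bytes in");
}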
5116 
5117 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5118  SelectionDAG &DAG) const {
5119  MachineFunction &MF = DAG.getMachineFunction();
5120  auto MFI = MF.getInfo<SIMachineFunctionInfo>();
5121 
5122  EVT VT = Op.getValueType();
5123  SDLoc DL(Op);
5124  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5125 
5126  // TODO: Should this propagate fast-math-flags?
5127 
5128  switch (IntrinsicID) {
5129  case Intrinsic::amdgcn_implicit_buffer_ptr: {
5130  if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
5131  return emitNonHSAIntrinsicError(DAG, DL, VT);
5132  return getPreloadedValue(DAG, *MFI, VT,
5133  AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5134  }
5135  case Intrinsic::amdgcn_dispatch_ptr:
5136  case Intrinsic::amdgcn_queue_ptr: {
5137  if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
5138  DiagnosticInfoUnsupported BadIntrin(
5139  MF.getFunction(), "unsupported hsa intrinsic without hsa target",
5140  DL.getDebugLoc());
5141  DAG.getContext()->diagnose(BadIntrin);
5142  return DAG.getUNDEF(VT);
5143  }
5144 
5145  auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
5146  AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
5147  return getPreloadedValue(DAG, *MFI, VT, RegID);
5148  }
5149  case Intrinsic::amdgcn_implicitarg_ptr: {
5150  if (MFI->isEntryFunction())
5151  return getImplicitArgPtr(DAG, DL);
5152  return getPreloadedValue(DAG, *MFI, VT,
5153  AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5154  }
5155  case Intrinsic::amdgcn_kernarg_segment_ptr: {
5156  return getPreloadedValue(DAG, *MFI, VT,
5157  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5158  }
5159  case Intrinsic::amdgcn_dispatch_id: {
5160  return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
5161  }
5162  case Intrinsic::amdgcn_rcp:
5163  return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
5164  case Intrinsic::amdgcn_rsq:
5165  return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
5166  case Intrinsic::amdgcn_rsq_legacy:
5167  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5168  return emitRemovedIntrinsicError(DAG, DL, VT);
5169 
5170  return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
5171  case Intrinsic::amdgcn_rcp_legacy:
5172  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5173  return emitRemovedIntrinsicError(DAG, DL, VT);
5174  return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
5175  case Intrinsic::amdgcn_rsq_clamp: {
5176  if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5177  return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
5178 
5179  Type *Type = VT.getTypeForEVT(*DAG.getContext());
5180  APFloat Max = APFloat::getLargest(Type->getFltSemantics());
5181  APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
5182 
5183  SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
5184  SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
5185  DAG.getConstantFP(Max, DL, VT));
5186  return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
5187  DAG.getConstantFP(Min, DL, VT));
5188  }
5190  if (Subtarget->isAmdHsaOS())
5191  return emitNonHSAIntrinsicError(DAG, DL, VT);
5192 
5193  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5196  if (Subtarget->isAmdHsaOS())
5197  return emitNonHSAIntrinsicError(DAG, DL, VT);
5198 
5199  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5202  if (Subtarget->isAmdHsaOS())
5203  return emitNonHSAIntrinsicError(DAG, DL, VT);
5204 
5205  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5208  if (Subtarget->isAmdHsaOS())
5209  return emitNonHSAIntrinsicError(DAG, DL, VT);
5210 
5211  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5214  if (Subtarget->isAmdHsaOS())
5215  return emitNonHSAIntrinsicError(DAG, DL, VT);
5216 
5217  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5220  if (Subtarget->isAmdHsaOS())
5221  return emitNonHSAIntrinsicError(DAG, DL, VT);
5222 
5223  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5226  if (Subtarget->isAmdHsaOS())
5227  return emitNonHSAIntrinsicError(DAG, DL, VT);
5228 
5229  return lowerImplicitZextParam(DAG, Op, MVT::i16,
5232  if (Subtarget->isAmdHsaOS())
5233  return emitNonHSAIntrinsicError(DAG, DL, VT);
5234 
5235  return lowerImplicitZextParam(DAG, Op, MVT::i16,
5238  if (Subtarget->isAmdHsaOS())
5239  return emitNonHSAIntrinsicError(DAG, DL, VT);
5240 
5241  return lowerImplicitZextParam(DAG, Op, MVT::i16,
5245  return getPreloadedValue(DAG, *MFI, VT,
5249  return getPreloadedValue(DAG, *MFI, VT,
5253  return getPreloadedValue(DAG, *MFI, VT,
5257  return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5258  SDLoc(DAG.getEntryNode()),
5259  MFI->getArgInfo().WorkItemIDX);
5262  return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5263  SDLoc(DAG.getEntryNode()),
5264  MFI->getArgInfo().WorkItemIDY);
5267  return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5268  SDLoc(DAG.getEntryNode()),
5269  MFI->getArgInfo().WorkItemIDZ);
5270  case SIIntrinsic::SI_load_const: {
5271  SDValue Load =
5272  lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
5273  DAG.getTargetConstant(0, DL, MVT::i1), DAG);
5274  return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
5275  }
5276  case Intrinsic::amdgcn_s_buffer_load: {
5277  unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
5278  return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
5279  DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
5280  }
5281  case Intrinsic::amdgcn_fdiv_fast:
5282  return lowerFDIV_FAST(Op, DAG);
5283  case Intrinsic::amdgcn_interp_mov: {
5284  SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5285  SDValue Glue = M0.getValue(1);
5286  return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
5287  Op.getOperand(2), Op.getOperand(3), Glue);
5288  }
5289  case Intrinsic::amdgcn_interp_p1: {
5290  SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5291  SDValue Glue = M0.getValue(1);
5292  return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
5293  Op.getOperand(2), Op.getOperand(3), Glue);
5294  }
5295  case Intrinsic::amdgcn_interp_p2: {
5296  SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5297  SDValue Glue = SDValue(M0.getNode(), 1);
5298  return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
5299  Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
5300  Glue);
5301  }
5302  case Intrinsic::amdgcn_sin:
5303  return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
5304 
5305  case Intrinsic::amdgcn_cos:
5306  return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
5307 
5308  case Intrinsic::amdgcn_log_clamp: {
5309  if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5310  return SDValue();
5311 
5312  DiagnosticInfoUnsupported BadIntrin(
5313  MF.getFunction(), "intrinsic not supported on subtarget",
5314  DL.getDebugLoc());
5315  DAG.getContext()->diagnose(BadIntrin);
5316  return DAG.getUNDEF(VT);
5317  }
5318  case Intrinsic::amdgcn_ldexp:
5319  return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
5320  Op.getOperand(1), Op.getOperand(2));
5321 
5322  case Intrinsic::amdgcn_fract:
5323  return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
5324 
5325  case Intrinsic::amdgcn_class:
5326  return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
5327  Op.getOperand(1), Op.getOperand(2));
5328  case Intrinsic::amdgcn_div_fmas:
5329  return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
5330  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5331  Op.getOperand(4));
5332 
5333  case Intrinsic::amdgcn_div_fixup:
5334  return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
5335  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5336 
5337  case Intrinsic::amdgcn_trig_preop:
5338  return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
5339  Op.getOperand(1), Op.getOperand(2));
5340  case Intrinsic::amdgcn_div_scale: {
5341  // The 3rd parameter is required to be a constant.
5342  const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
5343  if (!Param)
5344  return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
5345 
5346  // Translate to the operands expected by the machine instruction. The
5347  // first parameter must be the same as the first instruction.
5348  SDValue Numerator = Op.getOperand(1);
5349  SDValue Denominator = Op.getOperand(2);
5350 
5351  // Note this order is opposite of the machine instruction's operations,
5352  // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
5353  // intrinsic has the numerator as the first operand to match a normal
5354  // division operation.
5355 
5356  SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
5357 
5358  return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
5359  Denominator, Numerator);
5360  }
5361  case Intrinsic::amdgcn_icmp: {
5362  // There is a Pat that handles this variant, so return it as-is.
5363  if (Op.getOperand(1).getValueType() == MVT::i1 &&
5364  Op.getConstantOperandVal(2) == 0 &&
5365  Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
5366  return Op;
5367  return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
5368  }
5369  case Intrinsic::amdgcn_fcmp: {
5370  return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
5371  }
5372  case Intrinsic::amdgcn_fmed3:
5373  return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
5374  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5375  case Intrinsic::amdgcn_fdot2:
5376  return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
5377  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5378  Op.getOperand(4));
5379  case Intrinsic::amdgcn_fmul_legacy:
5380  return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
5381  Op.getOperand(1), Op.getOperand(2));
5383  return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
5384  case Intrinsic::amdgcn_sbfe:
5385  return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
5386  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5387  case Intrinsic::amdgcn_ubfe:
5388  return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
5389  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5390  case Intrinsic::amdgcn_cvt_pkrtz:
5391  case Intrinsic::amdgcn_cvt_pknorm_i16:
5392  case Intrinsic::amdgcn_cvt_pknorm_u16:
5393  case Intrinsic::amdgcn_cvt_pk_i16:
5394  case Intrinsic::amdgcn_cvt_pk_u16: {
5395  // FIXME: Stop adding cast if v2f16/v2i16 are legal.
5396  EVT VT = Op.getValueType();
5397  unsigned Opcode;
5398 
5399  if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
5400  Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
5401  else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
5402  Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5403  else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
5404  Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5405  else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
5406  Opcode = AMDGPUISD::CVT_PK_I16_I32;
5407  else
5408  Opcode = AMDGPUISD::CVT_PK_U16_U32;
5409 
5410  if (isTypeLegal(VT))
5411  return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
5412 
5413  SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
5414  Op.getOperand(1), Op.getOperand(2));
5415  return DAG.getNode(ISD::BITCAST, DL, VT, Node);
5416  }
5417  case Intrinsic::amdgcn_wqm: {
5418  SDValue Src = Op.getOperand(1);
5419  return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
5420  0);
5421  }
5422  case Intrinsic::amdgcn_wwm: {
5423  SDValue Src = Op.getOperand(1);
5424  return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
5425  0);
5426  }
5427  case Intrinsic::amdgcn_fmad_ftz:
5428  return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
5429  Op.getOperand(2), Op.getOperand(3));
5430  default:
5431  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5432  AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
5433  return lowerImage(Op, ImageDimIntr, DAG);
5434 
5435  return Op;
5436  }
5437 }
5438 
5439 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5440  SelectionDAG &DAG) const {
5441  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5442  SDLoc DL(Op);
5443 
5444  switch (IntrID) {
5445  case Intrinsic::amdgcn_ds_ordered_add:
5446  case Intrinsic::amdgcn_ds_ordered_swap: {
5447  MemSDNode *M = cast<MemSDNode>(Op);
5448  SDValue Chain = M->getOperand(0);
5449  SDValue M0 = M->getOperand(2);
5450  SDValue Value = M->getOperand(3);
5451  unsigned OrderedCountIndex = M->getConstantOperandVal(7);
5452  unsigned WaveRelease = M->getConstantOperandVal(8);
5453  unsigned WaveDone = M->getConstantOperandVal(9);
5454  unsigned ShaderType;
5455  unsigned Instruction;
5456 
5457  switch (IntrID) {
5458  case Intrinsic::amdgcn_ds_ordered_add:
5459  Instruction = 0;
5460  break;
5461  case Intrinsic::amdgcn_ds_ordered_swap:
5462  Instruction = 1;
5463  break;
5464  }
5465 
5466  if (WaveDone && !WaveRelease)
5467  report_fatal_error("ds_ordered_count: wave_done requires wave_release");
5468 
5469  switch (DAG.getMachineFunction().getFunction().getCallingConv()) {
5470  case CallingConv::AMDGPU_CS:
5471  case CallingConv::AMDGPU_KERNEL:
5472  ShaderType = 0;
5473  break;
5474  case CallingConv::AMDGPU_PS:
5475  ShaderType = 1;
5476  break;
5477  case CallingConv::AMDGPU_VS:
5478  ShaderType = 2;
5479  break;
5480  case CallingConv::AMDGPU_GS:
5481  ShaderType = 3;
5482  break;
5483  default:
5484  report_fatal_error("ds_ordered_count unsupported for this calling conv");
5485  }
5486 
5487  unsigned Offset0 = OrderedCountIndex << 2;
5488  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
5489  (Instruction << 4);
5490  unsigned Offset = Offset0 | (Offset1 << 8);
5491 
5492  SDValue Ops[] = {
5493  Chain,
5494  Value,
5495  DAG.getTargetConstant(Offset, DL, MVT::i16),
5496  copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
5497  };
5498  return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
5499  M->getVTList(), Ops, M->getMemoryVT(),
5500  M->getMemOperand());
5501  }
5502  case Intrinsic::amdgcn_atomic_inc:
5503  case Intrinsic::amdgcn_atomic_dec:
5504  case Intrinsic::amdgcn_ds_fadd:
5505  case Intrinsic::amdgcn_ds_fmin:
5506  case Intrinsic::amdgcn_ds_fmax: {
5507  MemSDNode *M = cast<MemSDNode>(Op);
5508  unsigned Opc;
5509  switch (IntrID) {
5510  case Intrinsic::amdgcn_atomic_inc:
5511  Opc = AMDGPUISD::ATOMIC_INC;
5512  break;
5513  case Intrinsic::amdgcn_atomic_dec:
5514  Opc = AMDGPUISD::ATOMIC_DEC;
5515  break;
5516  case Intrinsic::amdgcn_ds_fadd:
5517  Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
5518  break;
5519  case Intrinsic::amdgcn_ds_fmin:
5520  Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
5521  break;
5522  case Intrinsic::amdgcn_ds_fmax:
5523  Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
5524  break;
5525  default:
5526  llvm_unreachable("Unknown intrinsic!");
5527  }
5528  SDValue Ops[] = {
5529  M->getOperand(0), // Chain
5530  M->getOperand(2), // Ptr
5531  M->getOperand(3) // Value
5532  };
5533 
5534  return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
5535  M->getMemoryVT(), M->getMemOperand());
5536  }
5537  case Intrinsic::amdgcn_buffer_load:
5538  case Intrinsic::amdgcn_buffer_load_format: {
5539  unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
5540  unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5541  unsigned IdxEn = 1;
5542  if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5543  IdxEn = Idx->getZExtValue() != 0;
5544  SDValue Ops[] = {
5545  Op.getOperand(0), // Chain
5546  Op.getOperand(2), // rsrc
5547  Op.getOperand(3), // vindex
5548  SDValue(), // voffset -- will be set by setBufferOffsets
5549  SDValue(), // soffset -- will be set by setBufferOffsets
5550  SDValue(), // offset -- will be set by setBufferOffsets
5551  DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5552  DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5553  };
5554 
5555  setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
5556  unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
5557  AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5558 
5559  EVT VT = Op.getValueType();
5560  EVT IntVT = VT.changeTypeToInteger();
5561  auto *M = cast<MemSDNode>(Op);
5562  EVT LoadVT = Op.getValueType();
5563 
5564  if (LoadVT.getScalarType() == MVT::f16)
5565  return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5566  M, DAG, Ops);
5567  return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5568  M->getMemOperand());
5569  }
5570  case Intrinsic::amdgcn_raw_buffer_load:
5571  case Intrinsic::amdgcn_raw_buffer_load_format: {
5572  auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5573  SDValue Ops[] = {
5574  Op.getOperand(0), // Chain
5575  Op.getOperand(2), // rsrc
5576  DAG.getConstant(0, DL, MVT::i32), // vindex
5577  Offsets.first, // voffset
5578  Op.getOperand(4), // soffset
5579  Offsets.second, // offset
5580  Op.getOperand(5), // cachepolicy
5581  DAG.getConstant(0, DL, MVT::i1), // idxen
5582  };
5583 
5584  unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
5586 
5587  EVT VT = Op.getValueType();
5588  EVT IntVT = VT.changeTypeToInteger();
5589  auto *M = cast<MemSDNode>(Op);
5590  EVT LoadVT = Op.getValueType();
5591 
5592  if (LoadVT.getScalarType() == MVT::f16)
5593  return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5594  M, DAG, Ops);
5595  return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5596  M->getMemOperand());
5597  }
5600  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5601  SDValue Ops[] = {
5602  Op.getOperand(0), // Chain
5603  Op.getOperand(2), // rsrc
5604  Op.getOperand(3), // vindex
5605  Offsets.first, // voffset
5606  Op.getOperand(5), // soffset
5607  Offsets.second, // offset
5608  Op.getOperand(6), // cachepolicy
5609  DAG.getConstant(1, DL, MVT::i1), // idxen
5610  };
5611 
5612  unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
5614 
5615  EVT VT = Op.getValueType();
5616  EVT IntVT = VT.changeTypeToInteger();
5617  auto *M = cast<MemSDNode>(Op);
5618  EVT LoadVT = Op.getValueType();
5619 
5620  if (LoadVT.getScalarType() == MVT::f16)
5621  return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5622  M, DAG, Ops);
5623  return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5624  M->getMemOperand());
5625  }
5627  MemSDNode *M = cast<MemSDNode>(Op);
5628  EVT LoadVT = Op.getValueType();
5629 
5630  unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5631  unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5632  unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5633  unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5634  unsigned IdxEn = 1;
5635  if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5636  IdxEn = Idx->getZExtValue() != 0;
5637  SDValue Ops[] = {
5638  Op.getOperand(0), // Chain
5639  Op.getOperand(2), // rsrc
5640  Op.getOperand(3), // vindex
5641  Op.getOperand(4), // voffset
5642  Op.getOperand(5), // soffset
5643  Op.getOperand(6), // offset
5644  DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5645  DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5646  DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5647  };
5648 
5649  if (LoadVT.getScalarType() == MVT::f16)
5650  return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5651  M, DAG, Ops);
5652  return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5653  Op->getVTList(), Ops, LoadVT,
5654  M->getMemOperand());
5655  }
5657  MemSDNode *M = cast<MemSDNode>(Op);
5658  EVT LoadVT = Op.getValueType();
5659  auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5660 
5661  SDValue Ops[] = {
5662  Op.getOperand(0), // Chain
5663  Op.getOperand(2), // rsrc
5664  DAG.getConstant(0, DL, MVT::i32), // vindex
5665  Offsets.first, // voffset
5666  Op.getOperand(4), // soffset
5667  Offsets.second, // offset
5668  Op.getOperand(5), // format
5669  Op.getOperand(6), // cachepolicy
5670  DAG.getConstant(0, DL, MVT::i1), // idxen
5671  };
5672 
5673  if (LoadVT.getScalarType() == MVT::f16)
5674  return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5675  M, DAG, Ops);
5676  return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5677  Op->getVTList(), Ops, LoadVT,
5678  M->getMemOperand());
5679  }
5681  MemSDNode *M = cast<MemSDNode>(Op);
5682  EVT LoadVT = Op.getValueType();
5683  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5684 
5685  SDValue Ops[] = {
5686  Op.getOperand(0), // Chain
5687  Op.getOperand(2), // rsrc
5688  Op.getOperand(3), // vindex
5689  Offsets.first, // voffset
5690  Op.getOperand(5), // soffset
5691  Offsets.second, // offset
5692  Op.getOperand(6), // format
5693  Op.getOperand(7), // cachepolicy
5694  DAG.getConstant(1, DL, MVT::i1), // idxen
5695  };
5696 
5697  if (LoadVT.getScalarType() == MVT::f16)
5698  return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5699  M, DAG, Ops);
5700  return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5701  Op->getVTList(), Ops, LoadVT,
5702  M->getMemOperand());
5703  }
5714  unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5715  unsigned IdxEn = 1;
5716  if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5717  IdxEn = Idx->getZExtValue() != 0;
5718  SDValue Ops[] = {
5719  Op.getOperand(0), // Chain
5720  Op.getOperand(2), // vdata
5721  Op.getOperand(3), // rsrc
5722  Op.getOperand(4), // vindex
5723  SDValue(), // voffset -- will be set by setBufferOffsets
5724  SDValue(), // soffset -- will be set by setBufferOffsets
5725  SDValue(), // offset -- will be set by setBufferOffsets
5726  DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5727  DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5728  };
5729  setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
5730  EVT VT = Op.getValueType();
5731 
5732  auto *M = cast<MemSDNode>(Op);
5733  unsigned Opcode = 0;
5734 
5735  switch (IntrID) {
5738  break;
5741  break;
5744  break;
5747  break;
5750  break;
5753  break;
5756  break;
5759  break;
5761  Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5762  break;
5765  break;
5766  default:
5767  llvm_unreachable("unhandled atomic opcode");
5768  }
5769 
5770  return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5771  M->getMemOperand());
5772  }
5783  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5784  SDValue Ops[] = {
5785  Op.getOperand(0), // Chain
5786  Op.getOperand(2), // vdata
5787  Op.getOperand(3), // rsrc
5788  DAG.getConstant(0, DL, MVT::i32), // vindex
5789  Offsets.first, // voffset
5790  Op.getOperand(5), // soffset
5791  Offsets.second, // offset
5792  Op.getOperand(6), // cachepolicy
5793  DAG.getConstant(0, DL, MVT::i1), // idxen
5794  };
5795  EVT VT = Op.getValueType();
5796 
5797  auto *M = cast<MemSDNode>(Op);
5798  unsigned Opcode = 0;
5799 
5800  switch (IntrID) {
5803  break;
5806  break;
5809  break;
5812  break;
5815  break;
5818  break;
5821  break;
5824  break;
5826  Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5827  break;
5830  break;
5831  default:
5832  llvm_unreachable("unhandled atomic opcode");
5833  }
5834 
5835  return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5836  M->getMemOperand());
5837  }
5848  auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5849  SDValue Ops[] = {
5850  Op.getOperand(0), // Chain
5851  Op.getOperand(2), // vdata
5852  Op.getOperand(3), // rsrc
5853  Op.getOperand(4), // vindex
5854  Offsets.first, // voffset
5855  Op.getOperand(6), // soffset
5856  Offsets.second, // offset
5857  Op.getOperand(7), // cachepolicy
5858  DAG.getConstant(1, DL, MVT::i1), // idxen
5859  };
5860  EVT VT = Op.getValueType();
5861 
5862  auto *M = cast<MemSDNode>(Op);
5863  unsigned Opcode = 0;
5864 
5865  switch (IntrID) {
5868  break;
5871  break;
5874  break;
5877  break;
5880  break;
5883  break;
5886  break;
5889  break;
5891  Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5892  break;
5895  break;
5896  default:
5897  llvm_unreachable("unhandled atomic opcode");
5898  }
5899 
5900  return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5901  M->getMemOperand());
5902  }
5904  unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5905  unsigned IdxEn = 1;
5906  if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
5907  IdxEn = Idx->getZExtValue() != 0;
5908  SDValue Ops[] = {
5909  Op.getOperand(0), // Chain
5910  Op.getOperand(2), // src
5911  Op.getOperand(3), // cmp
5912  Op.getOperand(4), // rsrc
5913  Op.getOperand(5), // vindex
5914  SDValue(), // voffset -- will be set by setBufferOffsets
5915  SDValue(), // soffset -- will be set by setBufferOffsets
5916  SDValue(), // offset -- will be set by setBufferOffsets
5917  DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5918  DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5919  };
5920  setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
5921  EVT VT = Op.getValueType();
5922  auto *M = cast<MemSDNode>(Op);
5923 
5924  return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5925  Op->getVTList(), Ops, VT, M->getMemOperand());
5926  }
5928  auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5929  SDValue Ops[] = {
5930  Op.getOperand(0), // Chain
5931  Op.getOperand(2), // src
5932  Op.getOperand(3), // cmp
5933  Op.getOperand(4), // rsrc
5934  DAG.getConstant(0, DL, MVT::i32), // vindex
5935  Offsets.first, // voffset
5936  Op.getOperand(6), // soffset
5937  Offsets.second, // offset
5938  Op.getOperand(7), // cachepolicy
5939  DAG.getConstant(0, DL, MVT::i1), // idxen
5940  };
5941  EVT VT = Op.getValueType();
5942  auto *M = cast<MemSDNode>(Op);
5943 
5944  return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5945  Op->getVTList(), Ops, VT, M->getMemOperand());
5946  }
5948  auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
5949  SDValue Ops[] = {
5950  Op.getOperand(0), // Chain
5951  Op.getOperand(2), // src
5952  Op.getOperand(3), // cmp
5953  Op.getOperand(4), // rsrc
5954  Op.getOperand(5), // vindex
5955  Offsets.first, // voffset
5956  Op.getOperand(7), // soffset
5957  Offsets.second, // offset
5958  Op.getOperand(8), // cachepolicy
5959  DAG.getConstant(1, DL, MVT::i1), // idxen
5960  };
5961  EVT VT = Op.getValueType();
5962  auto *M = cast<MemSDNode>(Op);
5963 
5964  return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5965  Op->getVTList(), Ops, VT, M->getMemOperand());
5966  }
5967 
5968  default:
5969  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5970  AMDGPU::getImageDimIntrinsicInfo(IntrID))
5971  return lowerImage(Op, ImageDimIntr, DAG);
5972 
5973  return SDValue();
5974  }
5975 }
5976 
5977 SDValue SITargetLowering::handleD16VData(SDValue VData,
5978  SelectionDAG &DAG) const {
5979  EVT StoreVT = VData.getValueType();
5980 
5981  // No change for f16 and legal vector D16 types.
5982  if (!StoreVT.isVector())
5983  return VData;
5984 
5985  SDLoc DL(VData);
5986  assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
5987 
5988  if (Subtarget->hasUnpackedD16VMem()) {
5989  // We need to unpack the packed data to store.
5990  EVT IntStoreVT = StoreVT.changeTypeToInteger();
5991  SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
5992 
5993  EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5994  StoreVT.getVectorNumElements());
5995  SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
5996  return DAG.UnrollVectorOp(ZExt.getNode());
5997  }
5998 
5999  assert(isTypeLegal(StoreVT));
6000  return VData;
6001 }
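For illustration (a sketch of the unpacked-D16 path above, not additional lowering code): on a subtarget with unpacked D16 VMEM, a packed v2f16 store value is expanded to one half-precision component per dword before it reaches the store node, roughly

   v2f16 <a, b>
     -> bitcast to v2i16              <a', b'>
     -> zero_extend to v2i32          <zext a', zext b'>
     -> UnrollVectorOp splits the extend into per-element operations

On subtargets with packed D16 VMEM the value is passed through unchanged, as the early return above shows.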
6002 
6003 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6004  SelectionDAG &DAG) const {
6005  SDLoc DL(Op);
6006  SDValue Chain = Op.getOperand(0);
6007  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6008  MachineFunction &MF = DAG.getMachineFunction();
6009 
6010  switch (IntrinsicID) {
6011  case Intrinsic::amdgcn_exp: {
6012  const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6013  const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6014  const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
6015  const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
6016 
6017  const SDValue Ops[] = {
6018  Chain,
6019  DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6020  DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
6021  Op.getOperand(4), // src0
6022  Op.getOperand(5), // src1
6023  Op.getOperand(6), // src2
6024  Op.getOperand(7), // src3
6025  DAG.getTargetConstant(0, DL, MVT::i1), // compr
6026  DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6027  };
6028 
6029  unsigned Opc = Done->isNullValue() ?
6030  AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6031  return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6032  }
6034  const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6035  const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6036  SDValue Src0 = Op.getOperand(4);
6037  SDValue Src1 = Op.getOperand(5);
6038  const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
6039  const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
6040 
6041  SDValue Undef = DAG.getUNDEF(MVT::f32);
6042  const SDValue Ops[] = {
6043  Chain,
6044  DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6045  DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
6046  DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
6047  DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
6048  Undef, // src2
6049  Undef, // src3
6050  DAG.getTargetConstant(1, DL, MVT::i1), // compr
6051  DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6052  };
6053 
6054  unsigned Opc = Done->isNullValue() ?
6055  AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6056  return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6057  }
6060  unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
6061  AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
6062  Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
6063  SDValue Glue = Chain.getValue(1);
6064  return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
6065  Op.getOperand(2), Glue);
6066  }
6068  return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
6069  Op.getOperand(2));
6070  }
6072  return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
6073  Op.getOperand(2), Op.getOperand(3));
6074  }
6077  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6078  unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
6079  if (WGSize <= ST.getWavefrontSize())
6080  return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
6081  Op.getOperand(0)), 0);
6082  }
6083  return SDValue();
6084  };
6086  SDValue VData = Op.getOperand(2);
6087  bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6088  if (IsD16)
6089  VData = handleD16VData(VData, DAG);
6090  unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
6091  unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
6092  unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
6093  unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
6094  unsigned IdxEn = 1;
6095  if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6096  IdxEn = Idx->getZExtValue() != 0;
6097  SDValue Ops[] = {
6098  Chain,
6099  VData, // vdata
6100  Op.getOperand(3), // rsrc
6101  Op.getOperand(4), // vindex
6102  Op.getOperand(5), // voffset
6103  Op.getOperand(6), // soffset
6104  Op.getOperand(7), // offset
6105  DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
6106  DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6107  DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6108  };
6109  unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6110  AMDGPUISD::TBUFFER_STORE_FORMAT;
6111  MemSDNode *M = cast<MemSDNode>(Op);
6112  return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6113  M->getMemoryVT(), M->getMemOperand());
6114  }
6115 
6117  SDValue VData = Op.getOperand(2);
6118  bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6119  if (IsD16)
6120  VData = handleD16VData(VData, DAG);
6121  auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6122  SDValue Ops[] = {
6123  Chain,
6124  VData, // vdata
6125  Op.getOperand(3), // rsrc
6126  Op.getOperand(4), // vindex
6127  Offsets.first, // voffset
6128  Op.getOperand(6), // soffset
6129  Offsets.second, // offset
6130  Op.getOperand(7), // format
6131  Op.getOperand(8), // cachepolicy
6132  DAG.getConstant(1, DL, MVT::i1), // idxen
6133  };
6134  unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6135  AMDGPUISD::TBUFFER_STORE_FORMAT;
6136  MemSDNode *M = cast<MemSDNode>(Op);
6137  return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6138  M->getMemoryVT(), M->getMemOperand());
6139  }
6140 
6142  SDValue VData = Op.getOperand(2);
6143  bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6144  if (IsD16)
6145  VData = handleD16VData(VData, DAG);
6146  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6147  SDValue Ops[] = {
6148  Chain,
6149  VData, // vdata
6150  Op.getOperand(3), // rsrc
6151  DAG.getConstant(0, DL, MVT::i32), // vindex
6152  Offsets.first, // voffset
6153  Op.getOperand(5), // soffset
6154  Offsets.second, // offset
6155  Op.getOperand(6), // format
6156  Op.getOperand(7), // cachepolicy
6157  DAG.getConstant(0, DL, MVT::i1), // idxen
6158  };
6159  unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6160  AMDGPUISD::TBUFFER_STORE_FORMAT;
6161  MemSDNode *M = cast<MemSDNode>(Op);
6162  return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6163  M->getMemoryVT(), M->getMemOperand());
6164  }
6165 
6168  SDValue VData = Op.getOperand(2);
6169  bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6170  if (IsD16)
6171  VData = handleD16VData(VData, DAG);
6172  unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
6173  unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
6174  unsigned IdxEn = 1;
6175  if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6176  IdxEn = Idx->getZExtValue() != 0;
6177  SDValue Ops[] = {
6178  Chain,
6179  VData,
6180  Op.getOperand(3), // rsrc
6181  Op.getOperand(4), // vindex
6182  SDValue(), // voffset -- will be set by setBufferOffsets
6183  SDValue(), // soffset -- will be set by setBufferOffsets
6184  SDValue(), // offset -- will be set by setBufferOffsets
6185  DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6186  DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6187  };
6188  setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
6189  unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
6190  AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6191  Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6192  MemSDNode *M = cast<MemSDNode>(Op);
6193  return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6194  M->getMemoryVT(), M->getMemOperand());
6195  }
6196 
6199  SDValue VData = Op.getOperand(2);
6200  bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6201  if (IsD16)
6202  VData = handleD16VData(VData, DAG);
6203  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6204  SDValue Ops[] = {
6205  Chain,
6206  VData,
6207  Op.getOperand(3), // rsrc
6208  DAG.getConstant(0, DL, MVT::i32), // vindex
6209  Offsets.first, // voffset
6210  Op.getOperand(5), // soffset
6211  Offsets.second, // offset
6212  Op.getOperand(6), // cachepolicy
6213  DAG.getConstant(0, DL, MVT::i1), // idxen
6214  };
6215  unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
6216  AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6217  Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6218  MemSDNode *M = cast<MemSDNode>(Op);
6219  return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6220  M->getMemoryVT(), M->getMemOperand());
6221  }
6222 
6225  SDValue VData = Op.getOperand(2);
6226  bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6227  if (IsD16)
6228  VData = handleD16VData(VData, DAG);
6229  auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6230  SDValue Ops[] = {
6231  Chain,
6232  VData,
6233  Op.getOperand(3), // rsrc
6234  Op.getOperand(4), // vindex
6235  Offsets.first, // voffset
6236  Op.getOperand(6), // soffset
6237  Offsets.second, // offset
6238  Op.getOperand(7), // cachepolicy
6239  DAG.getConstant(1, DL, MVT::i1), // idxen
6240  };
6241  unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
6242  AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6243  Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6244  MemSDNode *M = cast<MemSDNode>(Op);
6245  return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6246  M->getMemoryVT(), M->getMemOperand());
6247  }
6248 
6249  default: {
6250  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6251  AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
6252  return lowerImage(Op, ImageDimIntr, DAG);
6253 
6254  return Op;
6255  }
6256  }
6257 }
6258 
6259 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6260 // offset (the offset that is included in bounds checking and swizzling, to be
6261 // split between the instruction's voffset and immoffset fields) and soffset
6262 // (the offset that is excluded from bounds checking and swizzling, to go in
6263 // the instruction's soffset field). This function takes the first kind of
6264 // offset and figures out how to split it between voffset and immoffset.
6265 std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
6266  SDValue Offset, SelectionDAG &DAG) const {
6267  SDLoc DL(Offset);
6268  const unsigned MaxImm = 4095;
6269  SDValue N0 = Offset;
6270  ConstantSDNode *C1 = nullptr;
6271 
6272  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
6273  N0 = SDValue();
6274  else if (DAG.isBaseWithConstantOffset(N0)) {
6275  C1 = cast<ConstantSDNode>(N0.getOperand(1));
6276  N0 = N0.getOperand(0);
6277  }
6278 
6279  if (C1) {
6280  unsigned ImmOffset = C1->getZExtValue();
6281  // If the immediate value is too big for the immoffset field, put the value
6282  // and -4096 into the immoffset field so that the value that is copied/added
6283  // for the voffset field is a multiple of 4096, and it stands more chance
6284  // of being CSEd with the copy/add for another similar load/store.
6285  // However, do not round down to a multiple of 4096 if that multiple would be
6286  // a negative number, as it appears to be illegal to have a negative offset
6287  // in the vgpr, even if adding the immediate offset makes it positive.
6288  unsigned Overflow = ImmOffset & ~MaxImm;
6289  ImmOffset -= Overflow;
6290  if ((int32_t)Overflow < 0) {
6291  Overflow += ImmOffset;
6292  ImmOffset = 0;
6293  }
6294  C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
6295  if (Overflow) {
6296  auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
6297  if (!N0)
6298  N0 = OverflowVal;
6299  else {
6300  SDValue Ops[] = { N0, OverflowVal };
6301  N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
6302  }
6303  }
6304  }
6305  if (!N0)
6306  N0 = DAG.getConstant(0, DL, MVT::i32);
6307  if (!C1)
6308  C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
6309  return {N0, SDValue(C1, 0)};
6310 }
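A standalone sketch of the arithmetic above (illustrative only; the helper below is made up and is not part of this file). For example, a combined constant offset of 5000 splits into a voffset part of 4096 and an immoffset of 904, while 100 stays entirely in the immoffset:

static void splitImmOffsetSketch(uint32_t Combined,
                                 uint32_t &VOffsetPart, uint32_t &ImmPart) {
  const uint32_t MaxImm = 4095;
  uint32_t Overflow = Combined & ~MaxImm;   // the multiple-of-4096 part
  ImmPart = Combined - Overflow;            // low 12 bits
  if ((int32_t)Overflow < 0) {
    // Rounding down would leave a negative value for the VGPR copy/add,
    // so keep the whole offset in the voffset part instead.
    Overflow += ImmPart;
    ImmPart = 0;
  }
  VOffsetPart = Overflow;
}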
6311 
6312 // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
6313 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
6314 // pointed to by Offsets.
6315 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
6316  SelectionDAG &DAG, SDValue *Offsets,
6317  unsigned Align) const {
6318  SDLoc DL(CombinedOffset);
6319  if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
6320  uint32_t Imm = C->getZExtValue();
6321  uint32_t SOffset, ImmOffset;
6322  if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
6323  Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
6324  Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6325  Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6326  return;
6327  }
6328  }
6329  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
6330  SDValue N0 = CombinedOffset.getOperand(0);
6331  SDValue N1 = CombinedOffset.getOperand(1);
6332  uint32_t SOffset, ImmOffset;
6333  int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
6334  if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
6335  Subtarget, Align)) {
6336  Offsets[0] = N0;
6337  Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6338  Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6339  return;
6340  }
6341  }
6342  Offsets[0] = CombinedOffset;
6343  Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
6344  Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
6345 }
6346 
6349  const SDLoc &SL, EVT VT) {
6350  if (VT.bitsLT(Op.getValueType()))
6351  return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
6352 
6353  switch (ExtType) {
6354  case ISD::SEXTLOAD:
6355  return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
6356  case ISD::ZEXTLOAD:
6357  return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
6358  case ISD::EXTLOAD:
6359  return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
6360  case ISD::NON_EXTLOAD:
6361  return Op;
6362  }
6363 
6364  llvm_unreachable("invalid ext type");
6365 }
6366 
6367 SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
6368  SelectionDAG &DAG = DCI.DAG;
6369  if (Ld->getAlignment() < 4 || Ld->isDivergent())
6370  return SDValue();
6371 
6372  // FIXME: Constant loads should all be marked invariant.
6373  unsigned AS = Ld->getAddressSpace();
6374  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
6376  (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
6377  return SDValue();
6378 
6379  // Don't do this early, since it may interfere with adjacent load merging for
6380  // illegal types. We can avoid losing alignment information for exotic types
6381  // pre-legalize.
6382  EVT MemVT = Ld->getMemoryVT();
6383  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
6384  MemVT.getSizeInBits() >= 32)
6385  return SDValue();
6386 
6387  SDLoc SL(Ld);
6388 
6389  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
6390  "unexpected vector extload");
6391 
6392  // TODO: Drop only high part of range.
6393  SDValue Ptr = Ld->getBasePtr();
6395  MVT::i32, SL, Ld->getChain(), Ptr,
6396  Ld->getOffset(),
6397  Ld->getPointerInfo(), MVT::i32,
6398  Ld->getAlignment(),
6399  Ld->getMemOperand()->getFlags(),
6400  Ld->getAAInfo(),
6401  nullptr); // Drop ranges
6402 
6403  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
6404  if (MemVT.isFloatingPoint()) {
6405  assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
6406  "unexpected fp extload");
6407  TruncVT = MemVT.changeTypeToInteger();
6408  }
6409 
6410  SDValue Cvt = NewLoad;
6411  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
6412  Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
6413  DAG.getValueType(TruncVT));
6414  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
6416  Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
6417  } else {
6419  }
6420 
6421  EVT VT = Ld->getValueType(0);
6422  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
6423 
6424  DCI.AddToWorklist(Cvt.getNode());
6425 
6426  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
6427  // the appropriate extension from the 32-bit load.
6428  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
6429  DCI.AddToWorklist(Cvt.getNode());
6430 
6431  // Handle conversion back to floating point if necessary.
6432  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
6433 
6434  return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
6435 }
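As an illustration of the widening above (a hedged reading of the code, not an extra transform): a 4-byte-aligned sign-extending i16 load from the constant address space becomes a full dword load plus an in-register extension, roughly

   (sextload i16, constant addrspace, align >= 4)
     -> (sign_extend_inreg (load i32 ...), i16)

A zero-extending load uses a zero-extend-in-reg instead, and the final bitcast restores a floating-point result type when needed.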
6436 
6437 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
6438  SDLoc DL(Op);
6439  LoadSDNode *Load = cast<LoadSDNode>(Op);
6440  ISD::LoadExtType ExtType = Load->getExtensionType();
6441  EVT MemVT = Load->getMemoryVT();
6442 
6443  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
6444  if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
6445  return SDValue();
6446 
6447  // FIXME: Copied from PPC
6448  // First, load into 32 bits, then truncate to 1 bit.
6449 
6450  SDValue Chain = Load->getChain();
6451  SDValue BasePtr = Load->getBasePtr();
6452  MachineMemOperand *MMO = Load->getMemOperand();
6453 
6454  EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
6455 
6456  SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
6457  BasePtr, RealMemVT, MMO);
6458 
6459  SDValue Ops[] = {
6460  DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
6461  NewLD.getValue(1)
6462  };
6463 
6464  return DAG.getMergeValues(Ops, DL);
6465  }
6466 
6467  if (!MemVT.isVector())
6468  return SDValue();
6469 
6470  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
6471  "Custom lowering for non-i32 vectors hasn't been implemented.");
6472 
6473  unsigned Alignment = Load->getAlignment();
6474  unsigned AS = Load->getAddressSpace();
6475  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
6476  AS, Alignment)) {
6477  SDValue Ops[2];
6478  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
6479  return DAG.getMergeValues(Ops, DL);
6480  }
6481 
6482  MachineFunction &MF = DAG.getMachineFunction();
6483  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6484  // If there is a possibility that flat instructions access scratch memory
6485  // then we need to use the same legalization rules we use for private.
6486  if (AS == AMDGPUAS::FLAT_ADDRESS)
6487  AS = MFI->hasFlatScratchInit() ?
6488  AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
6489 
6490  unsigned NumElements = MemVT.getVectorNumElements();
6491 
6492  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6494  if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32)
6495  return SDValue();
6496  // Non-uniform loads will be selected to MUBUF instructions, so they
6497  // have the same legalization requirements as global and private
6498  // loads.
6499  //
6500  }
6501 
6502  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6504  AS == AMDGPUAS::GLOBAL_ADDRESS) {
6505  if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
6506  !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
6507  Alignment >= 4 && NumElements < 32)
6508  return SDValue();
6509  // Non-uniform loads will be selected to MUBUF instructions, so they
6510  // have the same legalization requirements as global and private
6511  // loads.
6512  //
6513  }
6514  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6516  AS == AMDGPUAS::GLOBAL_ADDRESS ||
6517  AS == AMDGPUAS::FLAT_ADDRESS) {
6518  if (NumElements > 4)
6519  return SplitVectorLoad(Op, DAG);
6520  // v4 loads are supported for private and global memory.
6521  return SDValue();
6522  }
6523  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
6524  // Depending on the setting of the private_element_size field in the
6525  // resource descriptor, we can only make private accesses up to a certain
6526  // size.
6527  switch (Subtarget->getMaxPrivateElementSize()) {
6528  case 4:
6529  return scalarizeVectorLoad(Load, DAG);
6530  case 8:
6531  if (NumElements > 2)
6532  return SplitVectorLoad(Op, DAG);
6533  return SDValue();
6534  case 16:
6535  // Same as global/flat
6536  if (NumElements > 4)
6537  return SplitVectorLoad(Op, DAG);
6538  return SDValue();
6539  default:
6540  llvm_unreachable("unsupported private_element_size");
6541  }
6542  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
6543  // Use ds_read_b128 if possible.
6544  if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
6545  MemVT.getStoreSize() == 16)
6546  return SDValue();
6547 
6548  if (NumElements > 2)
6549  return SplitVectorLoad(Op, DAG);
6550 
6551  // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6552  // address is negative, then the instruction is incorrectly treated as
6553  // out-of-bounds even if base + offsets is in bounds. Split vectorized
6554  // loads here to avoid emitting ds_read2_b32. We may re-combine the
6555  // load later in the SILoadStoreOptimizer.
6556  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6557  NumElements == 2 && MemVT.getStoreSize() == 8 &&
6558  Load->getAlignment() < 8) {
6559  return SplitVectorLoad(Op, DAG);
6560  }
6561  }
6562  return SDValue();
6563 }
6564 
6565 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
6566  EVT VT = Op.getValueType();
6567  assert(VT.getSizeInBits() == 64);
6568 
6569  SDLoc DL(Op);
6570  SDValue Cond = Op.getOperand(0);
6571 
6572  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
6573  SDValue One = DAG.getConstant(1, DL, MVT::i32);
6574 
6575  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
6576  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
6577 
6578  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
6579  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
6580 
6581  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
6582 
6583  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
6584  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
6585 
6586  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
6587 
6588  SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
6589  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
6590 }
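Spelled out as a rewrite (a sketch), the 64-bit select above becomes two 32-bit selects on the bitcast halves:

   (select cond, x:i64, y:i64)
     -> lo = (select cond, (extract_elt (bitcast x to v2i32), 0),
                           (extract_elt (bitcast y to v2i32), 0))
        hi = (select cond, (extract_elt (bitcast x to v2i32), 1),
                           (extract_elt (bitcast y to v2i32), 1))
     -> (bitcast (build_vector lo, hi) to i64)

The same applies to f64 operands, since only the 64-bit size is asserted above.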
6591 
6592 // Catch division cases where we can use shortcuts with rcp and rsq
6593 // instructions.
6594 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
6595  SelectionDAG &DAG) const {
6596  SDLoc SL(Op);
6597  SDValue LHS = Op.getOperand(0);
6598  SDValue RHS = Op.getOperand(1);
6599  EVT VT = Op.getValueType();
6600  const SDNodeFlags Flags = Op->getFlags();
6601  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
6602 
6603  if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
6604  return SDValue();
6605 
6606  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
6607  if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
6608  if (CLHS->isExactlyValue(1.0)) {
6609  // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
6610  // the CI documentation has a worst case error of 1 ulp.
6611  // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
6612  // use it as long as we aren't trying to use denormals.
6613  //
6614  // v_rcp_f16 and v_rsq_f16 DO support denormals.
6615 
6616  // 1.0 / sqrt(x) -> rsq(x)
6617 
6618  // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
6619  // error seems really high at 2^29 ULP.
6620  if (RHS.getOpcode() == ISD::FSQRT)
6621  return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
6622 
6623  // 1.0 / x -> rcp(x)
6624  return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6625  }
6626 
6627  // Same as for 1.0, but expand the sign out of the constant.
6628  if (CLHS->isExactlyValue(-1.0)) {
6629  // -1.0 / x -> rcp (fneg x)
6630  SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
6631  return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
6632  }
6633  }
6634  }
6635 
6636  if (Unsafe) {
6637  // Turn into multiply by the reciprocal.
6638  // x / y -> x * (1.0 / y)
6639  SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6640  return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
6641  }
6642 
6643  return SDValue();
6644 }
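Summarizing the shortcuts above as rewrites (each one only fires under the conditions checked in the code, e.g. unsafe-fp-math / allow-reciprocal or a type for which rcp is acceptable):

   1.0 / sqrt(x)  -> rsq(x)
   1.0 / x        -> rcp(x)
   -1.0 / x       -> rcp(fneg x)
   x / y          -> fmul x, rcp(y)      (the general unsafe case)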
6645 
6646 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6647  EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
6648  if (GlueChain->getNumValues() <= 1) {
6649  return DAG.getNode(Opcode, SL, VT, A, B);
6650  }
6651 
6652  assert(GlueChain->getNumValues() == 3);
6653 
6654  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6655  switch (Opcode) {
6656  default: llvm_unreachable("no chain equivalent for opcode");
6657  case ISD::FMUL:
6658  Opcode = AMDGPUISD::FMUL_W_CHAIN;
6659  break;
6660  }
6661 
6662  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
6663  GlueChain.getValue(2));
6664 }
6665 
6666 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6667  EVT VT, SDValue A, SDValue B, SDValue C,
6668  SDValue GlueChain) {
6669  if (GlueChain->getNumValues() <= 1) {
6670  return DAG.getNode(Opcode, SL, VT, A, B, C);
6671  }
6672 
6673  assert(GlueChain->getNumValues() == 3);
6674 
6675  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6676  switch (Opcode) {
6677  default: llvm_unreachable("no chain equivalent for opcode");
6678  case ISD::FMA:
6679  Opcode = AMDGPUISD::FMA_W_CHAIN;
6680  break;
6681  }
6682 
6683  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
6684  GlueChain.getValue(2));
6685 }
6686 
6687 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
6688  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
6689  return FastLowered;
6690 
6691  SDLoc SL(Op);
6692  SDValue Src0 = Op.getOperand(0);
6693  SDValue Src1 = Op.getOperand(1);
6694 
6695  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6696  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6697 
6698  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
6699  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
6700 
6701  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
6702  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
6703 
6704  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
6705 }
6706 
6707 // Faster 2.5 ULP division that does not support denormals.
6708 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
6709  SDLoc SL(Op);
6710  SDValue LHS = Op.getOperand(1);
6711  SDValue RHS = Op.getOperand(2);
6712 
6713  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
6714 
6715  const APFloat K0Val(BitsToFloat(0x6f800000));
6716  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
6717 
6718  const APFloat K1Val(BitsToFloat(0x2f800000));
6719  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
6720 
6721  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
6722 
6723  EVT SetCCVT =
6724  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
6725 
6726  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
6727 
6728  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
6729 
6730  // TODO: Should this propagate fast-math-flags?
6731  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
6732 
6733  // rcp does not support denormals.
6734  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
6735 
6736  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
6737 
6738  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
6739 }
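A scalar C++ sketch of the scaling trick above (illustrative only; v_rcp_f32 is modelled as an exact reciprocal, and 0x6f800000 / 0x2f800000 are the IEEE-754 binary32 encodings of 2^96 and 2^-32):

static float fdivFastSketch(float LHS, float RHS) {
  const float K0 = std::ldexp(1.0f, 96);   // threshold, bit pattern 0x6f800000
  const float K1 = std::ldexp(1.0f, -32);  // scale factor, bit pattern 0x2f800000
  // Pre-scale very large denominators so the reciprocal stays away from the
  // denormal range (v_rcp_f32 flushes denormals), then undo the scale.
  float Scale = (std::fabs(RHS) > K0) ? K1 : 1.0f;
  float Rcp = 1.0f / (RHS * Scale);        // stands in for v_rcp_f32
  return Scale * (LHS * Rcp);
}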
6740 
6741 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
6742  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
6743  return FastLowered;
6744 
6745  SDLoc SL(Op);
6746  SDValue LHS = Op.getOperand(0);
6747  SDValue RHS = Op.getOperand(1);
6748 
6749  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
6750 
6751  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
6752 
6753  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
6754  RHS, RHS, LHS);
6755  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
6756  LHS, RHS, LHS);
6757 
6758  // Denominator is scaled to not be denormal, so using rcp is ok.
6759  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
6760  DenominatorScaled);
6761  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
6762  DenominatorScaled);
6763 
6764  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
6767 
6768  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
6769 
6770  if (!Subtarget->hasFP32Denormals()) {
6771  SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
6772  const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
6773  SL, MVT::i32);
6774  SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
6775  DAG.getEntryNode(),
6776  EnableDenormValue, BitField);
6777  SDValue Ops[3] = {
6778  NegDivScale0,
6779  EnableDenorm.getValue(0),
6780  EnableDenorm.getValue(1)
6781  };
6782 
6783  NegDivScale0 = DAG.getMergeValues(Ops, SL);
6784  }
6785 
6786  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
6787  ApproxRcp, One, NegDivScale0);
6788 
6789  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
6790  ApproxRcp, Fma0);
6791 
6792  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
6793  Fma1, Fma1);
6794 
6795  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
6796  NumeratorScaled, Mul);
6797 
6798  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
6799 
6800  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
6801  NumeratorScaled, Fma3);
6802 
6803  if (!Subtarget->hasFP32Denormals()) {
6804  const SDValue DisableDenormValue =
6805  DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
6806  SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
6807  Fma4.getValue(1),
6808  DisableDenormValue,
6809  BitField,
6810  Fma4.getValue(2));
6811 
6812  SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
6813  DisableDenorm, DAG.getRoot());
6814  DAG.setRoot(OutputChain);
6815  }
6816 
6817  SDValue Scale = NumeratorScaled.getValue(1);
6818  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
6819  Fma4, Fma1, Fma3, Scale);
6820 
6821  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
6822 }
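Reading the FMA chain above as plain arithmetic (a sketch; n and d denote the div_scale'd numerator and denominator, r0 the raw reciprocal approximation):

   e0  = 1 - d*r0                 (Fma0)
   r1  = r0 + r0*e0               (Fma1: one Newton-Raphson step on the reciprocal)
   q0  = n*r1                     (Mul)
   q1  = q0 + r1*(n - d*q0)       (Fma2 is the residual, Fma3 the refined quotient)
   rem = n - d*q1                 (Fma4: final residual)

v_div_fmas then combines rem, r1 and q1 using the scale bit produced by div_scale, and v_div_fixup handles the special cases (zero, infinite and NaN inputs).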
6823 
6824 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
6825  if (DAG.getTarget().Options.UnsafeFPMath)
6826  return lowerFastUnsafeFDIV(Op, DAG);
6827 
6828  SDLoc SL(Op);
6829  SDValue X = Op.getOperand(0);
6830  SDValue Y = Op.getOperand(1);
6831 
6832  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
6833 
6834  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
6835 
6836  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
6837 
6838  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
6839 
6840  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
6841 
6842  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
6843 
6844  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
6845 
6846  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
6847 
6848  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
6849 
6850  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
6851  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
6852 
6853  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
6854  NegDivScale0, Mul, DivScale1);
6855 
6856  SDValue Scale;
6857 
6858  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
6859  // Work around a hardware bug on SI where the condition output from div_scale
6860  // is not usable.
6861 
6862  const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
6863 
6864  // Figure out which scale to use for div_fmas.
6865  SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
6866  SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
6867  SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
6868  SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
6869 
6870  SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
6871  SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
6872 
6873  SDValue Scale0Hi
6874  = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
6875  SDValue Scale1Hi
6876  = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
6877 
6878  SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
6879  SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
6880  Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
6881  } else {
6882  Scale = DivScale1.getValue(1);
6883  }
6884 
6885  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
6886  Fma4, Fma3, Mul, Scale);
6887 
6888  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
6889 }
6890 
6891 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
6892  EVT VT = Op.getValueType();
6893 
6894  if (VT == MVT::f32)
6895  return LowerFDIV32(Op, DAG);
6896 
6897  if (VT == MVT::f64)
6898  return LowerFDIV64(Op, DAG);
6899 
6900  if (VT == MVT::f16)
6901  return LowerFDIV16(Op, DAG);
6902 
6903  llvm_unreachable("Unexpected type for fdiv");
6904 }
6905 
6906 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
6907  SDLoc DL(Op);
6908  StoreSDNode *Store = cast<StoreSDNode>(Op);
6909  EVT VT = Store->getMemoryVT();
6910 
6911  if (VT == MVT::i1) {
6912  return DAG.getTruncStore(Store->getChain(), DL,
6913  DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
6914  Store->getBasePtr(), MVT::i1, Store->getMemOperand());
6915  }
6916 
6917  assert(VT.isVector() &&
6918  Store->getValue().getValueType().getScalarType() == MVT::i32);
6919 
6920  unsigned AS = Store->getAddressSpace();
6921  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
6922  AS, Store->getAlignment())) {
6923  return expandUnalignedStore(Store, DAG);
6924  }
6925 
6926  MachineFunction &MF = DAG.getMachineFunction();
6927  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6928  // If there is a possibility that flat instructions access scratch memory
6929  // then we need to use the same legalization rules we use for private.
6930  if (AS == AMDGPUAS::FLAT_ADDRESS)
6931  AS = MFI->hasFlatScratchInit() ?
6932  AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
6933 
6934  unsigned NumElements = VT.getVectorNumElements();
6935  if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
6936  AS == AMDGPUAS::FLAT_ADDRESS) {
6937  if (NumElements > 4)
6938  return SplitVectorStore(Op, DAG);
6939  return SDValue();
6940  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
6941  switch (Subtarget->getMaxPrivateElementSize()) {
6942  case 4:
6943  return scalarizeVectorStore(Store, DAG);
6944  case 8:
6945  if (NumElements > 2)
6946  return SplitVectorStore(Op, DAG);
6947  return SDValue();
6948  case 16:
6949  if (NumElements > 4)
6950  return SplitVectorStore(Op, DAG);
6951  return SDValue();
6952  default:
6953  llvm_unreachable("unsupported private_element_size");
6954  }
6955  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
6956  // Use ds_write_b128 if possible.
6957  if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
6958  VT.getStoreSize() == 16)
6959  return SDValue();
6960 
6961  if (NumElements > 2)
6962  return SplitVectorStore(Op, DAG);
6963 
6964  // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6965  // address is negative, then the instruction is incorrectly treated as
6966  // out-of-bounds even if base + offsets is in bounds. Split vectorized
6967  // stores here to avoid emitting ds_write2_b32. We may re-combine the
6968  // store later in the SILoadStoreOptimizer.
6969  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6970  NumElements == 2 && VT.getStoreSize() == 8 &&
6971  Store->getAlignment() < 8) {
6972  return SplitVectorStore(Op, DAG);
6973  }
6974 
6975  return SDValue();
6976  } else {
6977  llvm_unreachable("unhandled address space");
6978  }
6979 }
6980 
6981 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
6982  SDLoc DL(Op);
6983  EVT VT = Op.getValueType();
6984  SDValue Arg = Op.getOperand(0);
6985  SDValue TrigVal;
6986 
6987  // TODO: Should this propagate fast-math-flags?
6988 
6989  SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
6990 
6991  if (Subtarget->hasTrigReducedRange()) {
6992  SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
6993  TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
6994  } else {
6995  TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
6996  }
6997 
6998  switch (Op.getOpcode()) {
6999  case ISD::FCOS:
7000  return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
7001  case ISD::FSIN:
7002  return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
7003  default:
7004  llvm_unreachable("Wrong trig opcode");
7005  }
7006 }
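In equation form, the lowering above is roughly:

   sin(x) -> SIN_HW(fract(x * 0.5/PI))    on subtargets with hasTrigReducedRange()
   sin(x) -> SIN_HW(x * 0.5/PI)           otherwise

and cos(x) is handled identically with COS_HW; in both cases the hardware unit is fed the argument scaled by 1/(2*pi) rather than in radians.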
7007 
7008 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
7009  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
7010  assert(AtomicNode->isCompareAndSwap());
7011  unsigned AS = AtomicNode->getAddressSpace();
7012 
7013  // No custom lowering required for local address space
7014  if (!isFlatGlobalAddrSpace(AS))
7015  return Op;
7016 
7017  // Non-local address spaces require custom lowering for atomic compare
7018  // and swap; the cmp and swap values are packed into a v2i32 (or v2i64 for _X2)
7019  SDLoc DL(Op);
7020  SDValue ChainIn = Op.getOperand(0);
7021  SDValue Addr = Op.getOperand(1);
7022  SDValue Old = Op.getOperand(2);
7023  SDValue New = Op.getOperand(3);
7024  EVT VT = Op.getValueType();
7025  MVT SimpleVT = VT.getSimpleVT();
7026  MVT VecType = MVT::getVectorVT(SimpleVT, 2);
7027 
7028  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
7029  SDValue Ops[] = { ChainIn, Addr, NewOld };
7030 
7031  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
7032  Ops, VT, AtomicNode->getMemOperand());
7033 }
7034 
7035 //===----------------------------------------------------------------------===//
7036 // Custom DAG optimizations
7037 //===----------------------------------------------------------------------===//
7038 
7039 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
7040  DAGCombinerInfo &DCI) const {
7041  EVT VT = N->getValueType(0);
7042  EVT ScalarVT = VT.getScalarType();
7043  if (ScalarVT != MVT::f32)
7044  return SDValue();
7045 
7046  SelectionDAG &DAG = DCI.DAG;
7047  SDLoc DL(N);
7048 
7049  SDValue Src = N->getOperand(0);
7050  EVT SrcVT = Src.getValueType();
7051 
7052  // TODO: We could try to match extracting the higher bytes, which would be
7053  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
7054  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
7055  // about in practice.
7056  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
7057  if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
7058  SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
7059  DCI.AddToWorklist(Cvt.getNode());
7060  return Cvt;
7061  }
7062  }
7063 
7064  return SDValue();
7065 }
7066 
7067 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
7068 
7069 // This is a variant of
7070 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
7071 //
7072 // The normal DAG combiner will do this, but only if the add has one use, since
7073 // otherwise it would increase the number of instructions.
7074 //
7075 // This prevents us from seeing a constant offset that can be folded into a
7076 // memory instruction's addressing mode. If we know the resulting add offset of
7077 // a pointer can be folded into an addressing offset, we can replace the pointer
7078 // operand with the add of new constant offset. This eliminates one of the uses,
7079 // and may allow the remaining use to also be simplified.
7080 //
7081 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
7082  unsigned AddrSpace,
7083  EVT MemVT,
7084  DAGCombinerInfo &DCI) const {
7085  SDValue N0 = N->getOperand(0);
7086  SDValue N1 = N->getOperand(1);
7087 
7088  // We only do this to handle cases where it's profitable when there are
7089  // multiple uses of the add, so defer to the standard combine.
7090  if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
7091  N0->hasOneUse())
7092  return SDValue();
7093 
7094  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
7095  if (!CN1)
7096  return SDValue();
7097 
7098  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
7099  if (!CAdd)
7100  return SDValue();
7101 
7102  // If the resulting offset is too large, we can't fold it into the addressing
7103  // mode offset.
7104  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
7105  Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
7106 
7107  AddrMode AM;
7108  AM.HasBaseReg = true;
7109  AM.BaseOffs = Offset.getSExtValue();
7110  if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
7111  return SDValue();
7112 
7113  SelectionDAG &DAG = DCI.DAG;
7114  SDLoc SL(N);
7115  EVT VT = N->getValueType(0);
7116 
7117  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
7118  SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
7119 
7120  SDNodeFlags Flags;
7121  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
7122  (N0.getOpcode() == ISD::OR ||
7123  N0->getFlags().hasNoUnsignedWrap()));
7124 
7125  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
7126 }
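A hypothetical instance of the combine above (the constants are made up for illustration): with c1 = 16 and c2 = 2,

   (shl (add x, 16), 2)  ->  (add (shl x, 2), 64)

which lets the +64 be folded into the memory instruction's immediate offset, provided isLegalAddressingMode accepts a 64-byte base offset for that address space; the nuw flag is only propagated under the conditions set up above.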
7127 
7128 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
7129  DAGCombinerInfo &DCI) const {
7130  SDValue Ptr = N->getBasePtr();
7131  SelectionDAG &DAG = DCI.DAG;
7132  SDLoc SL(N);
7133 
7134  // TODO: We could also do this for multiplies.
7135  if (Ptr.getOpcode() == ISD::SHL) {
7136  SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
7137  N->getMemoryVT(), DCI);
7138  if (NewPtr) {
7139  SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
7140 
7141  NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
7142  return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
7143  }
7144  }
7145 
7146  return SDValue();
7147 }
7148 
7149 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
7150  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
7151  (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
7152  (Opc == ISD::XOR && Val == 0);
7153 }
7154 
7155 // Break up a 64-bit bit operation with a constant into two 32-bit and/or/xor ops. This
7156 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
7157 // integer combine opportunities since most 64-bit operations are decomposed
7158 // this way. TODO: We won't want this for SALU especially if it is an inline
7159 // immediate.
7160 SDValue SITargetLowering::splitBinaryBitConstantOp(
7161  DAGCombinerInfo &DCI,
7162  const SDLoc &SL,
7163  unsigned Opc, SDValue LHS,
7164  const ConstantSDNode *CRHS) const {
7165  uint64_t Val = CRHS->getZExtValue();
7166  uint32_t ValLo = Lo_32(Val);
7167  uint32_t ValHi = Hi_32(Val);
7168  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7169 
7170  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
7171  bitOpWithConstantIsReducible(Opc, ValHi)) ||
7172  (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
7173  // If we need to materialize a 64-bit immediate, it will be split up later
7174  // anyway. Avoid creating the harder to understand 64-bit immediate
7175  // materialization.
7176  return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
7177  }
7178 
7179  return SDValue();
7180 }
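For instance (an illustrative case): xor of an i64 value with 0x8000000000000000 is split into per-half 32-bit operations; the low half's (xor lo, 0) is reducible per the helper above and folds away in the generic combiner, leaving a single 32-bit xor of the high half with 0x80000000.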
7181 
7182 // Returns true if the argument is a boolean value which is not serialized into
7183 // memory or an argument and does not require v_cndmask_b32 to be deserialized.
7184 static bool isBoolSGPR(SDValue V) {
7185  if (V.getValueType() != MVT::i1)
7186  return false;
7187  switch (V.getOpcode()) {
7188  default: break;
7189  case ISD::SETCC:
7190  case ISD::AND:
7191  case ISD::OR:
7192  case ISD::XOR:
7193  case AMDGPUISD::FP_CLASS:
7194  return true;
7195  }
7196  return false;
7197 }
7198 
7199 // If a constant has all zeroes or all ones within each byte return it.
7200 // Otherwise return 0.
7201 static uint32_t getConstantPermuteMask(uint32_t C) {
7202  // 0xff for any zero byte in the mask
7203  uint32_t ZeroByteMask = 0;
7204  if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
7205  if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
7206  if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
7207  if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
7208  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
7209  if ((NonZeroByteMask & C) != NonZeroByteMask)
7210  return 0; // Partial bytes selected.
7211  return C;
7212 }
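A few hypothetical inputs illustrating the rule above:

   getConstantPermuteMask(0x00ff00ff) == 0x00ff00ff   // every byte is all ones or all zeros
   getConstantPermuteMask(0xff00ff00) == 0xff00ff00
   getConstantPermuteMask(0x00f000ff) == 0            // byte 2 (0xf0) is only partially selected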
7213 
7214 // Check if a node selects whole bytes from its operand 0 starting at a byte
7215 // boundary while masking the rest. Returns the select mask as used by v_perm_b32,
7216 // or all-ones (~0) if it does not succeed.
7217 // Note byte select encoding:
7218 // value 0-3 selects corresponding source byte;
7219 // value 0xc selects zero;
7220 // value 0xff selects 0xff.
7222  assert(V.getValueSizeInBits() == 32);
7223 
7224  if (V.getNumOperands() != 2)
7225  return ~0;
7226 
7227  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
7228  if (!N1)
7229  return ~0;
7230 
7231  uint32_t C = N1->getZExtValue();
7232 
7233  switch (V.getOpcode()) {
7234  default:
7235  break;
7236  case ISD::AND:
7237  if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7238  return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
7239  }
7240  break;
7241 
7242  case ISD::OR:
7243  if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7244  return (0x03020100 & ~ConstMask) | ConstMask;
7245  }
7246  break;
7247 
7248  case ISD::SHL:
7249  if (C % 8)
7250  return ~0;
7251 
7252  return uint32_t((0x030201000c0c0c0cull << C) >> 32);
7253 
7254  case ISD::SRL:
7255  if (C % 8)
7256  return ~0;
7257 
7258  return uint32_t(0x0c0c0c0c03020100ull >> C);
7259  }
7260 
7261  return ~0;
7262 }
7263 
7264 SDValue SITargetLowering::performAndCombine(SDNode *N,
7265  DAGCombinerInfo &DCI) const {
7266  if (DCI.isBeforeLegalize())
7267  return SDValue();
7268 
7269  SelectionDAG &DAG = DCI.DAG;
7270  EVT VT = N->getValueType(0);
7271  SDValue LHS = N->getOperand(0);
7272  SDValue RHS = N->getOperand(1);
7273 
7274 
7275  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7276  if (VT == MVT::i64 && CRHS) {
7277  if (SDValue Split
7278  = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
7279  return Split;
7280  }
7281 
7282  if (CRHS && VT == MVT::i32) {
7283  // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
7284  // nb = number of trailing zeroes in mask
7285  // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
7286  // given that we are selecting 8 or 16 bit fields starting at byte boundary.
7287  uint64_t Mask = CRHS->getZExtValue();
7288  unsigned Bits = countPopulation(Mask);
7289  if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
7290  (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
7291  if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
7292  unsigned Shift = CShift->getZExtValue();
7293  unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
7294  unsigned Offset = NB + Shift;
7295  if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
7296  SDLoc SL(N);
7297  SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
7298  LHS->getOperand(0),
7299  DAG.getConstant(Offset, SL, MVT::i32),
7300  DAG.getConstant(Bits, SL, MVT::i32));
7301  EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
7302  SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
7303  DAG.getValueType(NarrowVT));
7304  SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
7305  DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
7306  return Shl;
7307  }
7308  }
7309  }
7310 
7311  // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7312  if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
7313  isa<ConstantSDNode>(LHS.getOperand(2))) {
7314  uint32_t Sel = getConstantPermuteMask(Mask);
7315  if (!Sel)
7316  return SDValue();
7317 
7318  // Select 0xc for all zero bytes
7319  Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
7320  SDLoc DL(N);
7321  return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7322  LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7323  }
7324  }
7325 
7326  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
7327  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
7328  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
7329  ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7330  ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
7331 
7332  SDValue X = LHS.getOperand(0);
7333  SDValue Y = RHS.getOperand(0);
7334  if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
7335  return SDValue();
7336 
7337  if (LCC == ISD::SETO) {
7338  if (X != LHS.getOperand(1))
7339  return SDValue();
7340 
7341  if (RCC == ISD::SETUNE) {
7342  const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
7343  if (!C1 || !C1->isInfinity() || C1->isNegative())
7344  return SDValue();
7345 
7352 
7346  const uint32_t Mask = SIInstrFlags::N_ZERO |
7347  SIInstrFlags::P_ZERO |
7348  SIInstrFlags::N_SUBNORMAL |
7349  SIInstrFlags::P_SUBNORMAL |
7350  SIInstrFlags::N_NORMAL |
7351  SIInstrFlags::P_NORMAL;
7353  static_assert(((~(SIInstrFlags::S_NAN |
7354  SIInstrFlags::Q_NAN |
7355  SIInstrFlags::N_INFINITY |
7356  SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
7357  "mask not equal");
7358 
7359  SDLoc DL(N);
7360  return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7361  X, DAG.getConstant(Mask, DL, MVT::i32));
7362  }
7363  }
7364  }
7365 
7366  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
7367  std::swap(LHS, RHS);
7368 
7369  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7370  RHS.hasOneUse()) {
7371  ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7372  // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
7373  // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
7374  const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7375  if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
7376  (RHS.getOperand(0) == LHS.getOperand(0) &&
7377  LHS.getOperand(0) == LHS.getOperand(1))) {
7378  const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
7379  unsigned NewMask = LCC == ISD::SETO ?
7380  Mask->getZExtValue() & ~OrdMask :
7381  Mask->getZExtValue() & OrdMask;
7382 
7383  SDLoc DL(N);
7384  return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
7385  DAG.getConstant(NewMask, DL, MVT::i32));
7386  }
7387  }
7388 
7389  if (VT == MVT::i32 &&
7390  (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
7391  // and x, (sext cc from i1) => select cc, x, 0
7392  if (RHS.getOpcode() != ISD::SIGN_EXTEND)
7393  std::swap(LHS, RHS);
7394  if (isBoolSGPR(RHS.getOperand(0)))
7395  return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
7396  LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
7397  }
7398 
7399  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7400  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7401  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7402  N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7403  uint32_t LHSMask = getPermuteMask(DAG, LHS);
7404  uint32_t RHSMask = getPermuteMask(DAG, RHS);
7405  if (LHSMask != ~0u && RHSMask != ~0u) {
7406  // Canonicalize the expression in an attempt to have fewer unique masks
7407  // and therefore fewer registers used to hold the masks.
7408  if (LHSMask > RHSMask) {
7409  std::swap(LHSMask, RHSMask);
7410  std::swap(LHS, RHS);
7411  }
7412 
7413  // Select 0xc for each lane used from source operand. Zero has 0xc mask
7414  // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
7415  uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7416  uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7417 
7418  // Check if we need to combine values from two sources within a byte.
7419  if (!(LHSUsedLanes & RHSUsedLanes) &&
7420  // If we select the high and low words, keep it for SDWA.
7421  // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7422  !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7423  // Each byte in each mask is either a selector value in the 0-3 range or has
7424  // higher bits set: 0xff selects the constant 0xff and 0x0c selects zero.
7425  // If either mask has 0x0c for a byte, the result byte must be 0x0c; otherwise
7426  // the mask that is not 0xff wins. ANDing both masks gives the correct result,
7427  // except that bytes selecting zero must be corrected back to exactly 0x0c.
7428  uint32_t Mask = LHSMask & RHSMask;
7429  for (unsigned I = 0; I < 32; I += 8) {
7430  uint32_t ByteSel = 0xff << I;
7431  if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
7432  Mask &= (0x0c << I) & 0xffffffff;
7433  }
7434 
7435  // Add 4 to each active LHS lane. It will not affect any existing 0xff
7436  // or 0x0c.
7437  uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
7438  SDLoc DL(N);
7439 
7440  return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7441  LHS.getOperand(0), RHS.getOperand(0),
7442  DAG.getConstant(Sel, DL, MVT::i32));
7443  }
7444  }
7445  }
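// (Editor's note: hand-worked example for the (and (op x, c1), (op y, c2))
//  -> perm combine above; illustrative only, not part of the original source.)
// For and (or x, 0x00ffff00), (or y, 0xff0000ff):
//   LHSMask = 0x03ffff00 and RHSMask = 0xff0201ff (disjoint used lanes),
//   Mask = LHSMask & RHSMask = 0x03020100,
//   Sel  = Mask | (LHSUsedLanes & 0x04040404) = 0x07020104,
// i.e. one v_perm_b32 taking bytes 3 and 0 from x and bytes 2 and 1 from y.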
7446 
7447  return SDValue();
7448 }
7449 
7450 SDValue SITargetLowering::performOrCombine(SDNode *N,
7451  DAGCombinerInfo &DCI) const {
7452  SelectionDAG &DAG = DCI.DAG;
7453  SDValue LHS = N->getOperand(0);
7454  SDValue RHS = N->getOperand(1);
7455 
7456  EVT VT = N->getValueType(0);
7457  if (VT == MVT::i1) {
7458  // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
7459  if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7460  RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
7461  SDValue Src = LHS.getOperand(0);
7462  if (Src != RHS.getOperand(0))
7463  return SDValue();
7464 
7465  const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
7466  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7467  if (!CLHS || !CRHS)
7468  return SDValue();
7469 
7470  // Only 10 bits are used.
7471  static const uint32_t MaxMask = 0x3ff;
7472 
7473  uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
7474  SDLoc DL(N);
7475  return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7476  Src, DAG.getConstant(NewMask, DL, MVT::i32));
7477  }
7478 
7479  return SDValue();
7480  }
7481 
7482  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7483  if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
7484  LHS.getOpcode() == AMDGPUISD::PERM &&
7485  isa<ConstantSDNode>(LHS.getOperand(2))) {
7486  uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
7487  if (!Sel)
7488  return SDValue();
7489 
7490  Sel |= LHS.getConstantOperandVal(2);
7491  SDLoc DL(N);
7492  return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7493  LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7494  }
7495 
7496  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7497  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7498  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7499  N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7500  uint32_t LHSMask = getPermuteMask(DAG, LHS);
7501  uint32_t RHSMask = getPermuteMask(DAG, RHS);
7502  if (LHSMask != ~0u && RHSMask != ~0u) {
7503  // Canonicalize the expression in an attempt to have fewer unique masks
7504  // and therefore fewer registers used to hold the masks.
7505  if (LHSMask > RHSMask) {
7506  std::swap(LHSMask, RHSMask);
7507  std::swap(LHS, RHS);
7508  }
7509 
7510  // Select 0xc for each lane used from source operand. Zero has 0xc mask
7511  // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
7512  // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
7513  uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7514 
7515  // Check if we need to combine values from two sources within a byte.
7516  if (!(LHSUsedLanes & RHSUsedLanes) &&
7517  // If we select the high and low words, keep it for SDWA.
7518  // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7519  !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7520  // Kill zero bytes selected by other mask. Zero value is 0xc.
7521  LHSMask &= ~RHSUsedLanes;
7522  RHSMask &= ~LHSUsedLanes;
7523  // Add 4 to each active LHS lane
7524  LHSMask |= LHSUsedLanes & 0x04040404;
7525  // Combine masks
7526  uint32_t Sel = LHSMask | RHSMask;
7527  SDLoc DL(N);
7528 
7529  return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7530  LHS.getOperand(0), RHS.getOperand(0),
7531  DAG.getConstant(Sel, DL, MVT::i32));
7532  }
7533  }
7534  }
7535 
7536  if (VT != MVT::i64)
7537  return SDValue();
7538 
7539  // TODO: This could be a generic combine with a predicate for extracting the
7540  // high half of an integer being free.
7541 
7542  // (or i64:x, (zero_extend i32:y)) ->
7543  // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
7544  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
7545  RHS.getOpcode() != ISD::ZERO_EXTEND)
7546  std::swap(LHS, RHS);
7547 
7548  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
7549  SDValue ExtSrc = RHS.getOperand(0);
7550  EVT SrcVT = ExtSrc.getValueType();
7551  if (SrcVT == MVT::i32) {
7552  SDLoc SL(N);
7553  SDValue LowLHS, HiBits;
7554  std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
7555  SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
7556 
7557  DCI.AddToWorklist(LowOr.getNode());
7558  DCI.AddToWorklist(HiBits.getNode());
7559 
7560  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
7561  LowOr, HiBits);
7562  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7563  }
7564  }
7565 
7566  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
7567  if (CRHS) {
7568  if (SDValue Split
7569  = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
7570  return Split;
7571  }
7572 
7573  return SDValue();
7574 }
7575 
7576 SDValue SITargetLowering::performXorCombine(SDNode *N,
7577  DAGCombinerInfo &DCI) const {
7578  EVT VT = N->getValueType(0);
7579  if (VT != MVT::i64)
7580  return SDValue();
7581 
7582  SDValue LHS = N->getOperand(0);
7583  SDValue RHS = N->getOperand(1);
7584 
7585  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7586  if (CRHS) {
7587  if (SDValue Split
7588  = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
7589  return Split;
7590  }
7591 
7592  return SDValue();
7593 }
7594 
7595 // Instructions that will be lowered with a final instruction that zeros the
7596 // high result bits.
7597 // XXX - probably only need to list legal operations.
7598 static bool fp16SrcZerosHighBits(unsigned Opc) {
7599  switch (Opc) {
7600  case ISD::FADD:
7601  case ISD::FSUB:
7602  case ISD::FMUL:
7603  case ISD::FDIV:
7604  case ISD::FREM:
7605  case ISD::FMA:
7606  case ISD::FMAD:
7607  case ISD::FCANONICALIZE:
7608  case ISD::FP_ROUND:
7609  case ISD::UINT_TO_FP:
7610  case ISD::SINT_TO_FP:
7611  case ISD::FABS:
7612  // Fabs is lowered to a bit operation, but it's an and which will clear the
7613  // high bits anyway.
7614  case ISD::FSQRT:
7615  case ISD::FSIN:
7616  case ISD::FCOS:
7617  case ISD::FPOWI:
7618  case ISD::FPOW:
7619  case ISD::FLOG:
7620  case ISD::FLOG2:
7621  case ISD::FLOG10:
7622  case ISD::FEXP:
7623  case ISD::FEXP2:
7624  case ISD::FCEIL:
7625  case ISD::FTRUNC:
7626  case ISD::FRINT:
7627  case ISD::FNEARBYINT:
7628  case ISD::FROUND:
7629  case ISD::FFLOOR:
7630  case ISD::FMINNUM:
7631  case ISD::FMAXNUM:
7632  case AMDGPUISD::FRACT:
7633  case AMDGPUISD::CLAMP:
7634  case AMDGPUISD::COS_HW:
7635  case AMDGPUISD::SIN_HW:
7636  case AMDGPUISD::FMIN3:
7637  case AMDGPUISD::FMAX3:
7638  case AMDGPUISD::FMED3:
7639  case AMDGPUISD::FMAD_FTZ:
7640  case AMDGPUISD::RCP:
7641  case AMDGPUISD::RSQ:
7642  case AMDGPUISD::RCP_IFLAG:
7643  case AMDGPUISD::LDEXP:
7644  return true;
7645  default:
7646  // fcopysign, select and others may be lowered to 32-bit bit operations
7647  // which don't zero the high bits.
7648  return false;
7649  }
7650 }
7651 
7652 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
7653  DAGCombinerInfo &DCI) const {
7654  if (!Subtarget->has16BitInsts() ||
7655  DCI.getDAGCombineLevel() < AfterLegalizeDAG)
7656  return SDValue();
7657 
7658  EVT VT = N->getValueType(0);
7659  if (VT != MVT::i32)
7660  return SDValue();
7661 
7662  SDValue Src = N->getOperand(0);
7663  if (Src.getValueType() != MVT::i16)
7664  return SDValue();
7665 
7666  // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
7667  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
7668  if (Src.getOpcode() == ISD::BITCAST) {
7669  SDValue BCSrc = Src.getOperand(0);
7670  if (BCSrc.getValueType() == MVT::f16 &&
7671  fp16SrcZerosHighBits(BCSrc.getOpcode()))
7672  return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
7673  }
7674 
7675  return SDValue();
7676 }
7677 
7678 SDValue SITargetLowering::performClassCombine(SDNode *N,
7679  DAGCombinerInfo &DCI) const {
7680  SelectionDAG &DAG = DCI.DAG;
7681  SDValue Mask = N->getOperand(1);
7682 
7683  // fp_class x, 0 -> false
7684  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
7685  if (CMask->isNullValue())
7686  return DAG.getConstant(0, SDLoc(N), MVT::i1);
7687  }
7688 
7689  if (N->getOperand(0).isUndef())
7690  return DAG.getUNDEF(MVT::i1);
7691 
7692  return SDValue();
7693 }
7694 
7695 SDValue SITargetLowering::performRcpCombine(SDNode *N,
7696  DAGCombinerInfo &DCI) const {
7697  EVT VT = N->getValueType(0);
7698  SDValue N0 = N->getOperand(0);
7699 
7700  if (N0.isUndef())
7701  return N0;
7702 
7703  if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
7704  N0.getOpcode() == ISD::SINT_TO_FP)) {
7705  return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
7706  N->getFlags());
7707  }
7708 
7709  return AMDGPUTargetLowering::performRcpCombine(N, DCI);
7710 }
7711 
7712 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
7713  unsigned MaxDepth) const {
7714  unsigned Opcode = Op.getOpcode();
7715  if (Opcode == ISD::FCANONICALIZE)
7716  return true;
7717 
7718  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
7719  auto F = CFP->getValueAPF();
7720  if (F.isNaN() && F.isSignaling())
7721  return false;
7722  return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
7723  }
7724 
7725  // If source is a result of another standard FP operation it is already in
7726  // canonical form.
7727  if (MaxDepth == 0)
7728  return false;
7729 
7730  switch (Opcode) {
7731  // These will flush denorms if required.
7732  case ISD::FADD:
7733  case ISD::FSUB:
7734  case ISD::FMUL:
7735  case ISD::FCEIL:
7736  case ISD::FFLOOR:
7737  case ISD::FMA:
7738  case ISD::FMAD:
7739  case ISD::FSQRT:
7740  case ISD::FDIV:
7741  case ISD::FREM:
7742  case ISD::FP_ROUND:
7743  case ISD::FP_EXTEND:
7745  case AMDGPUISD::FMAD_FTZ:
7746  case AMDGPUISD::RCP:
7747  case AMDGPUISD::RSQ:
7748  case AMDGPUISD::RSQ_CLAMP:
7749  case AMDGPUISD::RCP_LEGACY:
7750  case AMDGPUISD::RSQ_LEGACY:
7751  case AMDGPUISD::RCP_IFLAG:
7752  case AMDGPUISD::TRIG_PREOP:
7753  case AMDGPUISD::DIV_SCALE:
7754  case AMDGPUISD::DIV_FMAS:
7755  case AMDGPUISD::DIV_FIXUP:
7756  case AMDGPUISD::FRACT:
7757  case AMDGPUISD::LDEXP:
7763  return true;
7764 
7765  // It can/will be lowered or combined as a bit operation.
7766  // Need to check their input recursively to handle.
7767  case ISD::FNEG:
7768  case ISD::FABS:
7769  case ISD::FCOPYSIGN:
7770  return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
7771 
7772  case ISD::FSIN:
7773  case ISD::FCOS:
7774  case ISD::FSINCOS:
7775  return Op.getValueType().getScalarType() != MVT::f16;
7776 
7777  case ISD::FMINNUM:
7778  case ISD::FMAXNUM:
7779  case ISD::FMINNUM_IEEE:
7780  case ISD::FMAXNUM_IEEE:
7781  case AMDGPUISD::CLAMP:
7782  case AMDGPUISD::FMED3:
7783  case AMDGPUISD::FMAX3:
7784  case AMDGPUISD::FMIN3: {
7785  // FIXME: Shouldn't treat the generic operations differently based on these.
7786  // However, we aren't really required to flush the result from
7787  // minnum/maxnum.
7788 
7789  // snans will be quieted, so we only need to worry about denormals.
7790  if (Subtarget->supportsMinMaxDenormModes() ||
7791  denormalsEnabledForType(Op.getValueType()))
7792  return true;
7793 
7794  // Flushing may be required.
7795  // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
7796  // targets need to check their input recursively.
7797 
7798  // FIXME: Does this apply with clamp? It's implemented with max.
7799  for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
7800  if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
7801  return false;
7802  }
7803 
7804  return true;
7805  }
7806  case ISD::SELECT: {
7807  return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
7808  isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
7809  }
7810  case ISD::BUILD_VECTOR: {
7811  for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
7812  SDValue SrcOp = Op.getOperand(i);
7813  if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
7814  return false;
7815  }
7816 
7817  return true;
7818  }
7819  case ISD::EXTRACT_VECTOR_ELT:
7820  case ISD::EXTRACT_SUBVECTOR: {
7821  return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
7822  }
7823  case ISD::INSERT_VECTOR_ELT: {
7824  return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
7825  isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
7826  }
7827  case ISD::UNDEF:
7828  // Could be anything.
7829  return false;
7830 
7831  case ISD::BITCAST: {
7832  // Hack around the mess we make when legalizing extract_vector_elt
7833  SDValue Src = Op.getOperand(0);
7834  if (Src.getValueType() == MVT::i16 &&
7835  Src.getOpcode() == ISD::TRUNCATE) {
7836  SDValue TruncSrc = Src.getOperand(0);
7837  if (TruncSrc.getValueType() == MVT::i32 &&
7838  TruncSrc.getOpcode() == ISD::BITCAST &&
7839  TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
7840  return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
7841  }
7842  }
7843 
7844  return false;
7845  }
7846  case ISD::INTRINSIC_WO_CHAIN: {
7847  unsigned IntrinsicID
7848  = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7849  // TODO: Handle more intrinsics
7850  switch (IntrinsicID) {
7855  return true;
7856  default:
7857  break;
7858  }
7859 
7860  LLVM_FALLTHROUGH;
7861  }
7862  default:
7863  return denormalsEnabledForType(Op.getValueType()) &&
7864  DAG.isKnownNeverSNaN(Op);
7865  }
7866 
7867  llvm_unreachable("invalid operation");
7868 }
7869 
7870 // Constant fold canonicalize.
7871 SDValue SITargetLowering::getCanonicalConstantFP(
7872  SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
7873  // Flush denormals to 0 if not enabled.
7874  if (C.isDenormal() && !denormalsEnabledForType(VT))
7875  return DAG.getConstantFP(0.0, SL, VT);
7876 
7877  if (C.isNaN()) {
7878  APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
7879  if (C.isSignaling()) {
7880  // Quiet a signaling NaN.
7881  // FIXME: Is this supposed to preserve payload bits?
7882  return DAG.getConstantFP(CanonicalQNaN, SL, VT);
7883  }
7884 
7885  // Make sure it is the canonical NaN bitpattern.
7886  //
7887  // TODO: Can we use -1 as the canonical NaN value since it's an inline
7888  // immediate?
7889  if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
7890  return DAG.getConstantFP(CanonicalQNaN, SL, VT);
7891  }
7892 
7893  // Already canonical.
7894  return DAG.getConstantFP(C, SL, VT);
7895 }
7896 
7898  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
7899 }
7900 
7901 SDValue SITargetLowering::performFCanonicalizeCombine(
7902  SDNode *N,
7903  DAGCombinerInfo &DCI) const {
7904  SelectionDAG &DAG = DCI.DAG;
7905  SDValue N0 = N->getOperand(0);
7906  EVT VT = N->getValueType(0);
7907 
7908  // fcanonicalize undef -> qnan
7909  if (N0.isUndef()) {
7910  APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
7911  return DAG.getConstantFP(QNaN, SDLoc(N), VT);
7912  }
7913 
7914  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
7915  EVT VT = N->getValueType(0);
7916  return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
7917  }
7918 
7919  // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
7920  // (fcanonicalize k)
7921  //
7922  // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
7923 
7924  // TODO: This could be better with wider vectors that will be split to v2f16,
7925  // and to consider uses since there aren't that many packed operations.
7926  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
7928  SDLoc SL(N);
7929  SDValue NewElts[2];
7930  SDValue Lo = N0.getOperand(0);
7931  SDValue Hi = N0.getOperand(1);
7932  EVT EltVT = Lo.getValueType();
7933 
7935  for (unsigned I = 0; I != 2; ++I) {
7936  SDValue Op = N0.getOperand(I);
7937  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
7938  NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
7939  CFP->getValueAPF());
7940  } else if (Op.isUndef()) {
7941  // Handled below based on what the other operand is.
7942  NewElts[I] = Op;
7943  } else {
7944  NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
7945  }
7946  }
7947 
7948  // If one half is undef, and one is constant, prefer a splat vector rather
7949  // than the normal qNaN. If it's a register, prefer 0.0 since that's
7950  // cheaper to use and may be free with a packed operation.
7951  if (NewElts[0].isUndef()) {
7952  if (isa<ConstantFPSDNode>(NewElts[1]))
7953  NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
7954  NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
7955  }
7956 
7957  if (NewElts[1].isUndef()) {
7958  NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
7959  NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
7960  }
7961 
7962  return DAG.getBuildVector(VT, SL, NewElts);
7963  }
7964  }
7965 
7966  unsigned SrcOpc = N0.getOpcode();
7967 
7968  // If it's free to do so, push canonicalizes further up the source, which may
7969  // find a canonical source.
7970  //
7971  // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
7972  // sNaNs.
7973  if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
7974  auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
7975  if (CRHS && N0.hasOneUse()) {
7976  SDLoc SL(N);
7977  SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
7978  N0.getOperand(0));
7979  SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
7980  DCI.AddToWorklist(Canon0.getNode());
7981 
7982  return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
7983  }
7984  }
7985 
7986  return isCanonicalized(DAG, N0) ? N0 : SDValue();
7987 }
7988 
7989 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
7990  switch (Opc) {
7991  case ISD::FMAXNUM:
7992  case ISD::FMAXNUM_IEEE:
7993  return AMDGPUISD::FMAX3;
7994  case ISD::SMAX:
7995  return AMDGPUISD::SMAX3;
7996  case ISD::UMAX:
7997  return AMDGPUISD::UMAX3;
7998  case ISD::FMINNUM:
7999  case ISD::FMINNUM_IEEE:
8000  return AMDGPUISD::FMIN3;
8001  case ISD::SMIN:
8002  return AMDGPUISD::SMIN3;
8003  case ISD::UMIN:
8004  return AMDGPUISD::UMIN3;
8005  default:
8006  llvm_unreachable("Not a min/max opcode");
8007  }
8008 }
8009 
8010 SDValue SITargetLowering::performIntMed3ImmCombine(
8011  SelectionDAG &DAG, const SDLoc &SL,
8012  SDValue Op0, SDValue Op1, bool Signed) const {
8013  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
8014  if (!K1)
8015  return SDValue();
8016 
8017  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
8018  if (!K0)
8019  return SDValue();
8020 
8021  if (Signed) {
8022  if (K0->getAPIntValue().sge(K1->getAPIntValue()))
8023  return SDValue();
8024  } else {
8025  if (K0->getAPIntValue().uge(K1->getAPIntValue()))
8026  return SDValue();
8027  }
8028 
8029  EVT VT = K0->getValueType(0);
8030  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
8031  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
8032  return DAG.getNode(Med3Opc, SL, VT,
8033  Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
8034  }
8035 
8036  // If there isn't a 16-bit med3 operation, convert to 32-bit.
8037  MVT NVT = MVT::i32;
8038  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8039 
8040  SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
8041  SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
8042  SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
8043 
8044  SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
8045  return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
8046 }
8047 
8048 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
8049  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
8050  return C;
8051 
8052  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
8053  if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
8054  return C;
8055  }
8056 
8057  return nullptr;
8058 }
8059 
8060 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
8061  const SDLoc &SL,
8062  SDValue Op0,
8063  SDValue Op1) const {
8064  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
8065  if (!K1)
8066  return SDValue();
8067 
8068  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
8069  if (!K0)
8070  return SDValue();
8071 
8072  // Ordered >= (although NaN inputs should have folded away by now).
8073  APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
8074  if (Cmp == APFloat::cmpGreaterThan)
8075  return SDValue();
8076 
8077  // TODO: Check IEEE bit enabled?
8078  EVT VT = Op0.getValueType();
8079  if (Subtarget->enableDX10Clamp()) {
8080  // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
8081  // hardware fmed3 behavior converting to a min.
8082  // FIXME: Should this be allowing -0.0?
8083  if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
8084  return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
8085  }
8086 
8087  // med3 for f16 is only available on gfx9+, and not available for v2f16.
8088  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
8089  // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
8090  // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
8091  // then give the other result, which is different from med3 with a NaN
8092  // input.
8093  SDValue Var = Op0.getOperand(0);
8094  if (!DAG.isKnownNeverSNaN(Var))
8095  return SDValue();
8096 
8097  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8098 
8099  if ((!K0->hasOneUse() ||
8100  TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
8101  (!K1->hasOneUse() ||
8102  TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
8103  return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
8104  Var, SDValue(K0, 0), SDValue(K1, 0));
8105  }
8106  }
8107 
8108  return SDValue();
8109 }
8110 
8111 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
8112  DAGCombinerInfo &DCI) const {
8113  SelectionDAG &DAG = DCI.DAG;
8114 
8115  EVT VT = N->getValueType(0);
8116  unsigned Opc = N->getOpcode();
8117  SDValue Op0 = N->getOperand(0);
8118  SDValue Op1 = N->getOperand(1);
8119 
8120  // Only do this if the inner op has one use since this will just increase
8121  // register pressure for no benefit.
8122 
8123 
8124  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
8125  !VT.isVector() && VT != MVT::f64 &&
8126  ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
8127  // max(max(a, b), c) -> max3(a, b, c)
8128  // min(min(a, b), c) -> min3(a, b, c)
8129  if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
8130  SDLoc DL(N);
8131  return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8132  DL,
8133  N->getValueType(0),
8134  Op0.getOperand(0),
8135  Op0.getOperand(1),
8136  Op1);
8137  }
8138 
8139  // Try commuted.
8140  // max(a, max(b, c)) -> max3(a, b, c)
8141  // min(a, min(b, c)) -> min3(a, b, c)
8142  if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
8143  SDLoc DL(N);
8144  return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8145  DL,
8146  N->getValueType(0),
8147  Op0,
8148  Op1.getOperand(0),
8149  Op1.getOperand(1));
8150  }
8151  }
8152 
8153  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
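// (Editor's note: illustrative example, not part of the original source.)
// E.g. for i32, (smin (smax x, -5), 7) with K0 = -5 < K1 = 7 becomes
// (smed3 x, -5, 7), i.e. a single v_med3_i32.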
8154  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
8155  if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
8156  return Med3;
8157  }
8158 
8159  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
8160  if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
8161  return Med3;
8162  }
8163 
8164  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
8165  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
8166  (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
8167  (Opc == AMDGPUISD::FMIN_LEGACY &&
8168  Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
8169  (VT == MVT::f32 || VT == MVT::f64 ||
8170  (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
8171  (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
8172  Op0.hasOneUse()) {
8173  if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
8174  return Res;
8175  }
8176 
8177  return SDValue();
8178 }
8179 
8180 static bool isClampZeroToOne(SDValue A, SDValue B) {
8181  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
8182  if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
8183  // FIXME: Should this be allowing -0.0?
8184  return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
8185  (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
8186  }
8187  }
8188 
8189  return false;
8190 }
8191 
8192 // FIXME: Should only worry about snans for version with chain.
8193 SDValue SITargetLowering::performFMed3Combine(SDNode *N,
8194  DAGCombinerInfo &DCI) const {
8195  EVT VT = N->getValueType(0);
8196  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
8197  // NaNs. With a NaN input, the order of the operands may change the result.
8198 
8199  SelectionDAG &DAG = DCI.DAG;
8200  SDLoc SL(N);
8201 
8202  SDValue Src0 = N->getOperand(0);
8203  SDValue Src1 = N->getOperand(1);
8204  SDValue Src2 = N->getOperand(2);
8205 
8206  if (isClampZeroToOne(Src0, Src1)) {
8207  // const_a, const_b, x -> clamp is safe in all cases including signaling
8208  // nans.
8209  // FIXME: Should this be allowing -0.0?
8210  return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
8211  }
8212 
8213  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
8214  // handling the no-dx10-clamp case?
8215  if (Subtarget->enableDX10Clamp()) {
8216  // If NaNs are clamped to 0, we are free to reorder the inputs.
8217 
8218  if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8219  std::swap(Src0, Src1);
8220 
8221  if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
8222  std::swap(Src1, Src2);
8223 
8224  if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8225  std::swap(Src0, Src1);
8226 
8227  if (isClampZeroToOne(Src1, Src2))
8228  return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
8229  }
8230 
8231  return SDValue();
8232 }
8233 
8234 SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
8235  DAGCombinerInfo &DCI) const {
8236  SDValue Src0 = N->getOperand(0);
8237  SDValue Src1 = N->getOperand(1);
8238  if (Src0.isUndef() && Src1.isUndef())
8239  return DCI.DAG.getUNDEF(N->getValueType(0));
8240  return SDValue();
8241 }
8242 
8243 SDValue SITargetLowering::performExtractVectorEltCombine(
8244  SDNode *N, DAGCombinerInfo &DCI) const {
8245  SDValue Vec = N->getOperand(0);
8246  SelectionDAG &DAG = DCI.DAG;
8247 
8248  EVT VecVT = Vec.getValueType();
8249  EVT EltVT = VecVT.getVectorElementType();
8250 
8251  if ((Vec.getOpcode() == ISD::FNEG ||
8252  Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
8253  SDLoc SL(N);
8254  EVT EltVT = N->getValueType(0);
8255  SDValue Idx = N->getOperand(1);
8256  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8257  Vec.getOperand(0), Idx);
8258  return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
8259  }
8260 
8261  // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
8262  // =>
8263  // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
8264  // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
8265  // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
8266  if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
8267  SDLoc SL(N);
8268  EVT EltVT = N->getValueType(0);
8269  SDValue Idx = N->getOperand(1);
8270  unsigned Opc = Vec.getOpcode();
8271 
8272  switch(Opc) {
8273  default:
8274  break;
8275  // TODO: Support other binary operations.
8276  case ISD::FADD:
8277  case ISD::FSUB:
8278  case ISD::FMUL:
8279  case ISD::ADD:
8280  case ISD::UMIN:
8281  case ISD::UMAX:
8282  case ISD::SMIN:
8283  case ISD::SMAX:
8284  case ISD::FMAXNUM:
8285  case ISD::FMINNUM:
8286  case ISD::FMAXNUM_IEEE:
8287  case ISD::FMINNUM_IEEE: {
8288  SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8289  Vec.getOperand(0), Idx);
8290  SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8291  Vec.getOperand(1), Idx);
8292 
8293  DCI.AddToWorklist(Elt0.getNode());
8294  DCI.AddToWorklist(Elt1.getNode());
8295  return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
8296  }
8297  }
8298  }
8299 
8300  unsigned VecSize = VecVT.getSizeInBits();
8301  unsigned EltSize = EltVT.getSizeInBits();
8302 
8303  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
8304  // This eliminates a non-constant index and the subsequent movrel or scratch access.
8305  // Sub-dword vectors of 2 dwords or less have a better implementation.
8306  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
8307  // instructions.
8308  if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
8309  !isa<ConstantSDNode>(N->getOperand(1))) {
8310  SDLoc SL(N);
8311  SDValue Idx = N->getOperand(1);
8312  EVT IdxVT = Idx.getValueType();
8313  SDValue V;
8314  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8315  SDValue IC = DAG.getConstant(I, SL, IdxVT);
8316  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8317  if (I == 0)
8318  V = Elt;
8319  else
8320  V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
8321  }
8322  return V;
8323  }
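// (Editor's note: illustrative example, not part of the original source.)
// For a v4i32 vector v, the loop above turns extract_vector_elt(v, idx) into
//   select(idx==3, v3, select(idx==2, v2, select(idx==1, v1, v0)))
// which selects with v_cndmask_b32 instead of movrel or scratch accesses.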
8324 
8325  if (!DCI.isBeforeLegalize())
8326  return SDValue();
8327 
8328  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
8329  // elements. This exposes more load reduction opportunities by replacing
8330  // multiple small extract_vector_elements with a single 32-bit extract.
8331  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
8332  if (isa<MemSDNode>(Vec) &&
8333  EltSize <= 16 &&
8334  EltVT.isByteSized() &&
8335  VecSize > 32 &&
8336  VecSize % 32 == 0 &&
8337  Idx) {
8338  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
8339 
8340  unsigned BitIndex = Idx->getZExtValue() * EltSize;
8341  unsigned EltIdx = BitIndex / 32;
8342  unsigned LeftoverBitIdx = BitIndex % 32;
8343  SDLoc SL(N);
8344 
8345  SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
8346  DCI.AddToWorklist(Cast.getNode());
8347 
8348  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
8349  DAG.getConstant(EltIdx, SL, MVT::i32));
8350  DCI.AddToWorklist(Elt.getNode());
8351  SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
8352  DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
8353  DCI.AddToWorklist(Srl.getNode());
8354 
8355  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
8356  DCI.AddToWorklist(Trunc.getNode());
8357  return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
8358  }
8359 
8360  return SDValue();
8361 }
8362 
8363 SDValue
8364 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
8365  DAGCombinerInfo &DCI) const {
8366  SDValue Vec = N->getOperand(0);
8367  SDValue Idx = N->getOperand(2);
8368  EVT VecVT = Vec.getValueType();
8369  EVT EltVT = VecVT.getVectorElementType();
8370  unsigned VecSize = VecVT.getSizeInBits();
8371  unsigned EltSize = EltVT.getSizeInBits();
8372 
8373  // INSERT_VECTOR_ELT (<n x e>, var-idx)
8374  // => BUILD_VECTOR n x select (e, const-idx)
8375  // This eliminates a non-constant index and the subsequent movrel or scratch access.
8376  // Sub-dword vectors of 2 dwords or less have a better implementation.
8377  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
8378  // instructions.
8379  if (isa<ConstantSDNode>(Idx) ||
8380  VecSize > 256 || (VecSize <= 64 && EltSize < 32))
8381  return SDValue();
8382 
8383  SelectionDAG &DAG = DCI.DAG;
8384  SDLoc SL(N);
8385  SDValue Ins = N->getOperand(1);
8386  EVT IdxVT = Idx.getValueType();
8387 
8388  SmallVector<SDValue, 16> Ops;
8389  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8390  SDValue IC = DAG.getConstant(I, SL, IdxVT);
8391  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8392  SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
8393  Ops.push_back(V);
8394  }
8395 
8396  return DAG.getBuildVector(VecVT, SL, Ops);
8397 }
8398 
8399 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
8400  const SDNode *N0,
8401  const SDNode *N1) const {
8402  EVT VT = N0->getValueType(0);
8403 
8404  // Only do this if we are not trying to support denormals. v_mad_f32 does not
8405  // support denormals ever.
8406  if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
8407  (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
8408  return ISD::FMAD;
8409 
8410  const TargetOptions &Options = DAG.getTarget().Options;
8411  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
8412  (N0->getFlags().hasAllowContract() &&
8413  N1->getFlags().hasAllowContract())) &&
8414  isFMAFasterThanFMulAndFAdd(VT)) {
8415  return ISD::FMA;
8416  }
8417 
8418  return 0;
8419 }
8420 
8421 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
8422  EVT VT,
8423  SDValue N0, SDValue N1, SDValue N2,
8424  bool Signed) {
8425  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
8426  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
8427  SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
8428  return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
8429 }
8430 
8431 SDValue SITargetLowering::performAddCombine(SDNode *N,
8432  DAGCombinerInfo &DCI) const {
8433  SelectionDAG &DAG = DCI.DAG;
8434  EVT VT = N->getValueType(0);
8435  SDLoc SL(N);
8436  SDValue LHS = N->getOperand(0);
8437  SDValue RHS = N->getOperand(1);
8438 
8439  if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
8440  && Subtarget->hasMad64_32() &&
8441  !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
8442  VT.getScalarSizeInBits() <= 64) {
8443  if (LHS.getOpcode() != ISD::MUL)
8444  std::swap(LHS, RHS);
8445 
8446  SDValue MulLHS = LHS.getOperand(0);
8447  SDValue MulRHS = LHS.getOperand(1);
8448  SDValue AddRHS = RHS;
8449 
8450  // TODO: Maybe restrict if SGPR inputs.
8451  if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
8452  numBitsUnsigned(MulRHS, DAG) <= 32) {
8453  MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
8454  MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
8455  AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
8456  return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
8457  }
8458 
8459  if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
8460  MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
8461  MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
8462  AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
8463  return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
8464  }
8465 
8466  return SDValue();
8467  }
8468 
8469  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
8470  return SDValue();
8471 
8472  // add x, zext (setcc) => addcarry x, 0, setcc
8473  // add x, sext (setcc) => subcarry x, 0, setcc
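// (Editor's note: illustrative reasoning, not part of the original source.)
// (zext i1 cc) is 0 or 1, so add x, (zext cc) is addcarry x, 0, cc;
// (sext i1 cc) is 0 or -1, so add x, (sext cc) is x - (cc ? 1 : 0),
// i.e. subcarry x, 0, cc.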
8474  unsigned Opc = LHS.getOpcode();
8475  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
8476  Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
8477  std::swap(RHS, LHS);
8478 
8479  Opc = RHS.getOpcode();
8480  switch (Opc) {
8481  default: break;
8482  case ISD::ZERO_EXTEND:
8483  case ISD::SIGN_EXTEND:
8484  case ISD::ANY_EXTEND: {
8485  auto Cond = RHS.getOperand(0);
8486  if (!isBoolSGPR(Cond))
8487  break;
8488  SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
8489  SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
8490  Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
8491  return DAG.getNode(Opc, SL, VTList, Args);
8492  }
8493  case ISD::ADDCARRY: {
8494  // add x, (addcarry y, 0, cc) => addcarry x, y, cc
8495  auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
8496  if (!C || C->getZExtValue() != 0) break;
8497  SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
8498  return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
8499  }
8500  }
8501  return SDValue();
8502 }
8503 
8504 SDValue SITargetLowering::performSubCombine(SDNode *N,
8505  DAGCombinerInfo &DCI) const {
8506  SelectionDAG &DAG = DCI.DAG;
8507  EVT VT = N->getValueType(0);
8508 
8509  if (VT != MVT::i32)
8510  return SDValue();
8511 
8512  SDLoc SL(N);
8513  SDValue LHS = N->getOperand(0);
8514  SDValue RHS = N->getOperand(1);
8515 
8516  unsigned Opc = LHS.getOpcode();
8517  if (Opc != ISD::SUBCARRY)
8518  std::swap(RHS, LHS);
8519 
8520  if (LHS.getOpcode() == ISD::SUBCARRY) {
8521  // sub (subcarry x, 0, cc), y => subcarry x, y, cc
8522  auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
8523  if (!C || C->getZExtValue() != 0)
8524  return SDValue();
8525  SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
8526  return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
8527  }
8528  return SDValue();
8529 }
8530 
8531 SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
8532  DAGCombinerInfo &DCI) const {
8533 
8534  if (N->getValueType(0) != MVT::i32)
8535  return SDValue();
8536 
8537  auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8538  if (!C || C->getZExtValue() != 0)
8539  return SDValue();
8540 
8541  SelectionDAG &DAG = DCI.DAG;
8542  SDValue LHS = N->getOperand(0);
8543 
8544  // addcarry (add x, y), 0, cc => addcarry x, y, cc
8545  // subcarry (sub x, y), 0, cc => subcarry x, y, cc
8546  unsigned LHSOpc = LHS.getOpcode();
8547  unsigned Opc = N->getOpcode();
8548  if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
8549  (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
8550  SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
8551  return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
8552  }
8553  return SDValue();
8554 }
8555 
8556 SDValue SITargetLowering::performFAddCombine(SDNode *N,
8557  DAGCombinerInfo &DCI) const {
8558  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8559  return SDValue();
8560 
8561  SelectionDAG &DAG = DCI.DAG;
8562  EVT VT = N->getValueType(0);
8563 
8564  SDLoc SL(N);
8565  SDValue LHS = N->getOperand(0);
8566  SDValue RHS = N->getOperand(1);
8567 
8568  // These should really be instruction patterns, but writing patterns with
8569  // source modifiers is a pain.
8570 
8571  // fadd (fadd (a, a), b) -> mad 2.0, a, b
8572  if (LHS.getOpcode() == ISD::FADD) {
8573  SDValue A = LHS.getOperand(0);
8574  if (A == LHS.getOperand(1)) {
8575  unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
8576  if (FusedOp != 0) {
8577  const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8578  return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
8579  }
8580  }
8581  }
8582 
8583  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
8584  if (RHS.getOpcode() == ISD::FADD) {
8585  SDValue A = RHS.getOperand(0);
8586  if (A == RHS.getOperand(1)) {
8587  unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
8588  if (FusedOp != 0) {
8589  const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8590  return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
8591  }
8592  }
8593  }
8594 
8595  return SDValue();
8596 }
8597 
8598 SDValue SITargetLowering::performFSubCombine(SDNode *N,
8599  DAGCombinerInfo &DCI) const {
8600  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8601  return SDValue();
8602 
8603  SelectionDAG &DAG = DCI.DAG;
8604  SDLoc SL(N);
8605  EVT VT = N->getValueType(0);
8606  assert(!VT.isVector());
8607 
8608  // Try to get the fneg to fold into the source modifier. This undoes generic
8609  // DAG combines and folds them into the mad.
8610  //
8611  // Only do this if we are not trying to support denormals. v_mad_f32 does
8612  // not support denormals ever.
8613  SDValue LHS = N->getOperand(0);
8614  SDValue RHS = N->getOperand(1);
8615  if (LHS.getOpcode() == ISD::FADD) {
8616  // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
8617  SDValue A = LHS.getOperand(0);
8618  if (A == LHS.getOperand(1)) {
8619  unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
8620  if (FusedOp != 0){
8621  const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8622  SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
8623 
8624  return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
8625  }
8626  }
8627  }
8628 
8629  if (RHS.getOpcode() == ISD::FADD) {
8630  // (fsub c, (fadd a, a)) -> mad -2.0, a, c
8631 
8632  SDValue A = RHS.getOperand(0);
8633  if (A == RHS.getOperand(1)) {
8634  unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
8635  if (FusedOp != 0){
8636  const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
8637  return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
8638  }
8639  }
8640  }
8641 
8642  return SDValue();
8643 }
8644 
8645 SDValue SITargetLowering::performFMACombine(SDNode *N,
8646  DAGCombinerInfo &DCI) const {
8647  SelectionDAG &DAG = DCI.DAG;
8648  EVT VT = N->getValueType(0);
8649  SDLoc SL(N);
8650 
8651  if (!Subtarget->hasDotInsts() || VT != MVT::f32)
8652  return SDValue();
8653 
8654  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
8655  // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
8656  SDValue Op1 = N->getOperand(0);
8657  SDValue Op2 = N->getOperand(1);
8658  SDValue FMA = N->getOperand(2);
8659 
8660  if (FMA.getOpcode() != ISD::FMA ||
8661  Op1.getOpcode() != ISD::FP_EXTEND ||
8662  Op2.getOpcode() != ISD::FP_EXTEND)
8663  return SDValue();
8664 
8665  // fdot2_f32_f16 always flushes fp32 denormal operands and outputs to zero,
8666  // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
8667  // is sufficient to allow generating fdot2.
8668  const TargetOptions &Options = DAG.getTarget().Options;
8669  if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
8670  (N->getFlags().hasAllowContract() &&
8671  FMA->getFlags().hasAllowContract())) {
8672  Op1 = Op1.getOperand(0);
8673  Op2 = Op2.getOperand(0);
8674  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8675  Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8676  return SDValue();
8677 
8678  SDValue Vec1 = Op1.getOperand(0);
8679  SDValue Idx1 = Op1.getOperand(1);
8680  SDValue Vec2 = Op2.getOperand(0);
8681 
8682  SDValue FMAOp1 = FMA.getOperand(0);
8683  SDValue FMAOp2 = FMA.getOperand(1);
8684  SDValue FMAAcc = FMA.getOperand(2);
8685 
8686  if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
8687  FMAOp2.getOpcode() != ISD::FP_EXTEND)
8688  return SDValue();
8689 
8690  FMAOp1 = FMAOp1.getOperand(0);
8691  FMAOp2 = FMAOp2.getOperand(0);
8692  if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8693  FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8694  return SDValue();
8695 
8696  SDValue Vec3 = FMAOp1.getOperand(0);
8697  SDValue Vec4 = FMAOp2.getOperand(0);
8698  SDValue Idx2 = FMAOp1.getOperand(1);
8699 
8700  if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
8701  // Idx1 and Idx2 cannot be the same.
8702  Idx1 == Idx2)
8703  return SDValue();
8704 
8705  if (Vec1 == Vec2 || Vec3 == Vec4)
8706  return SDValue();
8707 
8708  if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
8709  return SDValue();
8710 
8711  if ((Vec1 == Vec3 && Vec2 == Vec4) ||
8712  (Vec1 == Vec4 && Vec2 == Vec3)) {
8713  return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
8714  DAG.getTargetConstant(0, SL, MVT::i1));
8715  }
8716  }
8717  return SDValue();
8718 }
8719 
8720 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
8721  DAGCombinerInfo &DCI) const {
8722  SelectionDAG &DAG = DCI.DAG;
8723  SDLoc SL(N);
8724 
8725  SDValue LHS = N->getOperand(0);
8726  SDValue RHS = N->getOperand(1);
8727  EVT VT = LHS.getValueType();
8728  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
8729 
8730  auto CRHS = dyn_cast<ConstantSDNode>(RHS);
8731  if (!CRHS) {
8732  CRHS = dyn_cast<ConstantSDNode>(LHS);
8733  if (CRHS) {
8734  std::swap(LHS, RHS);
8735  CC = getSetCCSwappedOperands(CC);
8736  }
8737  }
8738 
8739  if (CRHS) {
8740  if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
8741  isBoolSGPR(LHS.getOperand(0))) {
8742  // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
8743  // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
8744  // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
8745  // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
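// (Editor's note: illustrative reasoning, not part of the original source.)
// (sext i1 cc) is -1 when cc is true and 0 when cc is false, so comparing it
// against -1 or 0 collapses to either cc itself or its negation, e.g.
// (setcc (sext cc), -1, eq) -> cc and (setcc (sext cc), 0, eq) -> xor cc, -1.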
8746  if ((CRHS->isAllOnesValue() &&
8747  (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
8748  (CRHS->isNullValue() &&
8749  (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
8750  return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
8751  DAG.getConstant(-1, SL, MVT::i1));
8752  if ((CRHS->isAllOnesValue() &&
8753  (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
8754  (CRHS->isNullValue() &&
8755  (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
8756  return LHS.getOperand(0);
8757  }
8758 
8759  uint64_t CRHSVal = CRHS->getZExtValue();
8760  if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
8761  LHS.getOpcode() == ISD::SELECT &&
8762  isa<ConstantSDNode>(LHS.getOperand(1)) &&
8763  isa<ConstantSDNode>(LHS.getOperand(2)) &&
8764  LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
8765  isBoolSGPR(LHS.getOperand(0))) {
8766  // Given CT != FT:
8767  // setcc (select cc, CT, CF), CF, eq => xor cc, -1
8768  // setcc (select cc, CT, CF), CF, ne => cc
8769  // setcc (select cc, CT, CF), CT, ne => xor cc, -1
8770  // setcc (select cc, CT, CF), CT, eq => cc
8771  uint64_t CT = LHS.getConstantOperandVal(1);
8772  uint64_t CF = LHS.getConstantOperandVal(2);
8773 
8774  if ((CF == CRHSVal && CC == ISD::SETEQ) ||
8775  (CT == CRHSVal && CC == ISD::SETNE))
8776  return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
8777  DAG.getConstant(-1, SL, MVT::i1));
8778  if ((CF == CRHSVal && CC == ISD::SETNE) ||
8779  (CT == CRHSVal && CC == ISD::SETEQ))
8780  return LHS.getOperand(0);
8781  }
8782  }
8783 
8784  if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
8785  VT != MVT::f16))
8786  return SDValue();
8787 
8788  // Match isinf/isfinite pattern
8789  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
8790  // (fcmp one (fabs x), inf) -> (fp_class x,
8791  // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
8792  if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
8793  const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
8794  if (!CRHS)
8795  return SDValue();
8796 
8797  const APFloat &APF = CRHS->getValueAPF();
8798  if (APF.isInfinity() && !APF.isNegative()) {
8799  const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
8800  SIInstrFlags::N_INFINITY;
8801  const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
8802  SIInstrFlags::P_ZERO |
8803  SIInstrFlags::N_NORMAL |
8804  SIInstrFlags::P_NORMAL |
8805  SIInstrFlags::N_SUBNORMAL |
8806  SIInstrFlags::P_SUBNORMAL;
8807  unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
8808  return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
8809  DAG.getConstant(Mask, SL, MVT::i32));
8810  }
8811  }
8812 
8813  return SDValue();
8814 }
8815 
8816 SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
8817  DAGCombinerInfo &DCI) const {
8818  SelectionDAG &DAG = DCI.DAG;
8819  SDLoc SL(N);
8820  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
8821 
8822  SDValue Src = N->getOperand(0);
8823  SDValue Srl = N->getOperand(0);
8824  if (Srl.getOpcode() == ISD::ZERO_EXTEND)
8825  Srl = Srl.getOperand(0);
8826 
8827  // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
8828  if (Srl.getOpcode() == ISD::SRL) {
8829  // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
8830  // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
8831  // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
8832 
8833  if (const ConstantSDNode *C =
8834  dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
8835  Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
8836  EVT(MVT::i32));
8837 
8838  unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
8839  if (SrcOffset < 32 && SrcOffset % 8 == 0) {
8840  return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
8841  MVT::f32, Srl);
8842  }
8843  }
8844  }
8845 
8846  APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
8847 
8848  KnownBits Known;
8849  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
8850  !DCI.isBeforeLegalizeOps());
8851  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8852  if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
8853  DCI.CommitTargetLoweringOpt(TLO);
8854  }
8855 
8856  return SDValue();
8857 }
8858 
8859 SDValue SITargetLowering::performClampCombine(SDNode *N,
8860  DAGCombinerInfo &DCI) const {
8861  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
8862  if (!CSrc)
8863  return SDValue();
8864 
8865  const APFloat &F = CSrc->getValueAPF();
8866  APFloat Zero = APFloat::getZero(F.getSemantics());
8867  APFloat::cmpResult Cmp0 = F.compare(Zero);
8868  if (Cmp0 == APFloat::cmpLessThan ||
8869  (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
8870  return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
8871  }
8872 
8873  APFloat One(F.getSemantics(), "1.0");
8874  APFloat::cmpResult Cmp1 = F.compare(One);
8875  if (Cmp1 == APFloat::cmpGreaterThan)
8876  return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
8877 
8878  return SDValue(CSrc, 0);
8879 }
8880 
8881 
8882 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
8883  DAGCombinerInfo &DCI) const {
8884  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
8885  return SDValue();
8886 
8887  switch (N->getOpcode()) {
8888  default:
8889  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
8890  case ISD::ADD:
8891  return performAddCombine(N, DCI);
8892  case ISD::SUB:
8893  return performSubCombine(N, DCI);
8894  case ISD::ADDCARRY:
8895  case ISD::SUBCARRY:
8896  return performAddCarrySubCarryCombine(N, DCI);
8897  case ISD::FADD:
8898  return performFAddCombine(N, DCI);
8899  case ISD::FSUB:
8900  return performFSubCombine(N, DCI);
8901  case ISD::SETCC:
8902  return performSetCCCombine(N, DCI);
8903  case ISD::FMAXNUM:
8904  case ISD::FMINNUM:
8905  case ISD::FMAXNUM_IEEE:
8906  case ISD::FMINNUM_IEEE:
8907  case ISD::SMAX:
8908  case ISD::SMIN:
8909  case ISD::UMAX:
8910  case ISD::UMIN:
8911  case AMDGPUISD::FMIN_LEGACY:
8912  case AMDGPUISD::FMAX_LEGACY:
8913  return performMinMaxCombine(N, DCI);
8914  case ISD::FMA:
8915  return performFMACombine(N, DCI);
8916  case ISD::LOAD: {
8917  if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
8918  return Widended;
8919  LLVM_FALLTHROUGH;
8920  }
8921  case ISD::STORE:
8922  case ISD::ATOMIC_LOAD:
8923  case ISD::ATOMIC_STORE:
8924  case ISD::ATOMIC_CMP_SWAP:
8925  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
8926  case ISD::ATOMIC_SWAP:
8927  case ISD::ATOMIC_LOAD_ADD:
8928  case ISD::ATOMIC_LOAD_SUB:
8929  case ISD::ATOMIC_LOAD_AND:
8930  case ISD::ATOMIC_LOAD_OR:
8931  case ISD::ATOMIC_LOAD_XOR:
8932  case ISD::ATOMIC_LOAD_NAND:
8933  case ISD::ATOMIC_LOAD_MIN:
8934  case ISD::ATOMIC_LOAD_MAX:
8935  case ISD::ATOMIC_LOAD_UMIN:
8936  case ISD::ATOMIC_LOAD_UMAX:
8937  case AMDGPUISD::ATOMIC_INC:
8938  case AMDGPUISD::ATOMIC_DEC:
8939  case AMDGPUISD::ATOMIC_LOAD_FADD:
8940  case AMDGPUISD::ATOMIC_LOAD_FMIN:
8941  case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
8942  if (DCI.isBeforeLegalize())
8943  break;
8944  return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
8945  case ISD::AND:
8946  return performAndCombine(N, DCI);
8947  case ISD::OR:
8948  return performOrCombine(N, DCI);
8949  case ISD::XOR:
8950  return performXorCombine(N, DCI);
8951  case ISD::ZERO_EXTEND:
8952  return performZeroExtendCombine(N, DCI);
8953  case AMDGPUISD::FP_CLASS:
8954  return performClassCombine(N, DCI);
8955  case ISD::FCANONICALIZE:
8956  return performFCanonicalizeCombine(N, DCI);
8957  case AMDGPUISD::RCP:
8958  return performRcpCombine(N, DCI);
8959  case AMDGPUISD::FRACT:
8960  case AMDGPUISD::RSQ:
8961  case AMDGPUISD::RCP_LEGACY:
8962  case AMDGPUISD::RSQ_LEGACY:
8963  case AMDGPUISD::RCP_IFLAG:
8964  case AMDGPUISD::RSQ_CLAMP:
8965  case AMDGPUISD::LDEXP: {
8966  SDValue Src = N->getOperand(0);
8967  if (Src.isUndef())
8968  return Src;
8969  break;
8970  }
8971  case ISD::SINT_TO_FP:
8972  case ISD::UINT_TO_FP:
8973  return performUCharToFloatCombine(N, DCI);
8974  case AMDGPUISD::CVT_F32_UBYTE0:
8975  case AMDGPUISD::CVT_F32_UBYTE1:
8976  case AMDGPUISD::CVT_F32_UBYTE2:
8977  case AMDGPUISD::CVT_F32_UBYTE3:
8978  return performCvtF32UByteNCombine(N, DCI);
8979  case AMDGPUISD::FMED3:
8980  return performFMed3Combine(N, DCI);
8981  case AMDGPUISD::CVT_PKRTZ_F16_F32:
8982  return performCvtPkRTZCombine(N, DCI);
8983  case AMDGPUISD::CLAMP:
8984  return performClampCombine(N, DCI);
8985  case ISD::SCALAR_TO_VECTOR: {
8986  SelectionDAG &DAG = DCI.DAG;
8987  EVT VT = N->getValueType(0);
8988 
8989  // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
8990  if (VT == MVT::v2i16 || VT == MVT::v2f16) {
8991  SDLoc SL(N);
8992  SDValue Src = N->getOperand(0);
8993  EVT EltVT = Src.getValueType();
8994  if (EltVT == MVT::f16)
8995  Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
8996 
8997  SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
8998  return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
8999  }
9000 
9001  break;
9002  }
9003  case ISD::EXTRACT_VECTOR_ELT:
9004  return performExtractVectorEltCombine(N, DCI);
9005  case ISD::INSERT_VECTOR_ELT:
9006  return performInsertVectorEltCombine(N, DCI);
9007  }
9008  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
9009 }
9010 
9011 /// Helper function for adjustWritemask
9012 static unsigned SubIdx2Lane(unsigned Idx) {
9013  switch (Idx) {
9014  default: return 0;
9015  case AMDGPU::sub0: return 0;
9016  case AMDGPU::sub1: return 1;
9017  case AMDGPU::sub2: return 2;
9018  case AMDGPU::sub3: return 3;
9019  case AMDGPU::sub4: return 4; // Possible with TFE/LWE
9020  }
9021 }
9022 
9023 /// Adjust the writemask of MIMG instructions
9024 SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
9025  SelectionDAG &DAG) const {
9026  unsigned Opcode = Node->getMachineOpcode();
9027 
9028  // Subtract 1 because the vdata output is not a MachineSDNode operand.
9029  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
9030  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
9031  return Node; // not implemented for D16
9032 
9033  SDNode *Users[5] = { nullptr };
9034  unsigned Lane = 0;
9035  unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
9036  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
9037  unsigned NewDmask = 0;
9038  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
9039  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
9040  bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
9041  Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
9042  unsigned TFCLane = 0;
9043  bool HasChain = Node->getNumValues() > 1;
9044 
9045  if (OldDmask == 0) {
9046  // These are folded out, but in case one slips through, don't assert.
9047  return Node;
9048  }
9049 
9050  unsigned OldBitsSet = countPopulation(OldDmask);
9051  // Work out which is the TFE/LWE lane if that is enabled.
9052  if (UsesTFC) {
9053  TFCLane = OldBitsSet;
9054  }
9055 
9056  // Try to figure out the used register components
9057  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
9058  I != E; ++I) {
9059 
9060  // Don't look at users of the chain.
9061  if (I.getUse().getResNo() != 0)
9062  continue;
9063 
9064  // Abort if we can't understand the usage
9065  if (!I->isMachineOpcode() ||
9066  I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
9067  return Node;
9068 
9069  // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
9070  // Note that subregs are packed, i.e. Lane==0 is the first bit set
9071  // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
9072  // set, etc.
9073  Lane = SubIdx2Lane(I->getConstantOperandVal(1));
9074 
9075  // Check if the use is for the TFE/LWE generated result at VGPRn+1.
9076  if (UsesTFC && Lane == TFCLane) {
9077  Users[Lane] = *I;
9078  } else {
9079  // Set which texture component corresponds to the lane.
9080  unsigned Comp;
9081  for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
9082  Comp = countTrailingZeros(Dmask);
9083  Dmask &= ~(1 << Comp);
9084  }
9085 
9086  // Abort if we have more than one user per component.
9087  if (Users[Lane])
9088  return Node;
9089 
9090  Users[Lane] = *I;
9091  NewDmask |= 1 << Comp;
9092  }
9093  }
9094 
9095  // Don't allow 0 dmask, as hardware assumes one channel enabled.
9096  bool NoChannels = !NewDmask;
9097  if (NoChannels) {
9098  // If the original dmask has one channel - then nothing to do
9099  if (OldBitsSet == 1)
9100  return Node;
9101  // Use an arbitrary dmask - required for the instruction to work
9102  NewDmask = 1;
9103  }
9104  // Abort if there's no change
9105  if (NewDmask == OldDmask)
9106  return Node;
9107 
9108  unsigned BitsSet = countPopulation(NewDmask);
9109 
9110  // Check for TFE or LWE - increase the number of channels by one to account
9111  // for the extra return value
9112  // This will need adjustment for D16 if this is also included in
9113  // adjustWriteMask (this function) but at present D16 are excluded.
9114  unsigned NewChannels = BitsSet + UsesTFC;
9115 
9116  int NewOpcode =
9117  AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
9118  assert(NewOpcode != -1 &&
9119  NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
9120  "failed to find equivalent MIMG op");
9121 
9122  // Adjust the writemask in the node
9123  SmallVector<SDValue, 12> Ops;
9124  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
9125  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
9126  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
9127 
9128  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
9129 
9130  MVT ResultVT = NewChannels == 1 ?
9131  SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
9132  NewChannels == 5 ? 8 : NewChannels);
9133  SDVTList NewVTList = HasChain ?
9134  DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
9135 
9136 
9137  MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
9138  NewVTList, Ops);
9139 
9140  if (HasChain) {
9141  // Update chain.
9142  DAG.setNodeMemRefs(NewNode, Node->memoperands());
9143  DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
9144  }
9145 
9146  if (NewChannels == 1) {
9147  assert(Node->hasNUsesOfValue(1, 0));
9148  SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
9149  SDLoc(Node), Users[Lane]->getValueType(0),
9150  SDValue(NewNode, 0));
9151  DAG.ReplaceAllUsesWith(Users[Lane], Copy);
9152  return nullptr;
9153  }
9154 
9155  // Update the users of the node with the new indices
9156  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
9157  SDNode *User = Users[i];
9158  if (!User) {
9159  // Handle the special case of NoChannels. We set NewDmask to 1 above, but
9160  // Users[0] is still nullptr because channel 0 doesn't really have a use.
9161  if (i || !NoChannels)
9162  continue;
9163  } else {
9164  SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
9165  DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
9166  }
9167 
9168  switch (Idx) {
9169  default: break;
9170  case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
9171  case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
9172  case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
9173  case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
9174  }
9175  }
9176 
9177  DAG.RemoveDeadNode(Node);
9178  return nullptr;
9179 }
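// A standalone sketch of the dmask recomputation performed above, assuming at
// most four result lanes and ignoring the TFE/LWE lane. Lane i corresponds to
// the i-th set bit of the old dmask (subregisters are packed), so the new dmask
// keeps exactly the components whose lanes are still read. recomputeDmask and
// UsedLane are hypothetical names for illustration; __builtin_ctz is the
// GCC/Clang builtin.
#include <cstdint>

static uint32_t recomputeDmask(uint32_t OldDmask, const bool UsedLane[4]) {
  uint32_t NewDmask = 0;
  unsigned Lane = 0;
  for (uint32_t Dmask = OldDmask; Dmask != 0 && Lane < 4; ++Lane) {
    unsigned Comp = __builtin_ctz(Dmask); // texture component of this lane
    Dmask &= Dmask - 1;                   // clear the lowest set bit
    if (UsedLane[Lane])
      NewDmask |= 1u << Comp;
  }
  return NewDmask ? NewDmask : 1u;        // hardware needs >= 1 channel enabled
}
// With OldDmask = 0b1011 (components 0, 1, 3) and only lane 2 used, the result
// is 0b1000: only component 3 remains enabled.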
9180 
9181 static bool isFrameIndexOp(SDValue Op) {
9182  if (Op.getOpcode() == ISD::AssertZext)
9183  Op = Op.getOperand(0);
9184 
9185  return isa<FrameIndexSDNode>(Op);
9186 }
9187 
9188 /// Legalize target independent instructions (e.g. INSERT_SUBREG)
9189 /// with frame index operands.
9190 /// LLVM assumes that inputs to these instructions are registers.
9191 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
9192  SelectionDAG &DAG) const {
9193  if (Node->getOpcode() == ISD::CopyToReg) {
9194  RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
9195  SDValue SrcVal = Node->getOperand(2);
9196 
9197  // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
9198  // to try understanding copies to physical registers.
9199  if (SrcVal.getValueType() == MVT::i1 &&
9200  TargetRegisterInfo::isVirtualRegister(DestReg->getReg())) {
9201  SDLoc SL(Node);
9202  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9203  SDValue VReg = DAG.getRegister(
9204  MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
9205 
9206  SDNode *Glued = Node->getGluedNode();
9207  SDValue ToVReg
9208  = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
9209  SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
9210  SDValue ToResultReg
9211  = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
9212  VReg, ToVReg.getValue(1));
9213  DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
9214  DAG.RemoveDeadNode(Node);
9215  return ToResultReg.getNode();
9216  }
9217  }
9218 
9219  SmallVector<SDValue, 8> Ops;
9220  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
9221  if (!isFrameIndexOp(Node->getOperand(i))) {
9222  Ops.push_back(Node->getOperand(i));
9223  continue;
9224  }
9225 
9226  SDLoc DL(Node);
9227  Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
9228  Node->getOperand(i).getValueType(),
9229  Node->getOperand(i)), 0));
9230  }
9231 
9232  return DAG.UpdateNodeOperands(Node, Ops);
9233 }
9234 
9235 /// Fold the instructions after selecting them.
9236 /// Returns null if users were already updated.
9237 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
9238  SelectionDAG &DAG) const {
9239  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9240  unsigned Opcode = Node->getMachineOpcode();
9241 
9242  if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
9243  !TII->isGather4(Opcode)) {
9244  return adjustWritemask(Node, DAG);
9245  }
9246 
9247  if (Opcode == AMDGPU::INSERT_SUBREG ||
9248  Opcode == AMDGPU::REG_SEQUENCE) {
9249  legalizeTargetIndependentNode(Node, DAG);
9250  return Node;
9251  }
9252 
9253  switch (Opcode) {
9254  case AMDGPU::V_DIV_SCALE_F32:
9255  case AMDGPU::V_DIV_SCALE_F64: {
9256  // Satisfy the operand register constraint when one of the inputs is
9257  // undefined. Ordinarily each undef value will have its own implicit_def of
9258  // a vreg, so force these to use a single register.
9259  SDValue Src0 = Node->getOperand(0);
9260  SDValue Src1 = Node->getOperand(1);
9261  SDValue Src2 = Node->getOperand(2);
9262 
9263  if ((Src0.isMachineOpcode() &&
9264  Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
9265  (Src0 == Src1 || Src0 == Src2))
9266  break;
9267 
9268  MVT VT = Src0.getValueType().getSimpleVT();
9269  const TargetRegisterClass *RC = getRegClassFor(VT);
9270 
9271  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9272  SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
9273 
9274  SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
9275  UndefReg, Src0, SDValue());
9276 
9277  // src0 must be the same register as src1 or src2, even if the value is
9278  // undefined, so make sure we don't violate this constraint.
9279  if (Src0.isMachineOpcode() &&
9280  Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
9281  if (Src1.isMachineOpcode() &&
9282  Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9283  Src0 = Src1;
9284  else if (Src2.isMachineOpcode() &&
9285  Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9286  Src0 = Src2;
9287  else {
9288  assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
9289  Src0 = UndefReg;
9290  Src1 = UndefReg;
9291  }
9292  } else
9293  break;
9294 
9295  SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
9296  for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
9297  Ops.push_back(Node->getOperand(I));
9298 
9299  Ops.push_back(ImpDef.getValue(1));
9300  return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
9301  }
9302  default:
9303  break;
9304  }
9305 
9306  return Node;
9307 }
9308 
9309 /// Assign the register class depending on the number of
9310 /// bits set in the writemask
9311 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9312  SDNode *Node) const {
9313  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9314 
9315  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9316 
9317  if (TII->isVOP3(MI.getOpcode())) {
9318  // Make sure constant bus requirements are respected.
9319  TII->legalizeOperandsVOP3(MRI, MI);
9320  return;
9321  }
9322 
9323  // Replace unused atomics with the no return version.
9324  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
9325  if (NoRetAtomicOp != -1) {
9326  if (!Node->hasAnyUseOfValue(0)) {
9327  MI.setDesc(TII->get(NoRetAtomicOp));
9328  MI.RemoveOperand(0);
9329  return;
9330  }
9331 
9332  // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
9333  // instruction, because the return type of these instructions is a vec2 of
9334  // the memory type, so it can be tied to the input operand.
9335  // This means these instructions always have a use, so we need to add a
9336  // special case to check if the atomic has only one extract_subreg use,
9337  // which itself has no uses.
9338  if ((Node->hasNUsesOfValue(1, 0) &&
9339  Node->use_begin()->isMachineOpcode() &&
9340  Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
9341  !Node->use_begin()->hasAnyUseOfValue(0))) {
9342  unsigned Def = MI.getOperand(0).getReg();
9343 
9344  // Change this into a noret atomic.
9345  MI.setDesc(TII->get(NoRetAtomicOp));
9346  MI.RemoveOperand(0);
9347 
9348  // If we only remove the def operand from the atomic instruction, the
9349  // extract_subreg will be left with a use of a vreg without a def.
9350  // So we need to insert an implicit_def to avoid machine verifier
9351  // errors.
9352  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
9353  TII->get(AMDGPU::IMPLICIT_DEF), Def);
9354  }
9355  return;
9356  }
9357 }
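// A tiny sketch of the decision made above: an atomic is rewritten to its
// no-return form either when its value result is completely unused, or when its
// only use is an EXTRACT_SUBREG whose own result is dead (the
// mubuf_atomic_cmpswap case). The parameter names are assumptions for
// illustration, not SelectionDAG queries.
static bool shouldUseNoRetAtomic(bool ValueHasUses,
                                 bool OnlyUseIsDeadExtractSubreg) {
  return !ValueHasUses || OnlyUseIsDeadExtractSubreg;
}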
9358 
9359 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
9360  uint64_t Val) {
9361  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
9362  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
9363 }
9364 
9365 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
9366  const SDLoc &DL,
9367  SDValue Ptr) const {
9368  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9369 
9370  // Build the half of the subregister with the constants before building the
9371  // full 128-bit register. If we are building multiple resource descriptors,
9372  // this will allow CSEing of the 2-component register.
9373  const SDValue Ops0[] = {
9374  DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
9375  buildSMovImm32(DAG, DL, 0),
9376  DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9377  buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
9378  DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
9379  };
9380 
9381  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
9382  MVT::v2i32, Ops0), 0);
9383 
9384  // Combine the constants and the pointer.
9385  const SDValue Ops1[] = {
9386  DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9387  Ptr,
9388  DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
9389  SubRegHi,
9390  DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
9391  };
9392 
9393  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
9394 }
9395 
9396 /// Return a resource descriptor with the 'Add TID' bit enabled
9397 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
9398 /// of the resource descriptor) to create an offset, which is added to
9399 /// the resource pointer.
9400 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
9401  SDValue Ptr, uint32_t RsrcDword1,
9402  uint64_t RsrcDword2And3) const {
9403  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
9404  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
9405  if (RsrcDword1) {
9406  PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
9407  DAG.getConstant(RsrcDword1, DL, MVT::i32)),
9408  0);
9409  }
9410 
9411  SDValue DataLo = buildSMovImm32(DAG, DL,
9412  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
9413  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
9414 
9415  const SDValue Ops[] = {
9416  DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9417  PtrLo,
9418  DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9419  PtrHi,
9420  DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
9421  DataLo,
9422  DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
9423  DataHi,
9424  DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
9425  };
9426 
9427  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
9428 }
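// A host-side sketch of the 128-bit descriptor laid out by the REG_SEQUENCE
// above, modeled as four 32-bit dwords. RsrcDword1 (e.g. the stride for the
// 'Add TID' mode) is OR'd into the high half of the base pointer; the low and
// high halves of RsrcDword2And3 fill the remaining two dwords. packRsrc is a
// hypothetical helper for exposition only.
#include <array>
#include <cstdint>

static std::array<uint32_t, 4> packRsrc(uint64_t Ptr, uint32_t RsrcDword1,
                                        uint64_t RsrcDword2And3) {
  return {static_cast<uint32_t>(Ptr),                     // sub0: pointer low
          static_cast<uint32_t>(Ptr >> 32) | RsrcDword1,  // sub1: pointer high | dword1
          static_cast<uint32_t>(RsrcDword2And3),          // sub2: descriptor low
          static_cast<uint32_t>(RsrcDword2And3 >> 32)};   // sub3: descriptor high
}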
9429 
9430 //===----------------------------------------------------------------------===//
9431 // SI Inline Assembly Support
9432 //===----------------------------------------------------------------------===//
9433 
9434 std::pair<unsigned, const TargetRegisterClass *>
9435 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
9436  StringRef Constraint,
9437  MVT VT) const {
9438  const TargetRegisterClass *RC = nullptr;
9439  if (Constraint.size() == 1) {
9440  switch (Constraint[0]) {
9441  default:
9442  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9443  case 's':
9444  case 'r':
9445  switch (VT.getSizeInBits()) {
9446  default:
9447  return std::make_pair(0U, nullptr);
9448  case 32:
9449  case 16:
9450  RC = &AMDGPU::SReg_32_XM0RegClass;
9451  break;
9452  case 64:
9453  RC = &AMDGPU::SGPR_64RegClass;
9454  break;
9455  case 128:
9456  RC = &AMDGPU::SReg_128RegClass;
9457  break;
9458  case 256:
9459  RC = &AMDGPU::SReg_256RegClass;
9460  break;
9461  case 512:
9462  RC = &AMDGPU::SReg_512RegClass;
9463  break;
9464  }
9465  break;
9466  case 'v':
9467  switch (VT.getSizeInBits()) {
9468  default:
9469  return std::make_pair(0U, nullptr);
9470  case 32:
9471  case 16:
9472  RC = &AMDGPU::VGPR_32RegClass;
9473  break;
9474  case 64:
9475  RC = &AMDGPU::VReg_64RegClass;
9476  break;
9477  case 96:
9478  RC = &AMDGPU::VReg_96RegClass;
9479  break;
9480  case 128:
9481  RC = &AMDGPU::VReg_128RegClass;
9482  break;
9483  case 256:
9484  RC = &AMDGPU::VReg_256RegClass;
9485  break;
9486  case 512:
9487  RC = &AMDGPU::VReg_512RegClass;
9488  break;
9489  }
9490  break;
9491  }
9492  // We actually support i128, i16 and f16 as inline parameters
9493  // even if they are not reported as legal
9494  if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
9495  VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
9496  return std::make_pair(0U, RC);
9497  }
9498 
9499  if (Constraint.size() > 1) {
9500  if (Constraint[1] == 'v') {
9501  RC = &AMDGPU::VGPR_32RegClass;
9502  } else if (Constraint[1] == 's') {
9503  RC = &AMDGPU::SGPR_32RegClass;
9504  }
9505 
9506  if (RC) {
9507  uint32_t Idx;
9508  bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
9509  if (!Failed && Idx < RC->getNumRegs())
9510  return std::make_pair(RC->getRegister(Idx), RC);
9511  }
9512  }
9513  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9514 }
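// A minimal usage sketch, assuming clang's GCC-style inline assembly for the
// amdgcn target: the single-letter constraints handled above pick a register
// class, "v" for a 32-bit VGPR and "s" for an SGPR. copyThroughVGPR is a
// hypothetical example function, not part of this file.
static int copyThroughVGPR(int X) {
  int R;
  __asm__("v_mov_b32 %0, %1" : "=v"(R) : "v"(X)); // "v" requests VGPR_32
  return R;
}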
9515 
9516 SITargetLowering::ConstraintType
9517 SITargetLowering::getConstraintType(StringRef Constraint) const {
9518  if (Constraint.size() == 1) {
9519  switch (Constraint[0]) {
9520  default: break;
9521  case 's':
9522  case 'v':
9523  return C_RegisterClass;
9524  }
9525  }
9526  return TargetLowering::getConstraintType(Constraint);
9527 }
9528 
9529 // Figure out which registers should be reserved for stack access. Only after
9530 // the function is legalized do we know all of the non-spill stack objects or if
9531 // calls are present.
9532 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
9533  MachineRegisterInfo &MRI = MF.getRegInfo();
9534  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9535  const MachineFrameInfo &MFI = MF.getFrameInfo();
9536  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
9537 
9538  if (Info->isEntryFunction()) {
9539  // Callable functions have fixed registers used for stack access.
9540  reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
9541  }
9542 
9543  // We have to assume the SP is needed in case there are calls in the function
9544  // during lowering. Calls are only detected after the function is
9545  // lowered. We're about to reserve registers, so only set up the SP if we
9546  // actually expect to need it.
9547  bool NeedSP = !Info->isEntryFunction() ||
9548  MFI.hasVarSizedObjects() ||
9549  MFI.hasCalls();
9550 
9551  if (NeedSP) {
9552  unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
9553  Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
9554 
9555  assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
9556  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
9557  Info->getStackPtrOffsetReg()));
9558  MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
9559  }
9560 
9561  MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
9562  MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
9563  MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
9564  Info->getScratchWaveOffsetReg());
9565 
9566  Info->limitOccupancy(MF);
9567 
9568  TargetLoweringBase::finalizeLowering(MF);
9569 }
9570 
9571 void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
9572  KnownBits &Known,
9573  const APInt &DemandedElts,
9574  const SelectionDAG &DAG,
9575  unsigned Depth) const {
9576  TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
9577  DAG, Depth);
9578 
9579  if (getSubtarget()->enableHugePrivateBuffer())
9580  return;
9581 
9582  // Technically it may be possible to have a dispatch with a single workitem
9583  // that uses the full private memory size, but that's not really useful. We
9584  // can't use vaddr in MUBUF instructions if we don't know the address
9585  // calculation won't overflow, so assume the sign bit is never set.
9586  Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
9587 }
9588 
9589 LLVM_ATTRIBUTE_UNUSED
9590 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
9591  assert(N->getOpcode() == ISD::CopyFromReg);
9592  do {
9593  // Follow the chain until we find an INLINEASM node.
9594  N = N->getOperand(0).getNode();
9595  if (N->getOpcode() == ISD::INLINEASM)
9596  return true;
9597  } while (N->getOpcode() == ISD::CopyFromReg);
9598  return false;
9599 }
9600 
9601 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
9602  FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *KDA) const
9603 {
9604  switch (N->getOpcode()) {
9605  case ISD::CopyFromReg:
9606  {
9607  const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
9608  const MachineFunction * MF = FLI->MF;
9609  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
9610  const MachineRegisterInfo &MRI = MF->getRegInfo();
9611  const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
9612  unsigned Reg = R->getReg();
9613  if (TRI.isPhysicalRegister(Reg))
9614  return !TRI.isSGPRReg(MRI, Reg);
9615 
9616  if (MRI.isLiveIn(Reg)) {
9617  // workitem.id.x workitem.id.y workitem.id.z
9618  // Any VGPR formal argument is also considered divergent
9619  if (!TRI.isSGPRReg(MRI, Reg))
9620  return true;
9621  // Formal arguments of non-entry functions
9622  // are conservatively considered divergent
9623  else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
9624  return true;
9625  return false;
9626  }
9627  const Value *V = FLI->getValueFromVirtualReg(Reg);
9628  if (V)
9629  return KDA->isDivergent(V);
9630  assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
9631  return !TRI.isSGPRReg(MRI, Reg);
9632  }
9633  break;
9634  case ISD::LOAD: {
9635  const LoadSDNode *L = cast<LoadSDNode>(N);
9636  unsigned AS = L->getAddressSpace();
9637  // A flat load may access private memory.
9638  return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
9639  } break;
9640  case ISD::CALLSEQ_END:
9641  return true;
9642  break;
9643  case ISD::INTRINSIC_WO_CHAIN:
9644  {
9645 
9646  }
9647  return AMDGPU::isIntrinsicSourceOfDivergence(
9648  cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
9649  case ISD::INTRINSIC_W_CHAIN:
9650  return AMDGPU::isIntrinsicSourceOfDivergence(
9651  cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
9652  // In some cases intrinsics that are a source of divergence have been
9653  // lowered to AMDGPUISD so we also need to check those too.
9654  case AMDGPUISD::INTERP_MOV:
9655  case AMDGPUISD::INTERP_P1:
9656  case AMDGPUISD::INTERP_P2:
9657  return true;
9658  }
9659  return false;
9660 }
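// The address-space rule used for ISD::LOAD above, restated as a standalone
// predicate. The numeric values mirror AMDGPUAS in this release (0 for flat,
// 5 for private); treat them as assumptions rather than a stable ABI.
static bool loadMayBeDivergent(unsigned AddrSpace) {
  const unsigned FlatAS = 0, PrivateAS = 5;
  // A flat access may reach private (per-lane) memory, so both are divergent.
  return AddrSpace == PrivateAS || AddrSpace == FlatAS;
}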
9661 
9662 bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
9663  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
9664  case MVT::f32:
9665  return Subtarget->hasFP32Denormals();
9666  case MVT::f64:
9667  return Subtarget->hasFP64Denormals();
9668  case MVT::f16:
9669  return Subtarget->hasFP16Denormals();
9670  default:
9671  return false;
9672  }
9673 }
9674 
9675 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
9676  const SelectionDAG &DAG,
9677  bool SNaN,
9678  unsigned Depth) const {
9679  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
9680  if (Subtarget->enableDX10Clamp())
9681  return true; // Clamped to 0.
9682  return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
9683  }
9684 
9685  return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
9686  SNaN, Depth);
9687 }
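// A one-line restatement of the AMDGPUISD::CLAMP reasoning above: with the
// DX10 clamp mode, clamp(NaN) is defined as 0.0, so the result is never NaN
// regardless of the operand; otherwise the question reduces to whether the
// operand itself can be NaN. Hypothetical helper for illustration.
static bool clampResultNeverNaN(bool DX10Clamp, bool OperandNeverNaN) {
  return DX10Clamp || OperandNeverNaN;
}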
bool hasBCNT(unsigned Size) const
bool enableIEEEBit(const MachineFunction &MF) const
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, unsigned Alignment=0, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI)
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
void setWorkItemIDX(ArgDescriptor Arg)
uint64_t CallInst * C
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:571
bool isInvariant() const
constexpr bool isUInt< 32 >(uint64_t x)
Definition: MathExtras.h:349
X = FP_ROUND(Y, TRUNC) - Rounding &#39;Y&#39; from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:538
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set, or Regs.size() if they are all allocated.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array...
static MVT getIntegerVT(unsigned BitWidth)
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
const MachineInstrBuilder & add(const MachineOperand &MO) const
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:552
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:877
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:61
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:198
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:594
BitVector & set()
Definition: BitVector.h:398
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
Interface definition for SIRegisterInfo.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:55
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
const SDValue & getOffset() const
bool isUndef() const
unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const
Return the end register initially reserved for the scratch buffer in case spilling is needed...
C - The default llvm calling convention, compatible with C.
Definition: CallingConv.h:35
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
const GlobalValue * getGlobal() const
bool hasApertureRegs() const
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isDivergent(const Value *V) const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:30
LLVMContext & Context
Diagnostic information for unsupported feature in backend.
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG&#39;s MachineFunction.
AMDGPU specific subclass of TargetSubtarget.
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:250
bool isPSInputAllocated(unsigned Index) const
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
bool useDS128() const
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond)
Helper function to make it easier to build SetCC&#39;s if you just have an ISD::CondCode instead of an SD...
Definition: SelectionDAG.h:937
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it&#39;s not CSE&#39;d)...
Definition: SelectionDAG.h:836
const TargetRegisterClass * getRegClass(unsigned Reg) const
Return the register class of the specified virtual register.
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR (an vector value) starting with the ...
Definition: ISDOpcodes.h:358
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:140
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:650
This class represents lattice values for constants.
Definition: AllocatorList.h:24
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types...
value_iterator value_end() const
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise. ...
static MVT getVectorVT(MVT VT, unsigned NumElements)
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
uint64_t getDefaultRsrcDataFormat() const
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:367
iterator begin() const
begin/end - Return all of the registers in this class.
static bool isBoolSGPR(SDValue V)
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:260
bool isSized(SmallPtrSetImpl< Type *> *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:265
#define LLVM_FALLTHROUGH
Definition: Compiler.h:86
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
static void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader)
2: 32-bit floating point type
Definition: Type.h:59
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool isVector() const
Return true if this is a vector value type.
bool hasStackObjects() const
Return true if there are any stack objects in this function.
bool isAllocated(unsigned Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
const SDValue & getBasePtr() const
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:289
unsigned getRegister(unsigned i) const
Return the specified register in the class.
bool isNegative() const
Return true if the value is negative.
bool hasFlatGlobalInsts() const
bool canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const override
Returns if it&#39;s reasonable to merge stores to MemVT size.
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
bool supportsMinMaxDenormModes() const
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void push_back(const T &Elt)
Definition: SmallVector.h:218
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:383
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
unsigned addLiveIn(unsigned PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:164
unsigned getReg() const
getReg - Returns the register number.
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE size_t size() const
size - Get the string size.
Definition: StringRef.h:138
void setIsUndef(bool Val=true)
bool hasFmaMixInsts() const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const SDValue & getValue() const
unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI)
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
SDVTList getVTList() const
This class represents a function call, abstracting a target machine&#39;s calling convention.
unsigned Reg
unsigned getSubReg() const
void setHasFloatingPointExceptions(bool FPExceptions=true)
Tells the code generator that this target supports floating point exceptions and cares about preservi...
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:253
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space...
Definition: Type.cpp:630
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change...
const GCNSubtarget * getSubtarget() const
bool hasTrigReducedRange() const
Address space for 32-bit constant memory.
Definition: AMDGPU.h:263
float BitsToFloat(uint32_t Bits)
This function takes a 32-bit integer and returns the bit equivalent float.
Definition: MathExtras.h:581
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
unsigned getVectorNumElements() const
const SDValue & getChain() const
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1025
Function Alias Analysis Results
Address space for private memory.
Definition: AMDGPU.h:261
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:705
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
Definition: MachineInstr.h:830
const SIInstrInfo * getInstrInfo() const override
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
unsigned getAlignment() const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned Dim)
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:802
unsigned second
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
static uint32_t Concat[]
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain)
virtual const TargetRegisterClass * getRegClassFor(MVT VT) const
Return the register class that should be used for the specified value type.
STATISTIC(NumFunctions, "Total number of functions")
unsigned const TargetRegisterInfo * TRI
A debug info location.
Definition: DebugLoc.h:34
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:141
F(f)
void markPSInputEnabled(unsigned Index)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select&#39;s if you just have operands and don&#39;t want to check...
Definition: SelectionDAG.h:950
const fltSemantics & getSemantics() const
Definition: APFloat.h:1155
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:855
1: 16-bit floating point type
Definition: Type.h:58
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is &#39;desirable&#39; to us...
[US]{MIN/MAX} - Binary minimum or maximum or signed or unsigned integers.
Definition: ISDOpcodes.h:384
const SDNodeFlags getFlags() const
MachineFunction & getMachineFunction() const
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:230
iv Induction Variable Users
Definition: IVUsers.cpp:52
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:466
LLVM_READONLY int getAtomicNoRetOp(uint16_t Opcode)
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:212
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
CLAMP value between 0.0 and 1.0.
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
bool isInlineConstant(const APInt &Imm) const
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1 at the ...
Definition: ISDOpcodes.h:353
bool hasFastFMAF32() const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
unsigned getValueSizeInBits() const
Returns the size of the value in bits.
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:435
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
const DebugLoc & getDebugLoc() const
LLVM_READONLY const MIMGLZMappingInfo * getMIMGLZMappingInfo(unsigned L)
const SIRegisterInfo & getRegisterInfo() const
Definition: SIInstrInfo.h:165
bool hasMad64_32() const
static void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
return AArch64::GPR64RegClass contains(Reg)
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool hasVOP3PInsts() const
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:159
bool isMemLoc() const
static bool isFlatGlobalAddrSpace(unsigned AS)
unsigned getAddressSpace() const
Return the address space for the associated pointer.
bool hasFP64Denormals() const
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1135
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:136
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations...
Definition: ISDOpcodes.h:456
unsigned countTrailingZeros() const
Count the number of trailing zero bits.
Definition: APInt.h:1632
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
void setPrivateSegmentWaveByteOffset(unsigned Reg)
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:524
Calling convention used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:192
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx)
Sets stack object index for Dim&#39;s work group ID to ObjectIdx.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:718
bool hasOneUse() const
Return true if there is exactly one use of this node.
static unsigned findFirstFreeSGPR(CCState &CCInfo)
A description of a memory reference used in the backend.
unsigned DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
bool isSigned() const
Definition: InstrTypes.h:816
bool useVGPRIndexMode(bool UserEnable) const
Address space for constant memory (VTX2)
Definition: AMDGPU.h:259
Calling convention used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:198
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1447
const HexagonInstrInfo * TII
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
Definition: ArrayRef.h:451
bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const
Shift and rotation operations.
Definition: ISDOpcodes.h:410
unsigned getNumOperands() const
Retuns the total number of operands.
Definition: MachineInstr.h:412
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:463
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
void setHasNonSpillStackObjects(bool StackObject=true)
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
unsigned addDispatchID(const SIRegisterInfo &TRI)
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:455
void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx)
Sets stack object index for Dim&#39;s work item ID to ObjectIdx.
bool enableDX10Clamp() const
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
The returned value is undefined.
Definition: MathExtras.h:46
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s), MachineInstr opcode, and operands.
bool isMemOpUniform(const SDNode *N) const
void eraseFromParent()
Unlink &#39;this&#39; from the containing basic block and delete it.
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
CopyToReg - This node has three operands: a chain, a register number to set to this value...
Definition: ISDOpcodes.h:170
unsigned SubReg
ArrayRef< MachineMemOperand * > memoperands() const
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:197
op_iterator op_end() const
uint64_t getConstantOperandVal(unsigned i) const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
static ArgDescriptor allocateSGPR64Input(CCState &CCInfo)
SimpleValueType SimpleTy
unsigned getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:304
InstrTy * getInstruction() const
Definition: CallSite.h:92
static MVT memVTFromAggregate(Type *Ty)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
The memory access is dereferenceable (i.e., doesn&#39;t trap).
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted...
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN, ptr, amt) For double-word atomic operations: ValLo, ValHi, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amtLo, amtHi) ValLo, ValHi, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN, ptr, amtLo, amtHi) These correspond to the atomicrmw instruction.
Definition: ISDOpcodes.h:810
unsigned Intr
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:409
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:460
bool isTrapHandlerEnabled() const
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:401
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE R Default(T Value)
Definition: StringSwitch.h:203
unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const
This is an SDNode representing atomic operations.
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
unsigned getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
uint64_t getNumElements() const
Definition: DerivedTypes.h:359
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s, unsigned base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
LocInfo getLocInfo() const
bool isSGPRClass(const TargetRegisterClass *RC) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
This file implements a class to represent arbitrary precision integral constant values and operations...
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:667
This represents a list of ValueType&#39;s that has been intern&#39;d by a SelectionDAG.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, unsigned Alignment=0, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override
SmallVector< ISD::InputArg, 32 > Ins
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
unsigned getSizeInBits() const
MachineInstr * getVRegDef(unsigned Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
Fast - This calling convention attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:43
unsigned getScalarSizeInBits() const
Definition: ValueTypes.h:298
static void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
Class to represent function types.
Definition: DerivedTypes.h:103
bool hasDotInsts() const
unsigned getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:292
unsigned getNextStackOffset() const
getNextStackOffset - Return the next stack offset such that all stack slots satisfy their alignment r...
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1575
bool isInfinity() const
Definition: APFloat.h:1144
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:398
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:478
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose...
void setStackPtrOffsetReg(unsigned Reg)
void limitOccupancy(const MachineFunction &MF)
Definition: Lint.cpp:84
void clearKillFlags(unsigned Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
bool useFlatForGlobal() const
SDValue getRegisterMask(const uint32_t *RegMask)
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, const GCNSubtarget *Subtarget, uint32_t Align)
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:402
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:201
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
SmallVector< ISD::OutputArg, 32 > Outs
static bool vectorEltWillFoldAway(SDValue Op)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:221
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always beneficiates from combining into FMA for a given value type...
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
Definition: SelectionDAG.h:852
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
cmpResult
IEEE-754R 5.11: Floating Point Comparison Relations.
Definition: APFloat.h:166
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue is known to never be NaN.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, SDValue *GLC, SDValue *SLC)
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out...
Definition: ISDOpcodes.h:959
op_iterator op_begin() const
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors...
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:598
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:161
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:576
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:747
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:25
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value *> &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock *> &Exits) const override
Insert explicit copies in entry and exit blocks.
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
unsigned getUndefRegState(bool B)
void markPSInputAllocated(unsigned Index)
Value * Callee
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:151
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *bb=nullptr)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
Calling convention used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (ve...
Definition: CallingConv.h:189
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, SDValue *LWE, bool &IsTexFail)
unsigned getSrcAddressSpace() const
Value * getOperand(unsigned i) const
Definition: User.h:170
Analysis containing CSE Info
Definition: CSEInfo.cpp:21
Class to represent pointers.
Definition: DerivedTypes.h:467
unsigned getByValSize() const
UNDEF - An undefined node.
Definition: ISDOpcodes.h:178
This class is used to represent ISD::STORE nodes.
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:524
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Flag
These should be considered private to the implementation of the MCInstrDesc class.
Definition: MCInstrDesc.h:118
bool hasAllowReciprocal() const
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a vector with the specified, possibly variable...
Definition: ISDOpcodes.h:327
TargetInstrInfo - Interface to description of machine instruction set.
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
const Value * getValue() const
Return the base address of the memory access.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Get a value with high bits set.
Definition: APInt.h:636
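A self-contained worked example of APInt::getHighBitsSet (the values are chosen purely for illustration):
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  // A 32-bit value with its top 5 bits set, i.e. 0xF8000000.
  APInt HighBits = APInt::getHighBitsSet(32, 5);
  assert(HighBits == APInt(32, 0xF8000000ULL));
  assert(HighBits.countPopulation() == 5);
  return 0;
}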
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:229
bool hasUnalignedBufferAccess() const
11: Arbitrary bit width integers
Definition: Type.h:71
bool hasAllowContract() const
The memory access is volatile.
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition: MathExtras.h:610
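A small self-contained example of MinAlign; the numbers are illustrative:
#include "llvm/Support/MathExtras.h"

int main() {
  // A 16-byte-aligned base plus a 4-byte offset is only guaranteed to be
  // 4-byte aligned.
  static_assert(llvm::MinAlign(16, 4) == 4, "combined alignment");
  // A zero offset keeps the base alignment.
  static_assert(llvm::MinAlign(8, 0) == 8, "base alignment preserved");
  return 0;
}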
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
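A minimal sketch of the builder interface; BuildMI has several overloads, and this sketch uses the MachineBasicBlock/iterator form. TII, MBB, the insertion point I, DL, and the registers DstReg/SrcReg are assumed to be in scope, as they typically are inside a custom inserter:
// Emit "DstReg = S_MOV_B32 SrcReg" before the iterator I; addReg() appends
// a register operand to the newly created MachineInstr.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
    .addReg(SrcReg);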
bool hasFP32Denormals() const
const SDValue & getBasePtr() const
static bool fp16SrcZerosHighBits(unsigned Opc)
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:423
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:166
void addLiveIn(MCPhysReg PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fuse-fp-ops=xxx option.
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:629
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
CodeGenOpt::Level getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
bool isNegative() const
Definition: APFloat.h:1147
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
constexpr bool isUInt< 8 >(uint64_t x)
Definition: MathExtras.h:343
const MachineRegisterInfo * MRI
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
std::size_t countTrailingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count number of 0's from the least significant bit to the most stopping at the first 1...
Definition: MathExtras.h:120
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool shouldAssumeDSOLocal(const Module &M, const GlobalValue *GV) const
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:429
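A self-contained example of isPowerOf2_32:
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::isPowerOf2_32(64));   // 64 == 1 << 6
  assert(!llvm::isPowerOf2_32(0));   // zero is explicitly excluded
  assert(!llvm::isPowerOf2_32(48));  // more than one bit set
  return 0;
}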
Machine Value Type.
unsigned reservedPrivateSegmentWaveByteOffsetReg(const MachineFunction &MF) const
Return the end register initially reserved for the scratch wave offset in case spilling is needed...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:69
unsigned getStackAlignment() const
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
Simple binary floating point operators.
Definition: ISDOpcodes.h:283
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void setTargetDAGCombine(ISD::NodeType NT)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
bool isMachineOpcode() const
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:273
Address space for flat memory.
Definition: AMDGPU.h:255
unsigned getScalarSizeInBits() const
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:149
bool isNaN() const
Definition: APFloat.h:1145
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
unsigned getReturnAddressReg(const MachineFunction &MF) const
bool hasDLInsts() const
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Definition: SmallVector.h:129
const SDValue & getOperand(unsigned Num) const
bool hasFPExceptions() const
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:934
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL...
Definition: ISDOpcodes.h:332
This file contains the declarations for the subclasses of Constant, which represent the different fla...
const MCPhysReg * getCalleeSavedRegsViaCopy(const MachineFunction *MF) const
bool hasBitPreservingFPLogic(EVT VT) const override
Return true if it is safe to transform an integer-domain bitwise operation into the equivalent floati...
std::pair< const ArgDescriptor *, const TargetRegisterClass * > getPreloadedValue(PreloadedValue Value) const
const GlobalValue * getGlobal() const
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:247
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
bool has16BitInsts() const
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const override
Returns the target specific optimal type for load and store operations as a result of memset...
bool isEntryFunctionCC(CallingConv::ID CC)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
Definition: SelectionDAG.h:824
static bool isUniformMMO(const MachineMemOperand *MMO)
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
virtual void computeKnownBitsForFrameIndex(const SDValue FIOp, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
const AMDGPUBufferPseudoSourceValue * getBufferPSV(const SIInstrInfo &TII, const Value *BufferRsrc)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
SI DAG Lowering interface definition.
unsigned getDestAddressSpace() const
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
static ArgDescriptor createRegister(unsigned Reg)
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This class provides iterator support for SDUse operands that use a specific SDNode.
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side...
Address space for local memory.
Definition: AMDGPU.h:260
unsigned getMachineOpcode() const
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, const SDLoc &DL, LLVMContext &Context)
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:646
unsigned addQueuePtr(const SIRegisterInfo &TRI)
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI, const TargetRegisterClass *VecRC)
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:767
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
const APInt & getAPIntValue() const
Generation getGeneration() const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline...
const Triple & getTargetTriple() const
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:57
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:495
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:770
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:521
SDNode * getGluedNode() const
If this node has a glue operand, return the node to which the glue operand points.
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
self_iterator getIterator()
Definition: ilist_node.h:82
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align=0, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, unsigned Size=0)
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y)...
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:281
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
AMDGPUFunctionArgInfo & getArgInfo()
The AMDGPU TargetMachine interface definition for hw codegen targets.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:49
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:416
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo...
Definition: ISDOpcodes.h:796
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function. ...
Definition: Function.cpp:193
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:556
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
const Pass * getPass() const
Definition: SelectionDAG.h:399
Extended Value Type.
Definition: ValueTypes.h:34
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:256
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1415
Calling convention used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:195
void setScratchWaveOffsetReg(unsigned Reg)
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
This structure contains all information that is necessary for lowering calls.
size_t size() const
Definition: SmallVector.h:53
bool hasUnalignedScratchAccess() const
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:220
bool isVolatile() const
const TargetMachine & getTargetMachine() const
This class contains a discriminated union of information about pointers in memory operands...
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode...
unsigned getNumOperands() const
Return the number of values used by this operation.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
TrapHandlerAbi getTrapHandlerAbi() const
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
constexpr char NumSGPRs[]
Key for Kernel::CodeProps::Metadata::mNumSGPRs.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value...
unsigned addDispatchPtr(const SIRegisterInfo &TRI)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, unsigned Alignment=0, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands...
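A hedged sketch of getLoad inside a lowering routine; DAG, DL, Chain, and a pointer value BasePtr are assumed to be in scope, and the MachinePointerInfo here is deliberately generic:
// The load produces two results: value 0 is the loaded data, value 1 is the
// updated chain.
SDValue Load = DAG.getLoad(MVT::i32, DL, Chain, BasePtr,
                           MachinePointerInfo(), /*Alignment=*/4);
SDValue Loaded   = Load.getValue(0);
SDValue NewChain = Load.getValue(1);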
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
bool hasUnpackedD16VMem() const
unsigned getAddressSpace() const
std::enable_if< std::numeric_limits< T >::is_signed, bool >::type getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:497
The memory access writes data.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
bool getScalarizeGlobalBehavior() const
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:51
uint64_t getNullPointerValue(unsigned AddrSpace) const
Get the integer value of a null pointer in the given address space.
static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset, bool UseGPRIdxMode, bool IsIndirectSrc)
const unsigned MaxDepth
bool hasFlatAddressSpace() const
unsigned getWavefrontSize() const
bool hasAddr64() const
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, unsigned Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool isUndef(ArrayRef< int > Mask)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
TokenFactor - This node takes multiple tokens as input and produces a single token result...
Definition: ISDOpcodes.h:50
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
EVT is not used in-tree, but is used by out-of-tree targets.
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:404
bool enableHugePrivateBuffer() const
Iterator for intrusive lists based on ilist_node.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
CCState - This class holds information needed while lowering arguments and return values...
void setNoUnsignedWrap(bool b)
unsigned countPopulation(T Value)
Count the number of set bits in a value.
Definition: MathExtras.h:520
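A self-contained example of the bit-counting helpers:
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::countPopulation(0xF0F0u) == 8);    // eight bits set
  assert(llvm::countTrailingZeros(0x0008u) == 3); // lowest set bit is bit 3
  return 0;
}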
SDValue scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
static ArgDescriptor allocateSGPR32Input(CCState &CCInfo)
static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V)
bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance, because half-precision floating-point numbers are implicitly extended to float-precision) for an FMA instruction.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void setDesc(const MCInstrDesc &tid)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one...
This is the shared class of boolean and integer constants.
Definition: Constants.h:84
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:319
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:339
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:265
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:148
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:213
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:222
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
bool hasMadMixInsts() const
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:575
MachineOperand class - Representation of each machine instruction operand.
value_iterator value_begin() const
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847
unsigned getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses...
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:734
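A sketch of getBuildVector, assuming DAG, DL, and two i32 SDValues Lo and Hi are already available:
// Pack two 32-bit values into a single v2i32 BUILD_VECTOR node.
SDValue Parts[] = { Lo, Hi };
SDValue Vec = DAG.getBuildVector(MVT::v2i32, DL, Parts);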
bool hasFFBL() const
const DebugLoc & getDebugLoc() const
CCValAssign - Represent assignment of one arg/retval to a location.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo)
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:730
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:644
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on two values, following the IEEE-754 2008 definition.
Definition: ISDOpcodes.h:600
const DataFlowGraph & G
Definition: RDFGraph.cpp:211
An SDNode that represents everything that will be needed to construct a MachineInstr.
bool hasMin3Max3_16() const
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:413
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:581
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain)
This is an abstract virtual class for memory operations.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_READNONE bool isKernel(CallingConv::ID CC)
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
bool isDivergent() const
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, bool IsIndirectSrc)
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0)
Append the extracted elements from Start to Count out of the vector Op in Args.
Represents one node in the SelectionDAG.
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array...
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1293
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
int64_t getImm() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:679
bool isDenormal() const
Definition: APFloat.h:1148
const Function & getFunction() const
Return the LLVM function that this machine code represents.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
void setWorkItemIDZ(ArgDescriptor Arg)
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:539
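A self-contained example of Log2_32 (floor semantics):
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::Log2_32(32) == 5);  // exact power of two
  assert(llvm::Log2_32(40) == 5);  // floor of log2(40)
  return 0;
}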
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:941
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:164
bool hasFFBH() const
unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const
Class to represent vector types.
Definition: DerivedTypes.h:393
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT...
Definition: ValueTypes.h:73
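A sketch of EVT::getVectorVT; an LLVMContext Ctx is assumed to be available (for example via DAG.getContext()):
// Four 32-bit floats; this particular combination also exists as the simple
// machine type MVT::v4f32.
EVT VecVT = EVT::getVectorVT(Ctx, MVT::f32, 4);
assert(VecVT.isVector() && VecVT.getVectorNumElements() == 4);
assert(VecVT.getVectorElementType() == MVT::f32);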
EVT getMemoryVT() const
Return the type of the in-memory value.
Target - Wrapper for Target specific information.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
Class for arbitrary precision integers.
Definition: APInt.h:70
unsigned getByValAlign() const
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool isShader(CallingConv::ID cc)
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:420
Interface for the AMDGPU Implementation of the Intrinsic Info class.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:241
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1309
static use_iterator use_end()
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:468
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
bool hasMed3_16() const
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:471
bool isTailCall() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
LLVM_ATTRIBUTE_ALWAYS_INLINE StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:70
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:478
void replaceRegWith(unsigned FromReg, unsigned ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors...
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:312
void append(in_iter in_start, in_iter in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:394
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override
Determine if the target supports unaligned memory accesses.
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:395
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:607
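A self-contained example of APInt::getBitsSet, which sets the half-open bit range [loBit, hiBit):
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  // Bits 8 through 15 of a 32-bit value, i.e. 0x0000FF00.
  APInt Mid = APInt::getBitsSet(32, 8, 16);
  assert(Mid == APInt(32, 0x0000FF00ULL));
  return 0;
}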
unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
Value * Arg
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:254
unsigned addFlatScratchInit(const SIRegisterInfo &TRI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
The memory access reads data.
unsigned getOrigArgIndex() const
std::pair< const ArgDescriptor *, const TargetRegisterClass * > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
Provides AMDGPU specific target descriptions.
Representation of each machine instruction.
Definition: MachineInstr.h:64
static bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
These are IR-level optimization flags that may be propagated to SDNodes.
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Definition: SmallVector.h:133
SmallVector< SDValue, 32 > OutVals
Interface definition for SIInstrInfo.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:423
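A self-contained example of isShiftedMask_64:
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::isShiftedMask_64(0x0000000000FF0000ULL));   // one contiguous run of ones
  assert(!llvm::isShiftedMask_64(0x0000000000FF00F0ULL));  // two separate runs
  assert(!llvm::isShiftedMask_64(0));                      // empty sequence does not count
  return 0;
}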
unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:151
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:387
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:705
static cl::opt< unsigned > AssumeFrameIndexHighZeroBits("amdgpu-frame-index-zero-bits", cl::desc("High bits of frame index assumed to be zero"), cl::init(5), cl::ReallyHidden)
void emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:652
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
unsigned getLocMemOffset() const
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:56
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &IdxReg, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, bool IsIndirectSrc)
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:486
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode...
Definition: MCInstrInfo.h:45
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:614
static bool isClampZeroToOne(SDValue A, SDValue B)
PointerUnion< const Value *, const PseudoSourceValue * > ptrVal
TargetOptions Options
Definition: TargetMachine.h:97
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
static cl::opt< bool > EnableVGPRIndexMode("amdgpu-vgpr-index-mode", cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false))
#define I(x, y, z)
Definition: MD5.cpp:58
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations, those with specific masks.
#define N
static ArgDescriptor createStack(unsigned Reg)
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Flags getFlags() const
Return the raw flags of the source value.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
The memory access always returns the same value (or traps).
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:193
bool hasFlatInstOffsets() const
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
bool isAmdHsaOrMesa(const Function &F) const
bool shouldEmitConstantsToTextSection(const Triple &TT)
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
uint32_t Size
Definition: Profile.cpp:47
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
const Value * getValueFromVirtualReg(unsigned Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
const MachineInstrBuilder & addReg(unsigned RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:43
unsigned getOpcode() const
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:608
SDValue getValue(unsigned R) const
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:904
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
bool isInfinity() const
Return true if the value is an infinity.
constexpr bool isUInt< 16 >(uint64_t x)
Definition: MathExtras.h:346
void setSimpleHint(unsigned VReg, unsigned PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:175
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:457
bool isReg() const
isReg - Tests if this is a MO_Register operand.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getCondCode(ISD::CondCode Cond)
static bool isFrameIndexOp(SDValue Op)
bool isRegLoc() const
FunTy * getCalledFunction() const
Return the function being called if this is a direct call, otherwise return null (if it&#39;s an indirect...
Definition: CallSite.h:107
const MachinePointerInfo & getPointerInfo() const
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:345
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
unsigned getRegister() const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but may be faster. ...
bool memoperands_empty() const
Return true if we don't have any memory operands which describe the memory access done by this instr...
Definition: MachineInstr.h:546
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, unsigned Alignment=1, bool *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
unsigned getReg() const
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, unsigned Alignment=0, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void insert(iterator MBBI, MachineBasicBlock *MBB)
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
static bool canGuaranteeTCO(CallingConv::ID CC)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo)
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:284
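A self-contained example of Hi_32 (Lo_32 is its companion helper in MathExtras.h):
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t V = 0x12345678ABCDEF00ULL;
  assert(llvm::Hi_32(V) == 0x12345678U);  // upper 32 bits
  assert(llvm::Lo_32(V) == 0xABCDEF00U);  // lower 32 bits
  return 0;
}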
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:566
LLVM Value Representation.
Definition: Value.h:73
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:302
const AMDGPUImagePseudoSourceValue * getImagePSV(const SIInstrInfo &TII, const Value *ImgRsrc)
SDValue getRegister(unsigned Reg, EVT VT)
Address space for region memory. (GDS)
Definition: AMDGPU.h:257
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Definition: DataLayout.h:419
bool denormalsEnabledForType(EVT VT) const
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type...
Definition: ValueTypes.h:115
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
Definition: SelectionDAG.h:962
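A sketch of getSelectCC, assuming DAG, DL, and two i32 SDValues LHS and RHS are in scope:
// Signed max expressed as a select_cc: (LHS > RHS) ? LHS : RHS.
SDValue Max = DAG.getSelectCC(DL, LHS, RHS, LHS, RHS, ISD::SETGT);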
bool hasNoUnsignedWrap() const
SDValue getValueType(EVT)
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:81
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.h:331
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void computeKnownBitsForFrameIndex(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone...
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:59
Type * getElementType() const
Definition: DerivedTypes.h:360
IRTranslator LLVM IR MI
const APFloat & getValueAPF() const
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:49
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations...
Definition: ISDOpcodes.h:306
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:443
void RemoveOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with...
APInt bitcastToAPInt() const
Definition: APFloat.h:1094
void setWorkItemIDY(ArgDescriptor Arg)
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:160
unsigned getNumOperands() const
Conversion operators.
Definition: ISDOpcodes.h:465
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
const SDValue & getOperand(unsigned i) const
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:789
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned char TargetFlags=0) const
unsigned getLocReg() const
uint64_t getZExtValue() const
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:474
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Op, int64_t Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object...
Definition: SelectionDAG.h:806
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:126
static void Split(std::vector< std::string > &V, StringRef S)
Splits a string of comma-separated items into a vector of strings.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue >> &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
void setBytesInStackArgArea(unsigned Bytes)
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
unsigned AllocateReg(unsigned Reg)
AllocateReg - Attempt to allocate one register.
unsigned getLiveInVirtReg(unsigned PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in physical ...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand *> NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:414
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation...
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Definition: CallSite.h:271
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:584
unsigned getMaxPrivateElementSize() const
static void processShaderInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
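A sketch of EVT::getIntegerVT, again assuming an LLVMContext Ctx is available:
// A 24-bit integer has no corresponding simple machine type, while 32 bits
// does.
EVT I24 = EVT::getIntegerVT(Ctx, 24);
EVT I32 = EVT::getIntegerVT(Ctx, 32);
assert(!I24.isSimple() && I32.isSimple());
assert(I32.getSimpleVT() == MVT::i32);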
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned AllocateStack(unsigned Size, unsigned Align)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:174
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:785
bool isSignaling() const
Definition: APFloat.h:1149
Calling convention for AMDGPU code object kernels.
Definition: CallingConv.h:201
const SDValue & getBasePtr() const
LLVMContext * getContext() const
Definition: SelectionDAG.h:407
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:375
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:886
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself...
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:274
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
unsigned createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:242
iterator_range< arg_iterator > args()
Definition: Function.h:689
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:218
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:144
cmpResult compare(const APFloat &RHS) const
Definition: APFloat.h:1102
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition: Type.h:333
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:44
const BasicBlock * getParent() const
Definition: Instruction.h:67
unsigned getRegisterByName(const char *RegName, EVT VT, SelectionDAG &DAG) const override
Return the register ID of the name passed in.
bool hasCalls() const
Return true if the current function has any function calls.
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned char TargetFlags=0)
Definition: SelectionDAG.h:622
const fltSemantics & getFltSemantics() const
Definition: Type.h:169
bool hasFP16Denormals() const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
This class is used to represent ISD::LOAD nodes.
const SIRegisterInfo * getRegisterInfo() const override