#define DEBUG_TYPE "amdgpu-codegenprepare"

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

  // Members and method declarations of the pass class (see the class sketch below):
  bool HasUnsafeFPMath = false;

  unsigned getBaseElementBitWidth(const Type *T) const;
  bool needsPromotionToI32(const Type *T) const;
  bool promoteUniformOpToI32(BinaryOperator &I) const;
  bool promoteUniformOpToI32(ICmpInst &I) const;
  bool promoteUniformOpToI32(SelectInst &I) const;
  Value *expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;
  bool canWidenScalarExtLoad(LoadInst &I) const;

  bool visitInstruction(Instruction &I) { return false; }

  bool doInitialization(Module &M) override;
  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
  // ...
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // Targets with packed (VOP3P) instructions keep 16-bit vectors as-is.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  // ... (TySize and Align are computed from the data layout and the load itself)
  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}
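In concrete terms, the check above admits loads like the following (an illustrative example written for this note, not taken from the pass or its tests):

// A uniform sub-dword load from the constant address space with at least
// 4-byte alignment qualifies, e.g.:
//
//   %val = load i8, i8 addrspace(4)* %p, align 4
//
// It is "simple" (neither atomic nor volatile), narrower than 32 bits,
// dword-aligned, and divergence analysis proves the result uniform.
// visitLoadInst() further down widens such loads to a full i32 load plus trunc.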
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  // ...
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  // ... (operands are extended to i32 and the op is redone; see the sketch below)
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }
  // ...
}
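The elided middle of this function performs the actual widening. The standalone helper below is a hedged sketch of that pattern; the helper name and exact flag handling are illustrative, not the pass's own code:

// Hypothetical helper illustrating the promote/operate/truncate pattern for a
// uniform sub-32-bit binary op: extend both operands to i32, redo the op in
// 32 bits, truncate back to the original type, and replace all uses.
static void promoteBinOpSketch(BinaryOperator &I, Type *I32Ty, bool Signed) {
  IRBuilder<> Builder(&I);
  Value *Op0 = Signed ? Builder.CreateSExt(I.getOperand(0), I32Ty)
                      : Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *Op1 = Signed ? Builder.CreateSExt(I.getOperand(1), I32Ty)
                      : Builder.CreateZExt(I.getOperand(1), I32Ty);
  Value *Wide = Builder.CreateBinOp(I.getOpcode(), Op0, Op1);
  Value *Narrow = Builder.CreateTrunc(Wide, I.getType());
  Narrow->takeName(&I);
  I.replaceAllUsesWith(Narrow);
  I.eraseFromParent();
}

On top of that pattern, the nsw/nuw/exact propagation shown above is applied to the widened operation when promotedOpIsNSW/promotedOpIsNUW allow it.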
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  // ...
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;
  // ... (operands are sign- or zero-extended according to the predicate and
  //      the compare is redone in i32)
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  // ...
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;
  // ...
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");
  // ... (the operand is zero-extended, bit-reversed in i32, shifted right by
  //      the width difference, and truncated back)
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  // ...
  return HasDenormals ^ IsOne;
}

bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();
  // ...
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                   FMF.allowReciprocal();
  // ...
  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);
  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    // ...
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }
  // ...
}
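In the branches above, Decl is the declaration of an AMDGPU fast-division intrinsic obtained earlier in the function. A sketch of that elided preamble (the intrinsic name and the 2.5 ULP gate are stated from memory of this pass and should be treated as assumptions):

// Sketch of the elided preamble of visitFDiv(): only f32 divides whose
// !fpmath metadata permits at least 2.5 ULP of error take the fast path.
const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
float ULP = FPOp->getFPAccuracy();   // maximum permitted error, from !fpmath
if (ULP < 2.5f)
  return false;                      // not enough slack for the approximate expansion

Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);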
  return std::make_pair(Lo, Hi);              // end of getMul64()

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;  // high 32 bits of the product
}
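The helper whose return statement appears above forms the full 64-bit product of two i32 values and splits it into halves. Only the return is visible in this excerpt; a hedged sketch of the likely body:

// Sketch of getMul64: widen both operands to i64, multiply, then split the
// product into its low and high 32-bit halves.
static std::pair<Value *, Value *> getMul64Sketch(IRBuilder<> &Builder,
                                                  Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *Wide = Builder.CreateMul(Builder.CreateZExt(LHS, I64Ty),
                                  Builder.CreateZExt(RHS, I64Ty));
  Value *Lo = Builder.CreateTrunc(Wide, I32Ty);
  Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Wide, Builder.getInt64(32)),
                                  I32Ty);
  return std::make_pair(Lo, Hi);
}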
Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  // ...
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = 32 - SignBits;
  // ...
  // float fr = fmad(fqneg, fb, fa), with fast-math flags copied from fq.
  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
  // ...
}
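The fused multiply-add above is the core of the 24-bit fast path. An outline of the surrounding, elided logic, paraphrased from the visible variable names (treat the details as assumptions):

// Outline of expandDivRem24 (paraphrase, not verbatim source):
//  1. Bail out (return nullptr) unless both operands are known to fit in
//     24 bits; for the signed case DivBits = 32 - min(sign bits) must be <= 24.
//  2. Convert Num and Den to float (si/uitofp as appropriate) and compute an
//     approximate quotient fq = fa * rcp(fb), truncated toward zero.
//  3. Form the float remainder fr = fmad(-fq, fb, fa) (the call shown above),
//     convert fq to an integer quotient, and add 1 where |fr| >= |fb|.
//  4. For rem, compute Num - Den * quotient; finally truncate the i32 result
//     to DivBits and sign-/zero-extend it back to the original width.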
Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  // ...
  if (isa<Constant>(Den))
    return nullptr; // Leave constant denominators for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  // Try the cheaper 24-bit expansion first.
  if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
    // ...
  }

  Value *Sign = nullptr;
  if (IsSigned) {
    // ...
    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;
    // ... (operands are made non-negative before the unsigned expansion)
  }

  // ...
  Value *RCP_LO, *RCP_HI;
  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);
  // ...
  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
  // ...
  Value *Res;
  if (IsDiv) {
    // ...
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
    // ...
  } else {
    // ...
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
    // ...
  }
  // ... (the result is sign-corrected with Sign in the signed case)
}
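Between the fragments above, the 32-bit expansion follows the usual fixed-point reciprocal sequence. A paraphrased outline based on the visible variable names (not verbatim source):

// Outline of the unsigned core of expandDivRem32 (paraphrase):
//  * For signed ops, take absolute values and remember the result sign
//    (the CreateXor of the two sign words shown above).
//  * RCP: a float-based reciprocal estimate of Den, scaled into 32-bit
//    fixed point.
//  * Refine RCP: {RCP_LO, RCP_HI} = RCP * Den via getMul64(), derive an error
//    term from it, and fold the correction back into RCP with a mulhu.
//  * Quotient = mulhu(RCP, Num); Remainder = Num - Quotient * Den.
//  * Apply conditional +1/-1 corrections (the CreateSelect lines above) to
//    produce the final Div or Rem, then undo the sign adjustment.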
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      Ty->getScalarSizeInBits() <= 32) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      // ...
      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        // ...
        Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
        if (!NewElt)
          NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        // ...
      }
    } else {
      NewDiv = expandDivRem32(Builder, I, Num, Den);
    }
    // ... (if an expansion was produced, it replaces I and Changed is set)
  }

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  // ...
  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    // ... (the sub-dword load is rebuilt as an i32 load; see the note below)
    ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0)); // !range lower bound
    // ...
    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    // ...
  }
  return false;
}
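The widening path elided above rewrites a qualifying load through an i32 pointer and truncates the result back to the original type. An illustrative before/after in IR, constructed for this note:

// Before:
//   %v = load i16, i16 addrspace(4)* %p, align 4
// After:
//   %c = bitcast i16 addrspace(4)* %p to i32 addrspace(4)*
//   %w = load i32, i32 addrspace(4)* %c, align 4
//   %t = trunc i32 %w to i16      ; all uses of %v now use %t
//
// Existing !range metadata is either dropped (when its lower bound is zero,
// the isNullValue() case) or rewritten on the widened load so that it makes
// no claim about the newly introduced high bits.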
bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  // ...
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  // ... (the subtarget and HasUnsafeFPMath are cached from the TargetMachine)
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    // ... (iterate carefully, since visitors may erase the current instruction)
    MadeChange |= visit(*I);
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}
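The factory above is how the legacy pass manager obtains the pass. A minimal usage sketch; the surrounding setup is illustrative and not taken from this file (in practice the AMDGPU target machine schedules the pass itself, and runOnFunction() bails out when no TargetPassConfig is available):

#include "llvm/IR/LegacyPassManager.h"

// Illustrative only: run the pass over a module that targets amdgcn.
void runAMDGPUCodeGenPrepareOn(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createAMDGPUCodeGenPreparePass()); // declared in the target's AMDGPU.h
  PM.run(M);
}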