#define DEBUG_TYPE "amdgpu-atomic-optimizer"

  // Tail of the anonymous DPP control-code enum used by the scan below.
  DPP_ROW_BCAST15 = 0x142,
  DPP_ROW_BCAST31 = 0x143
};
// One atomic operation that the pass has decided it can rewrite.
struct ReplacementInfo {
  Instruction *I;
  Instruction::BinaryOps Op;
  unsigned ValIdx;
  bool ValDivergent;
};

// ... (inside the AMDGPUAtomicOptimizer FunctionPass declaration)

  void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
                      unsigned ValIdx, bool ValDivergent) const;
  void setConvergent(CallInst *const CI) const;
bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  // ...
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  // ...

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }
  // ...
  return Changed;
}
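// Editor's sketch (not in the listing above): runOnFunction also has to
// discover whether the subtarget supports DPP and whether the function is a
// pixel shader before visiting the IR. Assuming the usual TargetPassConfig
// plumbing, that elided setup looks roughly like:
//
//   const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
//   const TargetMachine &TM = TPC.getTM<TargetMachine>();
//   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
//   HasDPP = ST.hasDPP();
//   IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;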
void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // ... (early exit for unsupported address spaces and operations; see the
  //      sketch after this function)
    Op = Instruction::Sub;
  // ...

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // ... (a divergent pointer operand means every lane targets a different
  //      address, so there is nothing to combine)

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // A divergent value operand can only be handled with DPP, and only for
  // 32-bit atomics.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // Remember the instruction; the rewrite happens later in optimizeAtomic.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}
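// Editor's sketch (not in the listing): the early-exit code elided at the top
// of visitAtomicRMWInst filters by address space and maps the atomicrmw
// operation onto a plain binary opcode; the listing only shows the Sub case.
// Assuming the AMDGPUAS address-space enumerators, it is roughly:
//
//   switch (I.getPointerAddressSpace()) {
//   default:
//     return;
//   case AMDGPUAS::GLOBAL_ADDRESS:
//   case AMDGPUAS::LOCAL_ADDRESS:
//     break;
//   }
//
//   Instruction::BinaryOps Op;
//   switch (I.getOperation()) {
//   default:
//     return;
//   case AtomicRMWInst::Add:
//     Op = Instruction::Add;
//     break;
//   case AtomicRMWInst::Sub:
//     Op = Instruction::Sub;
//     break;
//   }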
void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  // ... (map the handled buffer-atomic intrinsic IDs onto a binary opcode;
  //      see the sketch after this function)
    Op = Instruction::Sub;
  // ...

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // Same restriction as for atomicrmw: a divergent value needs DPP and a
  // 32-bit type.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // ... (any other divergent operand also blocks the optimization)

  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}
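// Editor's sketch (not in the listing): the intrinsic-ID switch elided at the
// top of visitIntrinsicInst mirrors the atomicrmw one. Which buffer-atomic
// intrinsics are matched is not visible here; assuming the legacy
// llvm.amdgcn.buffer.atomic.add/sub forms, it is roughly:
//
//   Instruction::BinaryOps Op;
//   switch (I.getIntrinsicID()) {
//   default:
//     return;
//   case Intrinsic::amdgcn_buffer_atomic_add:
//     Op = Instruction::Add;
//     break;
//   case Intrinsic::amdgcn_buffer_atomic_sub:
//     Op = Instruction::Sub;
//     break;
//   }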
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           Instruction::BinaryOps Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // ...
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);

  // ... (tail of the call that finishes the lane-index computation; see the
  //      sketch below)
                                         {ExtractHi, PartialMbcnt});
  // ...

  Value *LaneOffset = nullptr;
  Value *NewV = nullptr;
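  // Editor's sketch (not in the listing): the lane-index computation whose
  // tail appears above. It splits the 64-bit ballot of active lanes into two
  // i32 halves and counts the active lanes below the current one with
  // mbcnt.lo / mbcnt.hi. The builder name B and the Ballot value are
  // assumptions; ExtractHi and PartialMbcnt match the fragment above.
  //
  //   Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);
  //   Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
  //   Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
  //   Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
  //   CallInst *const PartialMbcnt = B.CreateIntrinsic(
  //       Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
  //   CallInst *const Mbcnt = B.CreateIntrinsic(
  //       Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, PartialMbcnt});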
  if (ValDivergent) {
    // ... (set inactive lanes to the identity so they do not pollute the scan)
    setConvergent(SetInactive);

    // DPP controls for the six steps of the wavefront-wide scan: row shifts
    // by 1, 2, 4 and 8 lanes, then two row broadcasts. The row masks keep the
    // broadcast steps confined to the rows they are allowed to update.
    const unsigned Iters = 6;
    const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1,     DPP_ROW_SR2,
                                     DPP_ROW_SR4,     DPP_ROW_SR8,
                                     DPP_ROW_BCAST15, DPP_ROW_BCAST31};
    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};

    // ...

    for (unsigned Idx = 0; Idx < Iters; Idx++) {
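      // (Editor's sketch - the actual loop body is elided in this listing.)
      // Each step pulls the running value from DPPCtrl[Idx] lanes away with a
      // DPP move and folds it in with the scanned operation. The use of the
      // llvm.amdgcn.update.dpp intrinsic, the builder B, and NewV as the
      // running value are assumptions, not taken from the listing:
      CallInst *const DPP = B.CreateIntrinsic(
          Intrinsic::amdgcn_update_dpp, Ty,
          {UndefValue::get(Ty), NewV, B.getInt32(DPPCtrl[Idx]),
           B.getInt32(RowMask[Idx]), B.getInt32(0xf), B.getFalse()});
      setConvergent(DPP);
      NewV = B.CreateBinOp(Op, NewV, DPP);
    }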
    // Read the scanned value from the last lane, which has accumulated the
    // contribution of every active lane - this becomes the value fed to the
    // single atomic operation.
    if (TyBitWidth == 64) {
      // ... (split the 64-bit value into two i32 halves)
      Value *const ExtractHi = /* ... */;
      // ... (read both halves from the last lane)
      setConvergent(ReadLaneLo);
      // ...
      setConvergent(ReadLaneHi);
      // ... (reassemble the halves into a <2 x i32> and bitcast back)
      Value *const Insert = /* ... */;
      // ...
    } else if (TyBitWidth == 32) {
      // ...
      setConvergent(ReadLane);
      // ...
    }
  }
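  // Editor's sketch (not in the listing): between the scan above and the
  // broadcast below, the pass arranges for only the first active lane to
  // perform the real atomic. The lane index from mbcnt is compared against
  // zero, the block is split so the original atomic sits under that
  // condition, and a PHI later carries its result back to all lanes. The
  // builder name B and the local variable names are assumptions; Mbcnt, Ty,
  // TyBitWidth and DT come from the surrounding code:
  //
  //   Value *const Cond = B.CreateICmpEQ(B.CreateIntCast(Mbcnt, Ty, false),
  //                                      B.getIntN(TyBitWidth, 0));
  //   Instruction *const SingleLaneTerminator =
  //       SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);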
  // Broadcast the value returned by the single atomic (the old memory value)
  // from the first active lane to every lane.
  Value *BroadcastI = nullptr;

  if (TyBitWidth == 64) {
    // ... (split the returned 64-bit value into two i32 halves)
    Value *const ExtractHi = /* ... */;
    // ... (readfirstlane each half)
    setConvergent(ReadFirstLaneLo);
    // ...
    setConvergent(ReadFirstLaneHi);
    // ... (reassemble the halves and bitcast the result into BroadcastI)
    Value *const Insert = /* ... */;
    // ...
  } else if (TyBitWidth == 32) {
    // ...
    setConvergent(ReadFirstLane);
    BroadcastI = ReadFirstLane;
  }
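  // Editor's sketch (not in the listing): each lane's final result is the
  // broadcast old value combined with its own lane offset from the scan, and
  // the original atomic's uses are redirected to it. The builder name B is an
  // assumption; Op, BroadcastI and LaneOffset come from the code above:
  //
  //   Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);
  //   I.replaceAllUsesWith(Result);
  //
  // (The original instruction is finally erased from its block.)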
}

void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const {
  // Mark the call convergent so later passes do not move it across
  // control-flow changes.
  CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
}
450 "AMDGPU atomic optimizations",
false,
false)
457 return new AMDGPUAtomicOptimizer();
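// Editor's note (a sketch, not the actual registration code): the factory
// above is what the AMDGPU target calls when it assembles its IR pass
// pipeline, typically guarded by an option that enables the atomic
// optimizations, roughly:
//
//   if (EnableAtomicOptimizations)   // hypothetical flag name
//     addPass(createAMDGPUAtomicOptimizerPass());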