LLVM8Doxygen/AMDGPUAtomicOptimizer_8cpp_source.html

 //===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 /// \file
 /// This pass optimizes atomic operations by using a single lane of a wavefront
 /// to perform the atomic operation, thus reducing contention on that memory
 /// location.
 //
 //===----------------------------------------------------------------------===//

 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"

 #define DEBUG_TYPE "amdgpu-atomic-optimizer"

 using namespace llvm;

 namespace {

 /// Control codes handed to the llvm.amdgcn.mov.dpp intrinsic as its dpp_ctrl
 /// operand when optimizeAtomic builds its cross-lane scan.
 ///
 /// NOTE(review): the per-value meanings below are read off the enumerator
 /// names; confirm against the ISA documentation before relying on them.
 enum DPP_CTRL {
   // Presumably "row shift right" by 1, 2, 4 and 8 lanes.
   DPP_ROW_SR1 = 0x111,
   DPP_ROW_SR2 = 0x112,
   DPP_ROW_SR4 = 0x114,
   DPP_ROW_SR8 = 0x118,

   // Presumably "wavefront shift right" by 1 lane.
   DPP_WF_SR1 = 0x138,

   // Presumably broadcasts of lane 15 / lane 31 into the following rows.
   DPP_ROW_BCAST15 = 0x142,
   DPP_ROW_BCAST31 = 0x143,
 };

 /// Record of one atomic operation that the visitors decided can be optimized.
 /// Entries are queued during the visit and consumed afterwards by
 /// optimizeAtomic, so the IR is not mutated while it is being traversed.
 ///
 /// The field order matters: the visitors build this struct with positional
 /// aggregate initialization.
 struct ReplacementInfo {
   // The atomic instruction (atomicrmw or buffer-atomic intrinsic call) to
   // rewrite.
   Instruction *I;
   // Non-atomic binary operation equivalent to the atomic's operation.
   Instruction::BinaryOps Op;
   // Index of the operand of I that holds the value being combined.
   unsigned ValIdx;
   // True when divergence analysis reported the value operand as divergent.
   bool ValDivergent;
 };

 /// Function pass that finds atomic add/sub operations whose address is
 /// uniform across the wavefront and rewrites them so that only a single lane
 /// issues the atomic.
 ///
 /// The pass works in two phases inside runOnFunction: the InstVisitor hooks
 /// (visitAtomicRMWInst / visitIntrinsicInst) collect candidates into
 /// ToReplace, then optimizeAtomic rewrites each recorded candidate.
 class AMDGPUAtomicOptimizer : public FunctionPass,
                               public InstVisitor<AMDGPUAtomicOptimizer> {
 private:
   // Candidates gathered during the visit; emptied at the end of
   // runOnFunction.
   SmallVector<ReplacementInfo, 8> ToReplace;

   // Per-function state, (re)assigned at the top of runOnFunction. The
   // defaults only ensure the members never hold indeterminate values before
   // the first run.
   const LegacyDivergenceAnalysis *DA = nullptr;
   const DataLayout *DL = nullptr;
   // Null when no DominatorTreeWrapperPass result is available.
   DominatorTree *DT = nullptr;
   // Whether the current function's subtarget reports DPP support.
   bool HasDPP = false;
   // Whether the current function uses the AMDGPU_PS calling convention.
   bool IsPixelShader = false;

   /// Rewrite the atomic \p I so that one lane performs it on behalf of the
   /// wavefront. \p Op, \p ValIdx and \p ValDivergent are the values recorded
   /// in the corresponding ReplacementInfo.
   void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
                       unsigned ValIdx, bool ValDivergent) const;

   /// Add the convergent attribute to the call \p CI.
   void setConvergent(CallInst *const CI) const;

 public:
   static char ID;

   AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

   /// Collect and rewrite the optimizable atomics in \p F.
   /// \returns true if at least one atomic was rewritten.
   bool runOnFunction(Function &F) override;

   /// Declare that the dominator tree is preserved, and that divergence
   /// analysis and the target pass configuration are required.
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addRequired<LegacyDivergenceAnalysis>();
     AU.addRequired<TargetPassConfig>();
   }

   /// InstVisitor hook: record \p I if it is an optimizable atomicrmw.
   void visitAtomicRMWInst(AtomicRMWInst &I);
   /// InstVisitor hook: record \p I if it is an optimizable buffer atomic.
   void visitIntrinsicInst(IntrinsicInst &I);
 };

 } // namespace

 // Storage for the pass's static ID member, which the constructor hands to
 // FunctionPass.
 // NOTE(review): the legacy pass manager is understood to identify a pass by
 // the address of this object rather than its value - confirm.
 char AMDGPUAtomicOptimizer::ID = 0;

 // Exposes the ID object under a name in the llvm namespace so that code
 // outside this file's anonymous namespace can refer to the pass.
 char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

 /// Pass entry point: caches the analyses and subtarget facts needed for
 /// \p F, lets the visitor hooks collect candidate atomics, then rewrites each
 /// candidate.
 ///
 /// \returns true if any atomic was rewritten, false otherwise (including
 /// when the function is skipped).
 bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;

   // Refresh the per-function state consulted by the visitors and by
   // optimizeAtomic.
   DA = &getAnalysis<LegacyDivergenceAnalysis>();
   DL = &F.getParent()->getDataLayout();

   // The dominator tree is optional; it is only kept up to date when present.
   DT = nullptr;
   if (DominatorTreeWrapperPass *const Wrapper =
           getAnalysisIfAvailable<DominatorTreeWrapperPass>())
     DT = &Wrapper->getDomTree();

   const TargetMachine &Machine =
       getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
   HasDPP = Machine.getSubtarget<GCNSubtarget>(F).hasDPP();
   IsPixelShader = (CallingConv::AMDGPU_PS == F.getCallingConv());

   // Phase 1: gather candidates without touching the IR.
   visit(F);

   if (ToReplace.empty())
     return false;

   // Phase 2: rewrite every recorded candidate.
   for (const ReplacementInfo &Candidate : ToReplace)
     optimizeAtomic(*Candidate.I, Candidate.Op, Candidate.ValIdx,
                    Candidate.ValDivergent);

   ToReplace.clear();
   return true;
 }

 /// Visitor hook for atomicrmw instructions. Queues \p I for rewriting when it
 /// is an add or sub on global or local memory, its pointer operand is not
 /// divergent, and - if its value operand is divergent - the subtarget has DPP
 /// and the operation is 32 bits wide.
 void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
   // Only global and local address spaces are handled.
   const unsigned AddrSpace = I.getPointerAddressSpace();
   if (AddrSpace != AMDGPUAS::GLOBAL_ADDRESS &&
       AddrSpace != AMDGPUAS::LOCAL_ADDRESS)
     return;

   // Translate the atomic operation into its plain binary counterpart; any
   // other operation is left alone.
   Instruction::BinaryOps Op;
   const AtomicRMWInst::BinOp RMWOp = I.getOperation();
   if (RMWOp == AtomicRMWInst::Add)
     Op = Instruction::Add;
   else if (RMWOp == AtomicRMWInst::Sub)
     Op = Instruction::Sub;
   else
     return;

   const unsigned PtrIdx = 0;
   const unsigned ValIdx = 1;

   // A divergent pointer means the lanes target different addresses, so there
   // is nothing to combine.
   if (DA->isDivergent(I.getOperand(PtrIdx)))
     return;

   // A divergent value means each lane contributes something different; that
   // case is only handled with DPP on the subtarget and a 32-bit operation.
   const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
   if (ValDivergent) {
     if (!HasDPP)
       return;
     if (DL->getTypeSizeInBits(I.getType()) != 32)
       return;
   }

   // Remember the instruction; the rewrite happens after the visit completes.
   const ReplacementInfo Candidate = {&I, Op, ValIdx, ValDivergent};
   ToReplace.push_back(Candidate);
 }

 /// Visitor hook for intrinsic calls. Queues \p I for rewriting when it is one
 /// of the buffer atomic add/sub intrinsics, none of its operands after the
 /// value operand is divergent, and - if the value operand is divergent - the
 /// subtarget has DPP and the operation is 32 bits wide.
 void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
   Instruction::BinaryOps Op;

   switch (I.getIntrinsicID()) {
   case Intrinsic::amdgcn_buffer_atomic_add:
   case Intrinsic::amdgcn_struct_buffer_atomic_add:
   case Intrinsic::amdgcn_raw_buffer_atomic_add:
     Op = Instruction::Add;
     break;
   case Intrinsic::amdgcn_buffer_atomic_sub:
   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
     Op = Instruction::Sub;
     break;
   default:
     // Not an intrinsic this pass knows how to optimize.
     return;
   }

   // The value being combined is the first operand of these intrinsics.
   const unsigned ValIdx = 0;
   const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

   // A divergent value is only handled with DPP on the subtarget and a 32-bit
   // operation.
   if (ValDivergent) {
     if (!HasDPP)
       return;
     if (DL->getTypeSizeInBits(I.getType()) != 32)
       return;
   }

   // Every remaining operand must be uniform, otherwise the lanes are not all
   // performing the same operation on the same location.
   for (unsigned Idx = ValIdx + 1; Idx < I.getNumOperands(); ++Idx)
     if (DA->isDivergent(I.getOperand(Idx)))
       return;

   // Remember the instruction; the rewrite happens after the visit completes.
   const ReplacementInfo Candidate = {&I, Op, ValIdx, ValDivergent};
   ToReplace.push_back(Candidate);
 }

 /// Rewrite the atomic \p I so that a single lane performs one atomic on
 /// behalf of every active lane, then reconstruct the value each lane would
 /// have received from its own atomic.
 ///
 /// The generated IR:
 ///  1. (pixel shaders only) guards everything with a llvm.amdgcn.ps.live
 ///     branch so helper lanes take no part;
 ///  2. reads exec and uses mbcnt to count the active lanes below this one;
 ///  3. computes the combined value for the whole wavefront (NewV) and this
 ///     lane's offset into it (LaneOffset) - with a DPP scan when the value is
 ///     divergent, or with multiplications when it is uniform;
 ///  4. branches so only the lane with no active lanes below it executes a
 ///     clone of \p I operating on NewV;
 ///  5. broadcasts that lane's result with readfirstlane and applies \p Op
 ///     with LaneOffset to obtain the per-lane result;
 ///  6. replaces all uses of \p I and erases it.
 ///
 /// \param I            The atomic to replace; erased on return.
 /// \param Op           Non-atomic equivalent of the atomic's operation. Used
 ///                     to combine the broadcast result with the lane offset.
 /// \param ValIdx       Operand index of the value operand of \p I.
 /// \param ValDivergent Whether the value operand differs between lanes.
 void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                            Instruction::BinaryOps Op,
                                            unsigned ValIdx,
                                            bool ValDivergent) const {
   LLVMContext &Context = I.getContext();

   // Start building just before the instruction.
   IRBuilder<> B(&I);

   // If we are in a pixel shader, because of how we have to mask out helper
   // lane invocations, we need to record the entry and exit BB's.
   BasicBlock *PixelEntryBB = nullptr;
   BasicBlock *PixelExitBB = nullptr;

   // If we're optimizing an atomic within a pixel shader, we need to wrap the
   // entire atomic operation in a helper-lane check. We do not want any helper
   // lanes that are around only for the purposes of derivatives to take part
   // in any cross-lane communication, and we use a branch on whether the lane is
   // live to do this.
   if (IsPixelShader) {
     // Record I's original position as the entry block.
     PixelEntryBB = I.getParent();

     Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
     Instruction *const NonHelperTerminator =
         SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

     // Record I's new position as the exit block.
     PixelExitBB = I.getParent();

     I.moveBefore(NonHelperTerminator);
     B.SetInsertPoint(&I);
   }

   Type *const Ty = I.getType();
   const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
   Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);

   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
   Value *const V = I.getOperand(ValIdx);

   // We need to know how many lanes are active within the wavefront, and we do
   // this by getting the exec register, which tells us all the lanes that are
   // active.
   MDNode *const RegName =
       llvm::MDNode::get(Context, llvm::MDString::get(Context, "exec"));
   Value *const Metadata = llvm::MetadataAsValue::get(Context, RegName);
   CallInst *const Exec =
       B.CreateIntrinsic(Intrinsic::read_register, {B.getInt64Ty()}, {Metadata});
   setConvergent(Exec);

   // We need to know how many lanes are active within the wavefront that are
   // below us. If we counted each lane linearly starting from 0, a lane is
   // below us only if its associated index was less than ours. We do this by
   // using the mbcnt intrinsic.
   Value *const BitCast = B.CreateBitCast(Exec, VecTy);
   Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
   Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
   CallInst *const PartialMbcnt = B.CreateIntrinsic(
       Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
   CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
                                             {ExtractHi, PartialMbcnt});

   Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);

   Value *LaneOffset = nullptr;
   Value *NewV = nullptr;

   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
   if (ValDivergent) {
     // First we need to set all inactive invocations to 0, so that they can
     // correctly contribute to the final result.
     CallInst *const SetInactive = B.CreateIntrinsic(
         Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)});
     setConvergent(SetInactive);
     NewV = SetInactive;

     const unsigned Iters = 6;
     const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1,     DPP_ROW_SR2,
                                      DPP_ROW_SR4,     DPP_ROW_SR8,
                                      DPP_ROW_BCAST15, DPP_ROW_BCAST31};
     const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};

     // The per-lane contributions always accumulate additively, even for an
     // atomic sub: N lanes subtracting v_0..v_{N-1} are equivalent to one
     // subtraction of (v_0 + ... + v_{N-1}), and lane i must observe
     // old - (v_0 + ... + v_{i-1}). Op itself is therefore only applied when
     // the broadcast atomic result is combined with LaneOffset below, exactly
     // as in the uniform path. Scanning with Sub here would produce
     // differences of neighbouring lanes instead of a prefix sum.
     const Instruction::BinaryOps ScanOp =
         Op == Instruction::Sub ? Instruction::Add : Op;

     // This loop performs an inclusive scan across the wavefront, with all lanes
     // active (by using the WWM intrinsic).
     for (unsigned Idx = 0; Idx < Iters; Idx++) {
       CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty,
                                               {NewV, B.getInt32(DPPCtrl[Idx]),
                                                B.getInt32(RowMask[Idx]),
                                                B.getInt32(0xf), B.getFalse()});
       setConvergent(DPP);
       Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);

       NewV = B.CreateBinOp(ScanOp, NewV, WWM);
       NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
     }

     // NewV has returned the inclusive scan of V, but for the lane offset we
     // require an exclusive scan. We do this by shifting the values from the
     // entire wavefront right by 1, and by setting the bound_ctrl (last argument
     // to the intrinsic below) to true, we can guarantee that 0 will be shifted
     // into the 0'th invocation.
     CallInst *const DPP =
         B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty},
                           {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf),
                            B.getInt32(0xf), B.getTrue()});
     setConvergent(DPP);
     LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);

     // Read the value from the last lane, which has accumulated the values of
     // each active lane in the wavefront. This will be our new value with which
     // we will provide to the atomic operation.
     // NOTE(review): lane index 63 is hard-coded, which presumes a 64-lane
     // wavefront - confirm for the targets this pass runs on.
     if (TyBitWidth == 64) {
       Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
       Value *const ExtractHi =
           B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
       CallInst *const ReadLaneLo = B.CreateIntrinsic(
           Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
       setConvergent(ReadLaneLo);
       CallInst *const ReadLaneHi = B.CreateIntrinsic(
           Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
       setConvergent(ReadLaneHi);
       Value *const PartialInsert = B.CreateInsertElement(
           UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
       Value *const Insert =
           B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
       NewV = B.CreateBitCast(Insert, Ty);
     } else if (TyBitWidth == 32) {
       CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
                                                    {}, {NewV, B.getInt32(63)});
       setConvergent(ReadLane);
       NewV = ReadLane;
     } else {
       llvm_unreachable("Unhandled atomic bit width");
     }
   } else {
     // Get the total number of active lanes we have by using popcount.
     Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Exec);
     Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);

     // Calculate the new value we will be contributing to the atomic operation
     // for the entire wavefront.
     NewV = B.CreateMul(V, CtpopCast);
     LaneOffset = B.CreateMul(V, MbcntCast);
   }

   // We only want a single lane to enter our new control flow, and we do this
   // by checking if there are any active lanes below us. Only one lane will
   // have 0 active lanes below us, so that will be the only one to progress.
   Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));

   // Store I's original basic block before we split the block.
   BasicBlock *const EntryBB = I.getParent();

   // We need to introduce some new control flow to force a single lane to be
   // active. We do this by splitting I's basic block at I, and introducing the
   // new block such that:
   //   entry --> single_lane --.
   //        `------------------+--> exit
   Instruction *const SingleLaneTerminator =
       SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

   // Move the IR builder into single_lane next.
   B.SetInsertPoint(SingleLaneTerminator);

   // Clone the original atomic operation into single lane, replacing the
   // original value with our newly created one.
   Instruction *const NewI = I.clone();
   B.Insert(NewI);
   NewI->setOperand(ValIdx, NewV);

   // Move the IR builder into exit next, and start inserting just before the
   // original instruction.
   B.SetInsertPoint(&I);

   // Create a PHI node to get our new atomic result into the exit block.
   PHINode *const PHI = B.CreatePHI(Ty, 2);
   PHI->addIncoming(UndefValue::get(Ty), EntryBB);
   PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

   // We need to broadcast the value who was the lowest active lane (the first
   // lane) to all other lanes in the wavefront. We use an intrinsic for this,
   // but have to handle 64-bit broadcasts with two calls to this intrinsic.
   Value *BroadcastI = nullptr;

   if (TyBitWidth == 64) {
     Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
     Value *const ExtractHi =
         B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
     CallInst *const ReadFirstLaneLo =
         B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
     setConvergent(ReadFirstLaneLo);
     CallInst *const ReadFirstLaneHi =
         B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
     setConvergent(ReadFirstLaneHi);
     Value *const PartialInsert = B.CreateInsertElement(
         UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
     Value *const Insert =
         B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
     BroadcastI = B.CreateBitCast(Insert, Ty);
   } else if (TyBitWidth == 32) {
     CallInst *const ReadFirstLane =
         B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
     setConvergent(ReadFirstLane);
     BroadcastI = ReadFirstLane;
   } else {
     llvm_unreachable("Unhandled atomic bit width");
   }

   // Now that we have the result of our single atomic operation, we need to
   // get our individual lane's slice into the result. We use the lane offset we
   // previously calculated combined with the atomic result value we got from the
   // first lane, to get our lane's index into the atomic result.
   Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);

   if (IsPixelShader) {
     // Need a final PHI to reconverge to above the helper lane branch mask.
     B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

     PHINode *const PHI = B.CreatePHI(Ty, 2);
     PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
     PHI->addIncoming(Result, I.getParent());
     I.replaceAllUsesWith(PHI);
   } else {
     // Replace the original atomic instruction with the new one.
     I.replaceAllUsesWith(Result);
   }

   // And delete the original.
   I.eraseFromParent();
 }

 /// Attach the convergent attribute to the call \p CI at function-attribute
 /// position. optimizeAtomic applies this to the cross-lane intrinsic calls it
 /// creates.
 void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const {
   const unsigned FnAttrIndex = AttributeList::FunctionIndex;
   CI->addAttribute(FnAttrIndex, Attribute::Convergent);
 }

 // Pass registration: names the pass with DEBUG_TYPE
 // ("amdgpu-atomic-optimizer"), gives it a human-readable description, and
 // lists the passes it depends on. The dependencies mirror the addRequired
 // calls in getAnalysisUsage.
 // NOTE(review): the two trailing `false` arguments are presumably the
 // "CFG only" and "is analysis" flags - confirm against PassSupport.h.
 INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                       "AMDGPU atomic optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
 INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                     "AMDGPU atomic optimizations", false, false)

 /// Factory for the atomic optimizer pass.
 ///
 /// \returns a newly heap-allocated AMDGPUAtomicOptimizer. The caller
 /// (presumably a pass manager) is responsible for its lifetime.
 FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
   FunctionPass *const Pass = new AMDGPUAtomicOptimizer();
   return Pass;
 }
llvm::Intrinsic::amdgcn_raw_buffer_atomic_add
Definition: Intrinsics.h:1029

LegacyDivergenceAnalysis.h

llvm::Instruction::eraseFromParent
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:68

llvm::DataLayout
A parsed version of the target data layout string, and methods for querying it. ...
Definition: DataLayout.h:111

TargetPassConfig.h

llvm::Intrinsic::amdgcn_readlane
Definition: Intrinsics.h:1049

llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:89

llvm::Intrinsic::amdgcn_mbcnt_lo
Definition: Intrinsics.h:1021

llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:2701

llvm::IRBuilder::CreateBinOp
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1298

Context
LLVMContext & Context
Definition: NVVMIntrRange.cpp:72

llvm::InstVisitor
Base class for instruction visitors.
Definition: InstVisitor.h:81

AMDGPUSubtarget.h
AMDGPU specific subclass of TargetSubtarget.

llvm
This class represents lattice values for constants.
Definition: AllocatorList.h:24

llvm::Intrinsic::read_register
Definition: Intrinsics.h:247

llvm::ARM_MB::ST
Definition: ARMBaseInfo.h:74

llvm::MDString::get
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:454

llvm::PHINode
Definition: Instructions.h:2557

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1438

AMDGPU.h

llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:705

llvm::Intrinsic::amdgcn_struct_buffer_atomic_add
Definition: Intrinsics.h:1079

llvm::MDNode
Metadata node.
Definition: Metadata.h:864

llvm::TargetPassConfig::getTM
TMC & getTM() const
Get the right type of TargetMachine for this target.
Definition: TargetPassConfig.h:152

F
F(f)

llvm::Function
Definition: Function.h:60

llvm::AtomicRMWInst::Sub
*p = old - v
Definition: Instructions.h:710

llvm::AtomicRMWInst
an instruction that atomically reads a memory location, combines it with another value, and then stores the result back.
Definition: Instructions.h:692

llvm::CallBase::addAttribute
void addAttribute(unsigned i, Attribute::AttrKind Kind)
adds the attribute to the list of attributes.
Definition: InstrTypes.h:1261

llvm::AtomicRMWInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:804

llvm::AMDGPUAtomicOptimizerID
char & AMDGPUAtomicOptimizerID
Definition: AMDGPUAtomicOptimizer.cpp:84

llvm::IRBuilderBase::getInt32Ty
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:347

llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:66

INITIALIZE_PASS_DEPENDENCY
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:51

llvm::Intrinsic::amdgcn_readfirstlane
Definition: Intrinsics.h:1048

DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPUAtomicOptimizer.cpp:25

llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:371

llvm::IRBuilderBase::getInt64Ty
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:352

llvm::AtomicRMWInst::getOperation
BinOp getOperation() const
Definition: Instructions.h:745

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:743

llvm::Intrinsic::amdgcn_wwm
Definition: Intrinsics.h:1115

llvm::DominatorTreeWrapperPass::getDomTree
DominatorTree & getDomTree()
Definition: Dominators.h:270

false
Definition: StackSlotColoring.cpp:142

llvm::Intrinsic::amdgcn_struct_buffer_atomic_sub
Definition: Intrinsics.h:1085

llvm::Instruction
Definition: Instruction.h:44

llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition: TargetPassConfig.h:86

llvm::Instruction::clone
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following: ...
Definition: Instruction.cpp:716

llvm::IRBuilder::CreateBitCast
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1732

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245

llvm::Intrinsic::ID
ID
Definition: Intrinsics.h:37

llvm::Instruction::BinaryOps
BinaryOps
Definition: Instruction.h:680

llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:429

llvm::IRBuilderBase::CreateUnaryIntrinsic
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition: IRBuilder.cpp:734

llvm::SystemZISD::TM
Definition: SystemZISelLowering.h:68

llvm::Intrinsic::amdgcn_ps_live
Definition: Intrinsics.h:1026

llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:145

llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Definition: IRBuilder.h:127

llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:170

Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:21

llvm::IRBuilderBase::getIntN
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:318

llvm::MetadataAsValue::get
static MetadataAsValue * get(LLVMContext &Context, Metadata *MD)
Definition: Metadata.cpp:106

llvm::AttributeList::FunctionIndex
Definition: Attributes.h:331

llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata *> MDs)
Definition: Metadata.h:1166

runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:66

llvm::BasicBlock::getFirstNonPHI
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:190

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

llvm::Intrinsic::amdgcn_raw_buffer_atomic_sub
Definition: Intrinsics.h:1035

llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:69

llvm::IRBuilderBase::getTrue
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:287

llvm::Intrinsic::amdgcn_mbcnt_hi
Definition: Intrinsics.h:1020

InstVisitor.h

llvm::SIInstrFlags::DPP
Definition: SIDefines.h:43

llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:43

llvm::GCNSubtarget::hasDPP
bool hasDPP() const
Definition: AMDGPUSubtarget.h:796

AMDGPUAS::LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:260

llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:285

llvm::IRBuilder::CreateICmpEQ
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1839

llvm::IRBuilderBase::getInt64
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:312

llvm::IRBuilder::CreateExtractElement
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2041

AMDGPUAS::GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:256

llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1415

llvm::CallingConv::AMDGPU_PS
Calling convention used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:195

INITIALIZE_PASS_END
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
Definition: RegBankSelect.cpp:69

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136

llvm::IRBuilder::CreateMul
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1048

llvm::IRBuilder::CreateTrunc
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1655

llvm::GCNSubtarget
Definition: AMDGPUSubtarget.h:246

llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:51

llvm::IRBuilder::CreatePHI
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:1969

llvm::User::getNumOperands
unsigned getNumOperands() const
Definition: User.h:192

llvm::Function::getCallingConv
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:213

llvm::IRBuilder::CreateIntCast
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:1801

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847

llvm::IRBuilderBase::CreateIntrinsic
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type *> Types, ArrayRef< Value *> Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with args, mangled using Types.
Definition: IRBuilder.cpp:751

llvm::IRBuilder::CreateInsertElement
Value * CreateInsertElement(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2054

llvm::createAMDGPUAtomicOptimizerPass
FunctionPass * createAMDGPUAtomicOptimizerPass()
Definition: AMDGPUAtomicOptimizer.cpp:456

optimizations
AMDGPU atomic optimizations
Definition: AMDGPUAtomicOptimizer.cpp:453

llvm::IRBuilderBase::getInt32
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:307

llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition: User.h:175

llvm::Intrinsic::amdgcn_buffer_atomic_add
Definition: Intrinsics.h:489

llvm::IRBuilderBase::getFalse
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:292

llvm::SplitBlockAndInsertIfThen
Instruction * SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore, bool Unreachable, MDNode *BranchWeights=nullptr, DominatorTree *DT=nullptr, LoopInfo *LI=nullptr)
Split the containing block at the specified instruction - everything before SplitBefore stays in the ...
Definition: BasicBlockUtils.cpp:719

llvm::Intrinsic::amdgcn_set_inactive
Definition: Intrinsics.h:1076

llvm::Intrinsic::amdgcn_buffer_atomic_sub
Definition: Intrinsics.h:495

llvm::AtomicRMWInst::Add
*p = old + v
Definition: Instructions.h:708

DPP_CTRL
DPP_CTRL
Definition: AMDGPUAtomicOptimizer.cpp:31

I
#define I(x, y, z)
Definition: MD5.cpp:58

BasicBlockUtils.h

INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE, "AMDGPU atomic optimizations", false, false) INITIALIZE_PASS_END(AMDGPUAtomicOptimizer

IRBuilder.h

llvm::IRBuilder::Insert
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:794

llvm::MCID::Add
Definition: MCInstrDesc.h:153

llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:566

llvm::Value
LLVM Value Representation.
Definition: Value.h:73

llvm::VectorType::get
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:606

llvm::Instruction::moveBefore
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition: Instruction.cpp:87

llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:242

llvm::Intrinsic::amdgcn_mov_dpp
Definition: Intrinsics.h:1022

llvm::IRBuilder::CreateLShr
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1124

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:59

llvm::LegacyDivergenceAnalysis
Definition: LegacyDivergenceAnalysis.h:27

llvm::Intrinsic::ctpop
Definition: Intrinsics.h:92

llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:260

llvm::Attribute::Convergent
Definition: Attributes.h:84

llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:58

AMDGPU
Definition: AMDGPUPTNote.h:20

llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:44

llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:67