59 #include <forward_list> 66 #define LLE_OPTION "loop-load-elim" 67 #define DEBUG_TYPE LLE_OPTION 70 "runtime-check-per-loop-load-elim",
cl::Hidden,
71 cl::desc(
"Max number of memchecks allowed per eliminated load on average"),
76 cl::desc(
"The maximum number of SCEV checks allowed for Loop " 79 STATISTIC(NumLoopLoadEliminted,
"Number of loads eliminated by LLE");
84 struct StoreToLoadForwardingCandidate {
89 : Load(Load), Store(Store) {}
103 "Should be a known dependence");
115 auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.
getSCEV(LoadPtr));
116 auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.
getSCEV(StorePtr));
120 auto *Dist = cast<SCEVConstant>(
122 const APInt &Val = Dist->getAPInt();
123 return Val == TypeByteSize;
130 const StoreToLoadForwardingCandidate &Cand) {
131 OS << *Cand.Store <<
" -->\n";
132 OS.
indent(2) << *Cand.Load <<
"\n";
159 class LoadEliminationForLoop {
163 : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.
getPSE()) {}
170 std::forward_list<StoreToLoadForwardingCandidate>
172 std::forward_list<StoreToLoadForwardingCandidate> Candidates;
184 for (
const auto &Dep : *Deps) {
186 Instruction *Destination = Dep.getDestination(LAI);
189 if (isa<LoadInst>(Source))
190 LoadsWithUnknownDepedence.
insert(Source);
191 if (isa<LoadInst>(Destination))
192 LoadsWithUnknownDepedence.
insert(Destination);
196 if (Dep.isBackward())
202 assert(Dep.isForward() &&
"Needs to be a forward dependence");
215 Candidates.emplace_front(Load, Store);
218 if (!LoadsWithUnknownDepedence.
empty())
219 Candidates.remove_if([&](
const StoreToLoadForwardingCandidate &
C) {
220 return LoadsWithUnknownDepedence.count(C.Load);
228 auto I = InstOrder.find(Inst);
229 assert(
I != InstOrder.end() &&
"No index for instruction");
252 void removeDependencesFromMultipleStores(
253 std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
256 using LoadToSingleCandT =
258 LoadToSingleCandT LoadToSingleCand;
260 for (
const auto &Cand : Candidates) {
262 LoadToSingleCandT::iterator Iter;
264 std::tie(Iter, NewElt) =
265 LoadToSingleCand.
insert(std::make_pair(Cand.Load, &Cand));
267 const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
269 if (OtherCand ==
nullptr)
275 if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
276 Cand.isDependenceDistanceOfOne(PSE, L) &&
277 OtherCand->isDependenceDistanceOfOne(PSE, L)) {
279 if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
286 Candidates.remove_if([&](
const StoreToLoadForwardingCandidate &Cand) {
287 if (LoadToSingleCand[Cand.Load] != &Cand) {
289 dbgs() <<
"Removing from candidates: \n" 291 <<
" The load may have multiple stores forwarding to " 304 bool needsChecking(
unsigned PtrIdx1,
unsigned PtrIdx2,
306 const std::set<Value *> &CandLoadPtrs) {
311 return ((PtrsWrittenOnFwdingPath.
count(Ptr1) && CandLoadPtrs.count(Ptr2)) ||
312 (PtrsWrittenOnFwdingPath.
count(Ptr2) && CandLoadPtrs.count(Ptr1)));
339 std::max_element(Candidates.
begin(), Candidates.
end(),
340 [&](
const StoreToLoadForwardingCandidate &A,
341 const StoreToLoadForwardingCandidate &
B) {
342 return getInstrIndex(A.Load) < getInstrIndex(
B.Load);
346 std::min_element(Candidates.
begin(), Candidates.
end(),
347 [&](
const StoreToLoadForwardingCandidate &A,
348 const StoreToLoadForwardingCandidate &
B) {
349 return getInstrIndex(A.Store) <
350 getInstrIndex(
B.Store);
360 if (
auto *S = dyn_cast<StoreInst>(
I))
361 PtrsWrittenOnFwdingPath.
insert(S->getPointerOperand());
364 std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
365 MemInstrs.end(), InsertStorePtr);
366 std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
369 return PtrsWrittenOnFwdingPath;
378 findPointersWrittenOnForwardingPath(Candidates);
382 std::set<Value *> CandLoadPtrs;
384 std::inserter(CandLoadPtrs, CandLoadPtrs.begin()),
385 std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr));
390 copy_if(AllChecks, std::back_inserter(Checks),
392 for (
auto PtrIdx1 : Check.first->Members)
393 for (
auto PtrIdx2 : Check.second->Members)
394 if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
409 propagateStoredValueToLoadUsers(
const StoreToLoadForwardingCandidate &Cand,
426 Value *Ptr = Cand.Load->getPointerOperand();
427 auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.
getSCEV(Ptr));
430 PH->getTerminator());
432 new LoadInst(InitialPtr,
"load_initial",
false,
433 Cand.Load->getAlignment(), PH->getTerminator());
438 PHI->addIncoming(Cand.Store->getOperand(0), L->
getLoopLatch());
440 Cand.Load->replaceAllUsesWith(PHI);
447 <<
"\" checking " << *L <<
"\n");
468 auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
469 if (StoreToLoadDependences.empty())
478 removeDependencesFromMultipleStores(StoreToLoadDependences);
479 if (StoreToLoadDependences.empty())
484 unsigned NumForwarding = 0;
485 for (
const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) {
501 if (!Cand.isDependenceDistanceOfOne(PSE, L))
508 <<
". Valid store-to-load forwarding across the loop backedge\n");
511 if (Candidates.
empty())
517 collectMemchecks(Candidates);
534 dbgs() <<
"Versioning is needed but not allowed when optimizing " 557 for (
const auto &Cand : Candidates)
558 propagateStoredValueToLoadUsers(Cand, SEE);
559 NumLoopLoadEliminted += NumForwarding;
590 for (
Loop *TopLevelLoop : LI)
597 bool Changed =
false;
598 for (
Loop *L : Worklist) {
600 LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT);
601 Changed |= LEL.processLoop();
622 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
623 auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
624 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
648 static const char LLE_name[] =
"Loop Load Elimination";
659 return new LoopLoadElimination();
676 SE, TLI, TTI,
nullptr};
Legacy wrapper pass to provide the GlobalsAAResult object.
static bool Check(DecodeStatus &Out, DecodeStatus In)
TrackingVH< Value > PointerValue
Holds the pointer value that we need to check.
void printChecks(raw_ostream &OS, const SmallVectorImpl< PointerCheck > &Checks, unsigned Depth=0) const
Print Checks.
static bool eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT, function_ref< const LoopAccessInfo &(Loop &)> GetLAI)
const MemoryDepChecker & getDepChecker() const
the Memory Dependence Checker which can determine the loop-independent and loop-carried dependences b...
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
DenseMap< Instruction *, unsigned > generateInstructionOrderMap() const
Generate a mapping between the memory instructions and their indices according to program order...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
This class represents lattice values for constants.
This is the interface for a simple mod/ref and alias analysis over globals.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly...
void push_back(const T &Elt)
static cl::opt< unsigned > LoadElimSCEVCheckThreshold("loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed for Loop " "Load Elimination"))
const RuntimePointerChecking * getRuntimePointerChecking() const
int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of its element size.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
An efficient, type-erasing, non-owning reference to a callable.
Analysis pass providing the TargetTransformInfo.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
STATISTIC(NumFunctions, "Total number of functions")
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Analysis pass which computes a DominatorTree.
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
An instruction for reading from memory.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
AnalysisUsage & addRequired()
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
#define INITIALIZE_PASS_DEPENDENCY(depName)
Type * getPointerElementType() const
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
static bool isLoadConditional(LoadInst *Load, Loop *L)
Return true if the load is not executed on all paths in the loop.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
static const char LLE_name[]
void getLoopLatches(SmallVectorImpl< BlockT *> &LoopLatches) const
Return all loop latch blocks of this loop.
Analysis pass that exposes the LoopInfo for a function.
This file implements a class to represent arbitrary precision integral constant values and operations...
BlockT * getHeader() const
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
const SmallVectorImpl< Instruction * > & getMemoryInstructions() const
The vector of memory access instructions.
Type * getType() const
All values are typed, get the type of this value.
This header provides classes for managing per-loop analyses.
void initializeLoopLoadEliminationPass(PassRegistry &)
An instruction for storing to memory.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static bool runOnFunction(Function &F, bool PostInlining)
initializer< Ty > init(const Ty &Val)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
A set of analyses that are preserved following a run of a transformation pass.
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
LLVM Basic Block Representation.
The instances of the Type class are immutable: once they are created, they are never changed...
This analysis provides dependence information for the memory accesses of a loop.
LLVM_NODISCARD bool empty() const
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
const Instruction & front() const
A manager for alias analyses.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Represent the analysis usage information of a pass.
bool optForSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Value * expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I)
Insert code to directly compute the specified SCEV expression into the program.
FunctionPass class - This class is used to implement most global optimizations.
Value * getPointerOperand()
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS. Minus is represented in SCEV as A+B*-1.
void setAliasChecks(SmallVector< RuntimePointerChecking::PointerCheck, 4 > Checks)
Sets the runtime alias checks for versioning the loop.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
std::pair< const CheckingPtrGroup *, const CheckingPtrGroup * > PointerCheck
A memcheck which made up of a pair of grouped pointers.
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
A function analysis which provides an AssumptionCache.
const SmallVectorImpl< Dependence > * getDependences() const
Returns the memory dependences.
This header defines the LoopLoadEliminationPass object.
bool isAlwaysTrue() const override
Implementation of the SCEVPredicate interface.
const SCEVUnionPredicate & getUnionPredicate() const
AnalysisUsage & addRequiredID(const void *ID)
const PointerInfo & getPointerInfo(unsigned PtrIdx) const
Return PointerInfo for pointer at index PtrIdx.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Module.h This file contains the declarations for the Module class.
FunctionPass * createLoopLoadEliminationPass()
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Drive the analysis of memory accesses in the loop.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Class for arbitrary precision integers.
This class uses information about analyze scalars to rewrite expressions in canonical form...
Analysis pass that exposes the ScalarEvolution for a function.
unsigned getComplexity() const override
We estimate the complexity of a union predicate as the size number of predicates in the union...
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
bool isLoopSimplifyForm() const
Return true if the Loop is in the form that the LoopSimplify form transforms loops to...
This analysis provides dependence information for the memory accesses of a loop.
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Type * getPointerOperandType() const
LLVM_NODISCARD bool empty() const
Represents a single loop in the control flow graph.
StringRef getName() const
Return a constant reference to the value's name.
const Function * getParent() const
Return the enclosing method, or null if none.
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
raw_ostream & operator<<(raw_ostream &OS, const APInt &I)
OutputIt transform(R &&Range, OutputIt d_first, UnaryPredicate P)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere...
Analysis pass providing the TargetLibraryInfo.
iterator_range< df_iterator< T > > depth_first(const T &G)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static cl::opt< unsigned > CheckPerElim("runtime-check-per-loop-load-elim", cl::Hidden, cl::desc("Max number of memchecks allowed per eliminated load on average"), cl::init(1))
LLVM Value Representation.
This class implements an extremely fast bulk output stream that can only output to a stream...
The legacy pass manager's analysis pass to compute loop information.
A container for analyses that lazily runs them and caches their results.
Legacy analysis pass which computes a DominatorTree.
UnaryPredicate for_each(R &&Range, UnaryPredicate P)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly...
This header defines various interfaces for pass management in LLVM.
const SmallVector< PointerCheck, 4 > & getChecks() const
Returns the checks that generateChecks created.
static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, DominatorTree *DT)
Check if the store dominates all latches, so as long as there is no intervening store this value will...
for(unsigned i=Desc.getNumOperands(), e=OldMI.getNumOperands();i !=e;++i)
Value * getPointerOperand()
const BasicBlock * getParent() const
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...