57 #define DEBUG_TYPE "pgo-memop-opt" 59 STATISTIC(NumOfPGOMemOPOpt,
"Number of memop intrinsics optimized.");
60 STATISTIC(NumOfPGOMemOPAnnotate,
"Number of memop intrinsics annotated.");
66 cl::desc(
"The minimum count to optimize memory " 78 cl::desc(
"The percentage threshold for the " 79 "memory intrinsic calls optimization"));
85 cl::desc(
"The max version for the optimized memory " 91 cl::desc(
"Scale the memop size counts using the basic " 92 " block count value"));
109 StringRef getPassName()
const override {
return "PGOMemOPSize"; }
124 "Optimize memory intrinsic using its size value profile",
128 "Optimize memory intrinsic using its
size value
profile",
132 return new PGOMemOPSizeOptLegacyPass();
136 class MemOPSizeOpt :
public InstVisitor<MemOPSizeOpt> {
140 : Func(Func),
BFI(BFI), ORE(ORE), DT(DT), Changed(
false) {
147 bool isChanged()
const {
return Changed; }
152 for (
auto &
MI : WorkList) {
153 ++NumOfPGOMemOPAnnotate;
158 <<
MI->getCalledFunction()->getName()
159 <<
"is Transformed.\n");
167 if (dyn_cast<ConstantInt>(Length))
169 WorkList.push_back(&MI);
178 std::vector<MemIntrinsic *> WorkList;
180 int64_t PreciseRangeStart;
182 int64_t PreciseRangeLast;
184 std::unique_ptr<InstrProfValueData[]> ValueDataArray;
190 enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
192 MemOPSizeKind getMemOPSizeKind(int64_t
Value)
const {
193 if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
195 if (Value == PreciseRangeLast + 1)
196 return NonLargeGroup;
214 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
215 assert(Count <= TotalCount);
223 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
229 return ScaleCount / Denom;
240 ValueDataArray.get(), NumVals, TotalCount))
243 uint64_t ActualCount = TotalCount;
244 uint64_t SavedTotalCount = TotalCount;
246 auto BBEdgeCount =
BFI.getBlockProfileCount(MI->
getParent());
249 ActualCount = *BBEdgeCount;
253 LLVM_DEBUG(
dbgs() <<
"Read one memory intrinsic profile with count " 254 << ActualCount <<
"\n");
257 : VDs) {
dbgs() <<
" (" << VD.Value <<
"," << VD.Count <<
")\n"; });
266 TotalCount = ActualCount;
269 <<
" denominator = " << SavedTotalCount <<
"\n");
272 uint64_t RemainCount = TotalCount;
273 uint64_t SavedRemainCount = SavedTotalCount;
276 uint64_t MaxCount = 0;
280 for (
auto &VD : VDs) {
281 int64_t V = VD.Value;
282 uint64_t
C = VD.Count;
284 C = getScaledCount(C, ActualCount, SavedTotalCount);
287 if (getMemOPSizeKind(V) != PreciseValue)
292 if (!isProfitable(C, RemainCount))
302 assert(SavedRemainCount >= VD.Count);
303 SavedRemainCount -= VD.Count;
312 CaseCounts[0] = RemainCount;
313 if (RemainCount > MaxCount)
314 MaxCount = RemainCount;
316 uint64_t SumForOpt = TotalCount - RemainCount;
318 LLVM_DEBUG(
dbgs() <<
"Optimize one memory intrinsic call to " << Version
319 <<
" Versions (covering " << SumForOpt <<
" out of " 320 << TotalCount <<
")\n");
341 auto OrigBBFreq =
BFI.getBlockFreq(BB);
348 MergeBB->setName(
"MemOP.Merge");
349 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
350 DefaultBB->
setName(
"MemOP.Default");
353 auto &Ctx = Func.getContext();
362 if (SavedRemainCount > 0 || Version != NumVals)
365 SavedRemainCount, IPVK_MemOPSize, NumVals);
369 std::vector<DominatorTree::UpdateType> Updates;
371 Updates.reserve(2 * SizeIds.
size());
373 for (uint64_t SizeId : SizeIds) {
375 Ctx,
Twine(
"MemOP.Case.") +
Twine(SizeId), &Func, DefaultBB);
380 assert(SizeType &&
"Expected integer type size argument.");
386 SI->
addCase(CaseSizeId, CaseBB);
405 <<
"optimized " <<
NV(
"Intrinsic",
StringRef(getMIName(MI)))
406 <<
" with count " <<
NV(
"Count", SumForOpt) <<
" out of " 407 <<
NV(
"Total", TotalCount) <<
" for " <<
NV(
"Versions", Version)
423 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT);
424 MemOPSizeOpt.perform();
425 return MemOPSizeOpt.isChanged();
430 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
431 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
432 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
Legacy wrapper pass to provide the GlobalsAAResult object.
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
void setProfMetadata(Module *M, Instruction *TI, ArrayRef< uint64_t > EdgeCounts, uint64_t MaxCount)
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Base class for instruction visitors.
DiagnosticInfoOptimizationBase::Argument NV
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
This class represents lattice values for constants.
This is the interface for a simple mod/ref and alias analysis over globals.
FunctionPass * createPGOMemOPSizeOptLegacyPass()
void push_back(const T &Elt)
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
cl::opt< unsigned > MemOPSizeLarge
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
STATISTIC(NumFunctions, "Total number of functions")
Analysis pass which computes a DominatorTree.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
static cl::opt< unsigned > MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), cl::Hidden, cl::ZeroOrMore, cl::desc("The percentage threshold for the " "memory intrinsic calls optimization"))
Value * getLength() const
std::enable_if< std::is_unsigned< T >::value, T >::type SaturatingMultiply(T X, T Y, bool *ResultOverflowed=nullptr)
Multiply two unsigned integers, X and Y, of type T.
INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", "Optimize memory intrinsic using its size value profile", false, false) INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, OptimizationRemarkEmitter &ORE, DominatorTree *DT)
Legacy analysis pass which computes BlockFrequencyInfo.
void setName(const Twine &Name)
Change the name of the value.
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following: ...
This file provides the interface for IR based instrumentation passes ( (profile-gen, and profile-use).
Type * getType() const
All values are typed, get the type of this value.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
void getMemOPSizeRangeFromOption(StringRef Str, int64_t &RangeStart, int64_t &RangeLast)
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
static bool runOnFunction(Function &F, bool PostInlining)
initializer< Ty > init(const Ty &Val)
bool getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, InstrProfValueData ValueData[], uint32_t &ActualNumValueData, uint64_t &TotalC)
Extract the value profile data from Inst which is annotated with value profile meta data...
A set of analyses that are preserved following a run of a transformation pass.
static constexpr UpdateKind Insert
LLVM Basic Block Representation.
Represent the analysis usage information of a pass.
Analysis pass providing a never-invalidated alias analysis result.
FunctionPass class - This class is used to implement most global optimizations.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Class to represent integer types.
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static cl::opt< bool > MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, cl::desc("Scale the memop size counts using the basic " " block count value"))
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
const InstListType & getInstList() const
Return the underlying instruction list container.
Analysis pass which computes BlockFrequencyInfo.
This is the common base class for memset/memcpy/memmove.
Iterator for intrusive lists based on ilist_node.
This is the shared class of boolean and integer constants.
auto size(R &&Range, typename std::enable_if< std::is_same< typename std::iterator_traits< decltype(Range.begin())>::iterator_category, std::random_access_iterator_tag >::value, void >::type *=nullptr) -> decltype(std::distance(Range.begin(), Range.end()))
Get the size of a range.
static cl::opt< unsigned > MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, cl::ZeroOrMore, cl::desc("The max version for the optimized memory " " intrinsic calls"))
void applyUpdates(ArrayRef< DominatorTree::UpdateType > Updates, bool ForceRemoveDuplicates=false)
Apply updates on all available trees.
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static cl::opt< bool > DisableMemOPOPT("disable-memop-opt", cl::init(false), cl::Hidden, cl::desc("Disable optimize"))
void push_back(pointer val)
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
pgo instr Read PGO instrumentation profile
cl::opt< std::string > MemOPSizeRange
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
static cl::opt< unsigned > MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, cl::init(1000), cl::desc("The minimum count to optimize memory " "intrinsic calls"))
void initializePGOMemOPSizeOptLegacyPassPass(PassRegistry &)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM Value Representation.
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
BasicBlock * SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Split the specified block at the specified instruction - everything before SplitPt stays in Old and e...
StringRef - Represent a constant reference to a string, i.e.
A container for analyses that lazily runs them and caches their results.
Legacy analysis pass which computes a DominatorTree.
This header defines various interfaces for pass management in LLVM.
const BasicBlock * getParent() const