LLVM  8.0.1
SampleProfile.cpp
Go to the documentation of this file.
1 //===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements the SampleProfileLoader transformation. This pass
11 // reads a profile file generated by a sampling profiler (e.g. Linux Perf -
12 // http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
13 // profile information in the given profile.
14 //
15 // This pass generates branch weight annotations on the IR:
16 //
17 // - prof: Represents branch weights. This annotation is added to branches
18 // to indicate the weights of each edge coming out of the branch.
19 // The weight of each edge is the weight of the target block for
20 // that edge. The weight of a block B is computed as the maximum
21 // number of samples found in B.
22 //
23 //===----------------------------------------------------------------------===//
24 
26 #include "llvm/ADT/ArrayRef.h"
27 #include "llvm/ADT/DenseMap.h"
28 #include "llvm/ADT/DenseSet.h"
29 #include "llvm/ADT/None.h"
30 #include "llvm/ADT/SmallPtrSet.h"
31 #include "llvm/ADT/SmallSet.h"
32 #include "llvm/ADT/SmallVector.h"
33 #include "llvm/ADT/StringMap.h"
34 #include "llvm/ADT/StringRef.h"
35 #include "llvm/ADT/Twine.h"
38 #include "llvm/Analysis/LoopInfo.h"
43 #include "llvm/IR/BasicBlock.h"
44 #include "llvm/IR/CFG.h"
45 #include "llvm/IR/CallSite.h"
47 #include "llvm/IR/DebugLoc.h"
48 #include "llvm/IR/DiagnosticInfo.h"
49 #include "llvm/IR/Dominators.h"
50 #include "llvm/IR/Function.h"
51 #include "llvm/IR/GlobalValue.h"
52 #include "llvm/IR/InstrTypes.h"
53 #include "llvm/IR/Instruction.h"
54 #include "llvm/IR/Instructions.h"
55 #include "llvm/IR/IntrinsicInst.h"
56 #include "llvm/IR/LLVMContext.h"
57 #include "llvm/IR/MDBuilder.h"
58 #include "llvm/IR/Module.h"
59 #include "llvm/IR/PassManager.h"
61 #include "llvm/Pass.h"
65 #include "llvm/Support/Casting.h"
67 #include "llvm/Support/Debug.h"
69 #include "llvm/Support/ErrorOr.h"
72 #include "llvm/Transforms/IPO.h"
76 #include <algorithm>
77 #include <cassert>
78 #include <cstdint>
79 #include <functional>
80 #include <limits>
81 #include <map>
82 #include <memory>
83 #include <string>
84 #include <system_error>
85 #include <utility>
86 #include <vector>
87 
88 using namespace llvm;
89 using namespace sampleprof;
91 #define DEBUG_TYPE "sample-profile"
92 
93 // Command line option to specify the file to read samples from. This is
94 // mainly used for debugging.
96  "sample-profile-file", cl::init(""), cl::value_desc("filename"),
97  cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
98 
99 // The named file contains a set of transformations that may have been applied
100 // to the symbol names between the program from which the sample data was
101 // collected and the current program's symbols.
103  "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
104  cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
105 
107  "sample-profile-max-propagate-iterations", cl::init(100),
108  cl::desc("Maximum number of iterations to go through when propagating "
109  "sample block/edge weights through the CFG."));
110 
112  "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"),
113  cl::desc("Emit a warning if less than N% of records in the input profile "
114  "are matched to the IR."));
115 
117  "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"),
118  cl::desc("Emit a warning if less than N% of samples in the input profile "
119  "are matched to the IR."));
120 
122  "no-warn-sample-unused", cl::init(false), cl::Hidden,
123  cl::desc("Use this option to turn off/on warnings about function with "
124  "samples but without debug information to use those samples. "));
125 
127  "profile-sample-accurate", cl::Hidden, cl::init(false),
128  cl::desc("If the sample profile is accurate, we will mark all un-sampled "
129  "callsite and function as having 0 samples. Otherwise, treat "
130  "un-sampled callsites and functions conservatively as unknown. "));
131 
132 namespace {
133 
134 using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
135 using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
136 using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
137 using EdgeWeightMap = DenseMap<Edge, uint64_t>;
138 using BlockEdgeMap =
140 
141 class SampleCoverageTracker {
142 public:
143  SampleCoverageTracker() = default;
144 
145  bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
146  uint32_t Discriminator, uint64_t Samples);
147  unsigned computeCoverage(unsigned Used, unsigned Total) const;
148  unsigned countUsedRecords(const FunctionSamples *FS,
149  ProfileSummaryInfo *PSI) const;
150  unsigned countBodyRecords(const FunctionSamples *FS,
151  ProfileSummaryInfo *PSI) const;
152  uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
153  uint64_t countBodySamples(const FunctionSamples *FS,
154  ProfileSummaryInfo *PSI) const;
155 
156  void clear() {
157  SampleCoverage.clear();
158  TotalUsedSamples = 0;
159  }
160 
161 private:
162  using BodySampleCoverageMap = std::map<LineLocation, unsigned>;
163  using FunctionSamplesCoverageMap =
165 
166  /// Coverage map for sampling records.
167  ///
168  /// This map keeps a record of sampling records that have been matched to
169  /// an IR instruction. This is used to detect some form of staleness in
170  /// profiles (see flag -sample-profile-check-coverage).
171  ///
172  /// Each entry in the map corresponds to a FunctionSamples instance. This is
173  /// another map that counts how many times the sample record at the
174  /// given location has been used.
175  FunctionSamplesCoverageMap SampleCoverage;
176 
177  /// Number of samples used from the profile.
178  ///
179  /// When a sampling record is used for the first time, the samples from
180  /// that record are added to this accumulator. Coverage is later computed
181  /// based on the total number of samples available in this function and
182  /// its callsites.
183  ///
184  /// Note that this accumulator tracks samples used from a single function
185  /// and all the inlined callsites. Strictly, we should have a map of counters
186  /// keyed by FunctionSamples pointers, but these stats are cleared after
187  /// every function, so we just need to keep a single counter.
188  uint64_t TotalUsedSamples = 0;
189 };
190 
191 /// Sample profile pass.
192 ///
193 /// This pass reads profile data from the file specified by
194 /// -sample-profile-file and annotates every affected function with the
195 /// profile information found in that file.
196 class SampleProfileLoader {
197 public:
198  SampleProfileLoader(
199  StringRef Name, StringRef RemapName, bool IsThinLTOPreLink,
200  std::function<AssumptionCache &(Function &)> GetAssumptionCache,
201  std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo)
202  : GetAC(std::move(GetAssumptionCache)),
203  GetTTI(std::move(GetTargetTransformInfo)), Filename(Name),
204  RemappingFilename(RemapName), IsThinLTOPreLink(IsThinLTOPreLink) {}
205 
206  bool doInitialization(Module &M);
207  bool runOnModule(Module &M, ModuleAnalysisManager *AM,
208  ProfileSummaryInfo *_PSI);
209 
210  void dump() { Reader->dump(); }
211 
212 protected:
214  unsigned getFunctionLoc(Function &F);
215  bool emitAnnotations(Function &F);
216  ErrorOr<uint64_t> getInstWeight(const Instruction &I);
217  ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
218  const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const;
219  std::vector<const FunctionSamples *>
220  findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
221  mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap;
222  const FunctionSamples *findFunctionSamples(const Instruction &I) const;
223  bool inlineCallInstruction(Instruction *I);
224  bool inlineHotFunctions(Function &F,
225  DenseSet<GlobalValue::GUID> &InlinedGUIDs);
226  void printEdgeWeight(raw_ostream &OS, Edge E);
227  void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
228  void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
229  bool computeBlockWeights(Function &F);
230  void findEquivalenceClasses(Function &F);
231  template <bool IsPostDom>
232  void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
234 
235  void propagateWeights(Function &F);
236  uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
237  void buildEdges(Function &F);
238  bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
239  void computeDominanceAndLoopInfo(Function &F);
240  void clearFunctionData();
241 
242  /// Map basic blocks to their computed weights.
243  ///
244  /// The weight of a basic block is defined to be the maximum
245  /// of all the instruction weights in that block.
246  BlockWeightMap BlockWeights;
247 
248  /// Map edges to their computed weights.
249  ///
250  /// Edge weights are computed by propagating basic block weights in
251  /// SampleProfile::propagateWeights.
252  EdgeWeightMap EdgeWeights;
253 
254  /// Set of visited blocks during propagation.
256 
257  /// Set of visited edges during propagation.
258  SmallSet<Edge, 32> VisitedEdges;
259 
260  /// Equivalence classes for block weights.
261  ///
262  /// Two blocks BB1 and BB2 are in the same equivalence class if they
263  /// dominate and post-dominate each other, and they are in the same loop
264  /// nest. When this happens, the two blocks are guaranteed to execute
265  /// the same number of times.
266  EquivalenceClassMap EquivalenceClass;
267 
268  /// Map from function name to Function *. Used to find the function from
269  /// the function name. If the function name contains suffix, additional
270  /// entry is added to map from the stripped name to the function if there
271  /// is one-to-one mapping.
273 
274  /// Dominance, post-dominance and loop information.
275  std::unique_ptr<DominatorTree> DT;
276  std::unique_ptr<PostDominatorTree> PDT;
277  std::unique_ptr<LoopInfo> LI;
278 
279  std::function<AssumptionCache &(Function &)> GetAC;
280  std::function<TargetTransformInfo &(Function &)> GetTTI;
281 
282  /// Predecessors for each basic block in the CFG.
283  BlockEdgeMap Predecessors;
284 
285  /// Successors for each basic block in the CFG.
286  BlockEdgeMap Successors;
287 
288  SampleCoverageTracker CoverageTracker;
289 
290  /// Profile reader object.
291  std::unique_ptr<SampleProfileReader> Reader;
292 
293  /// Samples collected for the body of this function.
294  FunctionSamples *Samples = nullptr;
295 
296  /// Name of the profile file to load.
297  std::string Filename;
298 
299  /// Name of the profile remapping file to load.
300  std::string RemappingFilename;
301 
302  /// Flag indicating whether the profile input loaded successfully.
303  bool ProfileIsValid = false;
304 
305  /// Flag indicating if the pass is invoked in ThinLTO compile phase.
306  ///
307  /// In this phase, in annotation, we should not promote indirect calls.
308  /// Instead, we will mark GUIDs that needs to be annotated to the function.
309  bool IsThinLTOPreLink;
310 
311  /// Profile Summary Info computed from sample profile.
312  ProfileSummaryInfo *PSI = nullptr;
313 
314  /// Total number of samples collected in this profile.
315  ///
316  /// This is the sum of all the samples collected in all the functions executed
317  /// at runtime.
318  uint64_t TotalCollectedSamples = 0;
319 
320  /// Optimization Remark Emitter used to emit diagnostic remarks.
321  OptimizationRemarkEmitter *ORE = nullptr;
322 };
323 
324 class SampleProfileLoaderLegacyPass : public ModulePass {
325 public:
326  // Class identification, replacement for typeinfo
327  static char ID;
328 
329  SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile,
330  bool IsThinLTOPreLink = false)
331  : ModulePass(ID),
332  SampleLoader(Name, SampleProfileRemappingFile, IsThinLTOPreLink,
333  [&](Function &F) -> AssumptionCache & {
334  return ACT->getAssumptionCache(F);
335  },
336  [&](Function &F) -> TargetTransformInfo & {
337  return TTIWP->getTTI(F);
338  }) {
341  }
342 
343  void dump() { SampleLoader.dump(); }
344 
345  bool doInitialization(Module &M) override {
346  return SampleLoader.doInitialization(M);
347  }
348 
349  StringRef getPassName() const override { return "Sample profile pass"; }
350  bool runOnModule(Module &M) override;
351 
352  void getAnalysisUsage(AnalysisUsage &AU) const override {
356  }
357 
358 private:
359  SampleProfileLoader SampleLoader;
360  AssumptionCacheTracker *ACT = nullptr;
361  TargetTransformInfoWrapperPass *TTIWP = nullptr;
362 };
363 
364 } // end anonymous namespace
365 
366 /// Return true if the given callsite is hot wrt to hot cutoff threshold.
367 ///
368 /// Functions that were inlined in the original binary will be represented
369 /// in the inline stack in the sample profile. If the profile shows that
370 /// the original inline decision was "good" (i.e., the callsite is executed
371 /// frequently), then we will recreate the inline decision and apply the
372 /// profile from the inlined callsite.
373 ///
374 /// To decide whether an inlined callsite is hot, we compare the callsite
375 /// sample count with the hot cutoff computed by ProfileSummaryInfo, it is
376 /// regarded as hot if the count is above the cutoff value.
377 static bool callsiteIsHot(const FunctionSamples *CallsiteFS,
378  ProfileSummaryInfo *PSI) {
379  if (!CallsiteFS)
380  return false; // The callsite was not inlined in the original binary.
381 
382  assert(PSI && "PSI is expected to be non null");
383  uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
384  return PSI->isHotCount(CallsiteTotalSamples);
385 }
386 
387 /// Mark as used the sample record for the given function samples at
388 /// (LineOffset, Discriminator).
389 ///
390 /// \returns true if this is the first time we mark the given record.
391 bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS,
392  uint32_t LineOffset,
393  uint32_t Discriminator,
394  uint64_t Samples) {
395  LineLocation Loc(LineOffset, Discriminator);
396  unsigned &Count = SampleCoverage[FS][Loc];
397  bool FirstTime = (++Count == 1);
398  if (FirstTime)
399  TotalUsedSamples += Samples;
400  return FirstTime;
401 }
402 
403 /// Return the number of sample records that were applied from this profile.
404 ///
405 /// This count does not include records from cold inlined callsites.
406 unsigned
407 SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS,
408  ProfileSummaryInfo *PSI) const {
409  auto I = SampleCoverage.find(FS);
410 
411  // The size of the coverage map for FS represents the number of records
412  // that were marked used at least once.
413  unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0;
414 
415  // If there are inlined callsites in this function, count the samples found
416  // in the respective bodies. However, do not bother counting callees with 0
417  // total samples, these are callees that were never invoked at runtime.
418  for (const auto &I : FS->getCallsiteSamples())
419  for (const auto &J : I.second) {
420  const FunctionSamples *CalleeSamples = &J.second;
421  if (callsiteIsHot(CalleeSamples, PSI))
422  Count += countUsedRecords(CalleeSamples, PSI);
423  }
424 
425  return Count;
426 }
427 
428 /// Return the number of sample records in the body of this profile.
429 ///
430 /// This count does not include records from cold inlined callsites.
431 unsigned
432 SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS,
433  ProfileSummaryInfo *PSI) const {
434  unsigned Count = FS->getBodySamples().size();
435 
436  // Only count records in hot callsites.
437  for (const auto &I : FS->getCallsiteSamples())
438  for (const auto &J : I.second) {
439  const FunctionSamples *CalleeSamples = &J.second;
440  if (callsiteIsHot(CalleeSamples, PSI))
441  Count += countBodyRecords(CalleeSamples, PSI);
442  }
443 
444  return Count;
445 }
446 
447 /// Return the number of samples collected in the body of this profile.
448 ///
449 /// This count does not include samples from cold inlined callsites.
450 uint64_t
451 SampleCoverageTracker::countBodySamples(const FunctionSamples *FS,
452  ProfileSummaryInfo *PSI) const {
453  uint64_t Total = 0;
454  for (const auto &I : FS->getBodySamples())
455  Total += I.second.getSamples();
456 
457  // Only count samples in hot callsites.
458  for (const auto &I : FS->getCallsiteSamples())
459  for (const auto &J : I.second) {
460  const FunctionSamples *CalleeSamples = &J.second;
461  if (callsiteIsHot(CalleeSamples, PSI))
462  Total += countBodySamples(CalleeSamples, PSI);
463  }
464 
465  return Total;
466 }
467 
468 /// Return the fraction of sample records used in this profile.
469 ///
470 /// The returned value is an unsigned integer in the range 0-100 indicating
471 /// the percentage of sample records that were used while applying this
472 /// profile to the associated function.
473 unsigned SampleCoverageTracker::computeCoverage(unsigned Used,
474  unsigned Total) const {
475  assert(Used <= Total &&
476  "number of used records cannot exceed the total number of records");
477  return Total > 0 ? Used * 100 / Total : 100;
478 }
479 
480 /// Clear all the per-function data used to load samples and propagate weights.
481 void SampleProfileLoader::clearFunctionData() {
482  BlockWeights.clear();
483  EdgeWeights.clear();
484  VisitedBlocks.clear();
485  VisitedEdges.clear();
486  EquivalenceClass.clear();
487  DT = nullptr;
488  PDT = nullptr;
489  LI = nullptr;
490  Predecessors.clear();
491  Successors.clear();
492  CoverageTracker.clear();
493 }
494 
495 #ifndef NDEBUG
496 /// Print the weight of edge \p E on stream \p OS.
497 ///
498 /// \param OS Stream to emit the output to.
499 /// \param E Edge to print.
500 void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {
501  OS << "weight[" << E.first->getName() << "->" << E.second->getName()
502  << "]: " << EdgeWeights[E] << "\n";
503 }
504 
505 /// Print the equivalence class of block \p BB on stream \p OS.
506 ///
507 /// \param OS Stream to emit the output to.
508 /// \param BB Block to print.
509 void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,
510  const BasicBlock *BB) {
511  const BasicBlock *Equiv = EquivalenceClass[BB];
512  OS << "equivalence[" << BB->getName()
513  << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
514 }
515 
516 /// Print the weight of block \p BB on stream \p OS.
517 ///
518 /// \param OS Stream to emit the output to.
519 /// \param BB Block to print.
520 void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
521  const BasicBlock *BB) const {
522  const auto &I = BlockWeights.find(BB);
523  uint64_t W = (I == BlockWeights.end() ? 0 : I->second);
524  OS << "weight[" << BB->getName() << "]: " << W << "\n";
525 }
526 #endif
527 
528 /// Get the weight for an instruction.
529 ///
530 /// The "weight" of an instruction \p Inst is the number of samples
531 /// collected on that instruction at runtime. To retrieve it, we
532 /// need to compute the line number of \p Inst relative to the start of its
533 /// function. We use HeaderLineno to compute the offset. We then
534 /// look up the samples collected for \p Inst using BodySamples.
535 ///
536 /// \param Inst Instruction to query.
537 ///
538 /// \returns the weight of \p Inst.
539 ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
540  const DebugLoc &DLoc = Inst.getDebugLoc();
541  if (!DLoc)
542  return std::error_code();
543 
544  const FunctionSamples *FS = findFunctionSamples(Inst);
545  if (!FS)
546  return std::error_code();
547 
548  // Ignore all intrinsics, phinodes and branch instructions.
549  // Branch and phinodes instruction usually contains debug info from sources outside of
550  // the residing basic block, thus we ignore them during annotation.
551  if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
552  return std::error_code();
553 
554  // If a direct call/invoke instruction is inlined in profile
555  // (findCalleeFunctionSamples returns non-empty result), but not inlined here,
556  // it means that the inlined callsite has no sample, thus the call
557  // instruction should have 0 count.
558  if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
559  !ImmutableCallSite(&Inst).isIndirectCall() &&
560  findCalleeFunctionSamples(Inst))
561  return 0;
562 
563  const DILocation *DIL = DLoc;
564  uint32_t LineOffset = FunctionSamples::getOffset(DIL);
565  uint32_t Discriminator = DIL->getBaseDiscriminator();
566  ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
567  if (R) {
568  bool FirstMark =
569  CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
570  if (FirstMark) {
571  ORE->emit([&]() {
572  OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst);
573  Remark << "Applied " << ore::NV("NumSamples", *R);
574  Remark << " samples from profile (offset: ";
575  Remark << ore::NV("LineOffset", LineOffset);
576  if (Discriminator) {
577  Remark << ".";
578  Remark << ore::NV("Discriminator", Discriminator);
579  }
580  Remark << ")";
581  return Remark;
582  });
583  }
584  LLVM_DEBUG(dbgs() << " " << DLoc.getLine() << "."
585  << DIL->getBaseDiscriminator() << ":" << Inst
586  << " (line offset: " << LineOffset << "."
587  << DIL->getBaseDiscriminator() << " - weight: " << R.get()
588  << ")\n");
589  }
590  return R;
591 }
592 
593 /// Compute the weight of a basic block.
594 ///
595 /// The weight of basic block \p BB is the maximum weight of all the
596 /// instructions in BB.
597 ///
598 /// \param BB The basic block to query.
599 ///
600 /// \returns the weight for \p BB.
601 ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
602  uint64_t Max = 0;
603  bool HasWeight = false;
604  for (auto &I : BB->getInstList()) {
605  const ErrorOr<uint64_t> &R = getInstWeight(I);
606  if (R) {
607  Max = std::max(Max, R.get());
608  HasWeight = true;
609  }
610  }
611  return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code();
612 }
613 
614 /// Compute and store the weights of every basic block.
615 ///
616 /// This populates the BlockWeights map by computing
617 /// the weights of every basic block in the CFG.
618 ///
619 /// \param F The function to query.
620 bool SampleProfileLoader::computeBlockWeights(Function &F) {
621  bool Changed = false;
622  LLVM_DEBUG(dbgs() << "Block weights\n");
623  for (const auto &BB : F) {
624  ErrorOr<uint64_t> Weight = getBlockWeight(&BB);
625  if (Weight) {
626  BlockWeights[&BB] = Weight.get();
627  VisitedBlocks.insert(&BB);
628  Changed = true;
629  }
630  LLVM_DEBUG(printBlockWeight(dbgs(), &BB));
631  }
632 
633  return Changed;
634 }
635 
636 /// Get the FunctionSamples for a call instruction.
637 ///
638 /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
639 /// instance in which that call instruction is calling to. It contains
640 /// all samples that resides in the inlined instance. We first find the
641 /// inlined instance in which the call instruction is from, then we
642 /// traverse its children to find the callsite with the matching
643 /// location.
644 ///
645 /// \param Inst Call/Invoke instruction to query.
646 ///
647 /// \returns The FunctionSamples pointer to the inlined instance.
648 const FunctionSamples *
649 SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
650  const DILocation *DIL = Inst.getDebugLoc();
651  if (!DIL) {
652  return nullptr;
653  }
654 
655  StringRef CalleeName;
656  if (const CallInst *CI = dyn_cast<CallInst>(&Inst))
657  if (Function *Callee = CI->getCalledFunction())
658  CalleeName = Callee->getName();
659 
660  const FunctionSamples *FS = findFunctionSamples(Inst);
661  if (FS == nullptr)
662  return nullptr;
663 
665  DIL->getBaseDiscriminator()),
666  CalleeName);
667 }
668 
669 /// Returns a vector of FunctionSamples that are the indirect call targets
670 /// of \p Inst. The vector is sorted by the total number of samples. Stores
671 /// the total call count of the indirect call in \p Sum.
672 std::vector<const FunctionSamples *>
673 SampleProfileLoader::findIndirectCallFunctionSamples(
674  const Instruction &Inst, uint64_t &Sum) const {
675  const DILocation *DIL = Inst.getDebugLoc();
676  std::vector<const FunctionSamples *> R;
677 
678  if (!DIL) {
679  return R;
680  }
681 
682  const FunctionSamples *FS = findFunctionSamples(Inst);
683  if (FS == nullptr)
684  return R;
685 
686  uint32_t LineOffset = FunctionSamples::getOffset(DIL);
687  uint32_t Discriminator = DIL->getBaseDiscriminator();
688 
689  auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
690  Sum = 0;
691  if (T)
692  for (const auto &T_C : T.get())
693  Sum += T_C.second;
696  if (M->empty())
697  return R;
698  for (const auto &NameFS : *M) {
699  Sum += NameFS.second.getEntrySamples();
700  R.push_back(&NameFS.second);
701  }
702  llvm::sort(R, [](const FunctionSamples *L, const FunctionSamples *R) {
703  if (L->getEntrySamples() != R->getEntrySamples())
704  return L->getEntrySamples() > R->getEntrySamples();
705  return FunctionSamples::getGUID(L->getName()) <
707  });
708  }
709  return R;
710 }
711 
712 /// Get the FunctionSamples for an instruction.
713 ///
714 /// The FunctionSamples of an instruction \p Inst is the inlined instance
715 /// in which that instruction is coming from. We traverse the inline stack
716 /// of that instruction, and match it with the tree nodes in the profile.
717 ///
718 /// \param Inst Instruction to query.
719 ///
720 /// \returns the FunctionSamples pointer to the inlined instance.
721 const FunctionSamples *
722 SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
723  const DILocation *DIL = Inst.getDebugLoc();
724  if (!DIL)
725  return Samples;
726 
727  auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
728  if (it.second)
729  it.first->second = Samples->findFunctionSamples(DIL);
730  return it.first->second;
731 }
732 
733 bool SampleProfileLoader::inlineCallInstruction(Instruction *I) {
734  assert(isa<CallInst>(I) || isa<InvokeInst>(I));
735  CallSite CS(I);
736  Function *CalledFunction = CS.getCalledFunction();
737  assert(CalledFunction);
738  DebugLoc DLoc = I->getDebugLoc();
739  BasicBlock *BB = I->getParent();
740  InlineParams Params = getInlineParams();
741  Params.ComputeFullInlineCost = true;
742  // Checks if there is anything in the reachable portion of the callee at
743  // this callsite that makes this inlining potentially illegal. Need to
744  // set ComputeFullInlineCost, otherwise getInlineCost may return early
745  // when cost exceeds threshold without checking all IRs in the callee.
746  // The acutal cost does not matter because we only checks isNever() to
747  // see if it is legal to inline the callsite.
748  InlineCost Cost = getInlineCost(CS, Params, GetTTI(*CalledFunction), GetAC,
749  None, nullptr, nullptr);
750  if (Cost.isNever()) {
751  ORE->emit(OptimizationRemark(DEBUG_TYPE, "Not inline", DLoc, BB)
752  << "incompatible inlining");
753  return false;
754  }
755  InlineFunctionInfo IFI(nullptr, &GetAC);
756  if (InlineFunction(CS, IFI)) {
757  // The call to InlineFunction erases I, so we can't pass it here.
758  ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB)
759  << "inlined hot callee '" << ore::NV("Callee", CalledFunction)
760  << "' into '" << ore::NV("Caller", BB->getParent()) << "'");
761  return true;
762  }
763  return false;
764 }
765 
766 /// Iteratively inline hot callsites of a function.
767 ///
768 /// Iteratively traverse all callsites of the function \p F, and find if
769 /// the corresponding inlined instance exists and is hot in profile. If
770 /// it is hot enough, inline the callsites and adds new callsites of the
771 /// callee into the caller. If the call is an indirect call, first promote
772 /// it to direct call. Each indirect call is limited with a single target.
773 ///
774 /// \param F function to perform iterative inlining.
775 /// \param InlinedGUIDs a set to be updated to include all GUIDs that are
776 /// inlined in the profiled binary.
777 ///
778 /// \returns True if there is any inline happened.
779 bool SampleProfileLoader::inlineHotFunctions(
780  Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
781  DenseSet<Instruction *> PromotedInsns;
782  bool Changed = false;
783  while (true) {
784  bool LocalChanged = false;
786  for (auto &BB : F) {
787  bool Hot = false;
789  for (auto &I : BB.getInstList()) {
790  const FunctionSamples *FS = nullptr;
791  if ((isa<CallInst>(I) || isa<InvokeInst>(I)) &&
792  !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) {
793  Candidates.push_back(&I);
794  if (callsiteIsHot(FS, PSI))
795  Hot = true;
796  }
797  }
798  if (Hot) {
799  CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end());
800  }
801  }
802  for (auto I : CIS) {
803  Function *CalledFunction = CallSite(I).getCalledFunction();
804  // Do not inline recursive calls.
805  if (CalledFunction == &F)
806  continue;
807  if (CallSite(I).isIndirectCall()) {
808  if (PromotedInsns.count(I))
809  continue;
810  uint64_t Sum;
811  for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
812  if (IsThinLTOPreLink) {
813  FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
815  continue;
816  }
817  auto CalleeFunctionName = FS->getFuncNameInModule(F.getParent());
818  // If it is a recursive call, we do not inline it as it could bloat
819  // the code exponentially. There is way to better handle this, e.g.
820  // clone the caller first, and inline the cloned caller if it is
821  // recursive. As llvm does not inline recursive calls, we will
822  // simply ignore it instead of handling it explicitly.
823  if (CalleeFunctionName == F.getName())
824  continue;
825 
826  const char *Reason = "Callee function not available";
827  auto R = SymbolMap.find(CalleeFunctionName);
828  if (R != SymbolMap.end() && R->getValue() &&
829  !R->getValue()->isDeclaration() &&
830  R->getValue()->getSubprogram() &&
831  isLegalToPromote(CallSite(I), R->getValue(), &Reason)) {
832  uint64_t C = FS->getEntrySamples();
833  Instruction *DI =
834  pgo::promoteIndirectCall(I, R->getValue(), C, Sum, false, ORE);
835  Sum -= C;
836  PromotedInsns.insert(I);
837  // If profile mismatches, we should not attempt to inline DI.
838  if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) &&
839  inlineCallInstruction(DI))
840  LocalChanged = true;
841  } else {
842  LLVM_DEBUG(dbgs()
843  << "\nFailed to promote indirect call to "
844  << CalleeFunctionName << " because " << Reason << "\n");
845  }
846  }
847  } else if (CalledFunction && CalledFunction->getSubprogram() &&
848  !CalledFunction->isDeclaration()) {
849  if (inlineCallInstruction(I))
850  LocalChanged = true;
851  } else if (IsThinLTOPreLink) {
852  findCalleeFunctionSamples(*I)->findInlinedFunctions(
853  InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold());
854  }
855  }
856  if (LocalChanged) {
857  Changed = true;
858  } else {
859  break;
860  }
861  }
862  return Changed;
863 }
864 
865 /// Find equivalence classes for the given block.
866 ///
867 /// This finds all the blocks that are guaranteed to execute the same
868 /// number of times as \p BB1. To do this, it traverses all the
869 /// descendants of \p BB1 in the dominator or post-dominator tree.
870 ///
871 /// A block BB2 will be in the same equivalence class as \p BB1 if
872 /// the following holds:
873 ///
874 /// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2
875 /// is a descendant of \p BB1 in the dominator tree, then BB2 should
876 /// dominate BB1 in the post-dominator tree.
877 ///
878 /// 2- Both BB2 and \p BB1 must be in the same loop.
879 ///
880 /// For every block BB2 that meets those two requirements, we set BB2's
881 /// equivalence class to \p BB1.
882 ///
883 /// \param BB1 Block to check.
884 /// \param Descendants Descendants of \p BB1 in either the dom or pdom tree.
885 /// \param DomTree Opposite dominator tree. If \p Descendants is filled
886 /// with blocks from \p BB1's dominator tree, then
887 /// this is the post-dominator tree, and vice versa.
888 template <bool IsPostDom>
889 void SampleProfileLoader::findEquivalencesFor(
890  BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
892  const BasicBlock *EC = EquivalenceClass[BB1];
893  uint64_t Weight = BlockWeights[EC];
894  for (const auto *BB2 : Descendants) {
895  bool IsDomParent = DomTree->dominates(BB2, BB1);
896  bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
897  if (BB1 != BB2 && IsDomParent && IsInSameLoop) {
898  EquivalenceClass[BB2] = EC;
899  // If BB2 is visited, then the entire EC should be marked as visited.
900  if (VisitedBlocks.count(BB2)) {
901  VisitedBlocks.insert(EC);
902  }
903 
904  // If BB2 is heavier than BB1, make BB2 have the same weight
905  // as BB1.
906  //
907  // Note that we don't worry about the opposite situation here
908  // (when BB2 is lighter than BB1). We will deal with this
909  // during the propagation phase. Right now, we just want to
910  // make sure that BB1 has the largest weight of all the
911  // members of its equivalence set.
912  Weight = std::max(Weight, BlockWeights[BB2]);
913  }
914  }
915  if (EC == &EC->getParent()->getEntryBlock()) {
916  BlockWeights[EC] = Samples->getHeadSamples() + 1;
917  } else {
918  BlockWeights[EC] = Weight;
919  }
920 }
921 
922 /// Find equivalence classes.
923 ///
924 /// Since samples may be missing from blocks, we can fill in the gaps by setting
925 /// the weights of all the blocks in the same equivalence class to the same
926 /// weight. To compute the concept of equivalence, we use dominance and loop
927 /// information. Two blocks B1 and B2 are in the same equivalence class if B1
928 /// dominates B2, B2 post-dominates B1 and both are in the same loop.
929 ///
930 /// \param F The function to query.
931 void SampleProfileLoader::findEquivalenceClasses(Function &F) {
932  SmallVector<BasicBlock *, 8> DominatedBBs;
933  LLVM_DEBUG(dbgs() << "\nBlock equivalence classes\n");
934  // Find equivalence sets based on dominance and post-dominance information.
935  for (auto &BB : F) {
936  BasicBlock *BB1 = &BB;
937 
938  // Compute BB1's equivalence class once.
939  if (EquivalenceClass.count(BB1)) {
940  LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
941  continue;
942  }
943 
944  // By default, blocks are in their own equivalence class.
945  EquivalenceClass[BB1] = BB1;
946 
947  // Traverse all the blocks dominated by BB1. We are looking for
948  // every basic block BB2 such that:
949  //
950  // 1- BB1 dominates BB2.
951  // 2- BB2 post-dominates BB1.
952  // 3- BB1 and BB2 are in the same loop nest.
953  //
954  // If all those conditions hold, it means that BB2 is executed
955  // as many times as BB1, so they are placed in the same equivalence
956  // class by making BB2's equivalence class be BB1.
957  DominatedBBs.clear();
958  DT->getDescendants(BB1, DominatedBBs);
959  findEquivalencesFor(BB1, DominatedBBs, PDT.get());
960 
961  LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
962  }
963 
964  // Assign weights to equivalence classes.
965  //
966  // All the basic blocks in the same equivalence class will execute
967  // the same number of times. Since we know that the head block in
968  // each equivalence class has the largest weight, assign that weight
969  // to all the blocks in that equivalence class.
970  LLVM_DEBUG(
971  dbgs() << "\nAssign the same weight to all blocks in the same class\n");
972  for (auto &BI : F) {
973  const BasicBlock *BB = &BI;
974  const BasicBlock *EquivBB = EquivalenceClass[BB];
975  if (BB != EquivBB)
976  BlockWeights[BB] = BlockWeights[EquivBB];
977  LLVM_DEBUG(printBlockWeight(dbgs(), BB));
978  }
979 }
980 
981 /// Visit the given edge to decide if it has a valid weight.
982 ///
983 /// If \p E has not been visited before, we copy to \p UnknownEdge
984 /// and increment the count of unknown edges.
985 ///
986 /// \param E Edge to visit.
987 /// \param NumUnknownEdges Current number of unknown edges.
988 /// \param UnknownEdge Set if E has not been visited before.
989 ///
990 /// \returns E's weight, if known. Otherwise, return 0.
991 uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
992  Edge *UnknownEdge) {
993  if (!VisitedEdges.count(E)) {
994  (*NumUnknownEdges)++;
995  *UnknownEdge = E;
996  return 0;
997  }
998 
999  return EdgeWeights[E];
1000 }
1001 
1002 /// Propagate weights through incoming/outgoing edges.
1003 ///
1004 /// If the weight of a basic block is known, and there is only one edge
1005 /// with an unknown weight, we can calculate the weight of that edge.
1006 ///
1007 /// Similarly, if all the edges have a known count, we can calculate the
1008 /// count of the basic block, if needed.
1009 ///
1010 /// \param F Function to process.
1011 /// \param UpdateBlockCount Whether we should update basic block counts that
1012 /// has already been annotated.
1013 ///
1014 /// \returns True if new weights were assigned to edges or blocks.
1015 bool SampleProfileLoader::propagateThroughEdges(Function &F,
1016  bool UpdateBlockCount) {
1017  bool Changed = false;
1018  LLVM_DEBUG(dbgs() << "\nPropagation through edges\n");
1019  for (const auto &BI : F) {
1020  const BasicBlock *BB = &BI;
1021  const BasicBlock *EC = EquivalenceClass[BB];
1022 
1023  // Visit all the predecessor and successor edges to determine
1024  // which ones have a weight assigned already. Note that it doesn't
1025  // matter that we only keep track of a single unknown edge. The
1026  // only case we are interested in handling is when only a single
1027  // edge is unknown (see setEdgeOrBlockWeight).
1028  for (unsigned i = 0; i < 2; i++) {
1029  uint64_t TotalWeight = 0;
1030  unsigned NumUnknownEdges = 0, NumTotalEdges = 0;
1031  Edge UnknownEdge, SelfReferentialEdge, SingleEdge;
1032 
1033  if (i == 0) {
1034  // First, visit all predecessor edges.
1035  NumTotalEdges = Predecessors[BB].size();
1036  for (auto *Pred : Predecessors[BB]) {
1037  Edge E = std::make_pair(Pred, BB);
1038  TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
1039  if (E.first == E.second)
1040  SelfReferentialEdge = E;
1041  }
1042  if (NumTotalEdges == 1) {
1043  SingleEdge = std::make_pair(Predecessors[BB][0], BB);
1044  }
1045  } else {
1046  // On the second round, visit all successor edges.
1047  NumTotalEdges = Successors[BB].size();
1048  for (auto *Succ : Successors[BB]) {
1049  Edge E = std::make_pair(BB, Succ);
1050  TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
1051  }
1052  if (NumTotalEdges == 1) {
1053  SingleEdge = std::make_pair(BB, Successors[BB][0]);
1054  }
1055  }
1056 
1057  // After visiting all the edges, there are three cases that we
1058  // can handle immediately:
1059  //
1060  // - All the edge weights are known (i.e., NumUnknownEdges == 0).
1061  // In this case, we simply check that the sum of all the edges
1062  // is the same as BB's weight. If not, we change BB's weight
1063  // to match. Additionally, if BB had not been visited before,
1064  // we mark it visited.
1065  //
1066  // - Only one edge is unknown and BB has already been visited.
1067  // In this case, we can compute the weight of the edge by
1068  // subtracting the total block weight from all the known
1069  // edge weights. If the edges weight more than BB, then the
1070  // edge of the last remaining edge is set to zero.
1071  //
1072  // - There exists a self-referential edge and the weight of BB is
1073  // known. In this case, this edge can be based on BB's weight.
1074  // We add up all the other known edges and set the weight on
1075  // the self-referential edge as we did in the previous case.
1076  //
1077  // In any other case, we must continue iterating. Eventually,
1078  // all edges will get a weight, or iteration will stop when
1079  // it reaches SampleProfileMaxPropagateIterations.
1080  if (NumUnknownEdges <= 1) {
1081  uint64_t &BBWeight = BlockWeights[EC];
1082  if (NumUnknownEdges == 0) {
1083  if (!VisitedBlocks.count(EC)) {
1084  // If we already know the weight of all edges, the weight of the
1085  // basic block can be computed. It should be no larger than the sum
1086  // of all edge weights.
1087  if (TotalWeight > BBWeight) {
1088  BBWeight = TotalWeight;
1089  Changed = true;
1090  LLVM_DEBUG(dbgs() << "All edge weights for " << BB->getName()
1091  << " known. Set weight for block: ";
1092  printBlockWeight(dbgs(), BB););
1093  }
1094  } else if (NumTotalEdges == 1 &&
1095  EdgeWeights[SingleEdge] < BlockWeights[EC]) {
1096  // If there is only one edge for the visited basic block, use the
1097  // block weight to adjust edge weight if edge weight is smaller.
1098  EdgeWeights[SingleEdge] = BlockWeights[EC];
1099  Changed = true;
1100  }
1101  } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) {
1102  // If there is a single unknown edge and the block has been
1103  // visited, then we can compute E's weight.
1104  if (BBWeight >= TotalWeight)
1105  EdgeWeights[UnknownEdge] = BBWeight - TotalWeight;
1106  else
1107  EdgeWeights[UnknownEdge] = 0;
1108  const BasicBlock *OtherEC;
1109  if (i == 0)
1110  OtherEC = EquivalenceClass[UnknownEdge.first];
1111  else
1112  OtherEC = EquivalenceClass[UnknownEdge.second];
1113  // Edge weights should never exceed the BB weights it connects.
1114  if (VisitedBlocks.count(OtherEC) &&
1115  EdgeWeights[UnknownEdge] > BlockWeights[OtherEC])
1116  EdgeWeights[UnknownEdge] = BlockWeights[OtherEC];
1117  VisitedEdges.insert(UnknownEdge);
1118  Changed = true;
1119  LLVM_DEBUG(dbgs() << "Set weight for edge: ";
1120  printEdgeWeight(dbgs(), UnknownEdge));
1121  }
1122  } else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) {
1123  // If a block Weights 0, all its in/out edges should weight 0.
1124  if (i == 0) {
1125  for (auto *Pred : Predecessors[BB]) {
1126  Edge E = std::make_pair(Pred, BB);
1127  EdgeWeights[E] = 0;
1128  VisitedEdges.insert(E);
1129  }
1130  } else {
1131  for (auto *Succ : Successors[BB]) {
1132  Edge E = std::make_pair(BB, Succ);
1133  EdgeWeights[E] = 0;
1134  VisitedEdges.insert(E);
1135  }
1136  }
1137  } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) {
1138  uint64_t &BBWeight = BlockWeights[BB];
1139  // We have a self-referential edge and the weight of BB is known.
1140  if (BBWeight >= TotalWeight)
1141  EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight;
1142  else
1143  EdgeWeights[SelfReferentialEdge] = 0;
1144  VisitedEdges.insert(SelfReferentialEdge);
1145  Changed = true;
1146  LLVM_DEBUG(dbgs() << "Set self-referential edge weight to: ";
1147  printEdgeWeight(dbgs(), SelfReferentialEdge));
1148  }
1149  if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) {
1150  BlockWeights[EC] = TotalWeight;
1151  VisitedBlocks.insert(EC);
1152  Changed = true;
1153  }
1154  }
1155  }
1156 
1157  return Changed;
1158 }
1159 
1160 /// Build in/out edge lists for each basic block in the CFG.
1161 ///
1162 /// We are interested in unique edges. If a block B1 has multiple
1163 /// edges to another block B2, we only add a single B1->B2 edge.
1164 void SampleProfileLoader::buildEdges(Function &F) {
1165  for (auto &BI : F) {
1166  BasicBlock *B1 = &BI;
1167 
1168  // Add predecessors for B1.
1170  if (!Predecessors[B1].empty())
1171  llvm_unreachable("Found a stale predecessors list in a basic block.");
1172  for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) {
1173  BasicBlock *B2 = *PI;
1174  if (Visited.insert(B2).second)
1175  Predecessors[B1].push_back(B2);
1176  }
1177 
1178  // Add successors for B1.
1179  Visited.clear();
1180  if (!Successors[B1].empty())
1181  llvm_unreachable("Found a stale successors list in a basic block.");
1182  for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) {
1183  BasicBlock *B2 = *SI;
1184  if (Visited.insert(B2).second)
1185  Successors[B1].push_back(B2);
1186  }
1187  }
1188 }
1189 
1190 /// Returns the sorted CallTargetMap \p M by count in descending order.
1192  const SampleRecord::CallTargetMap &M) {
1194  for (auto I = M.begin(); I != M.end(); ++I)
1195  R.push_back({FunctionSamples::getGUID(I->getKey()), I->getValue()});
1196  llvm::sort(R, [](const InstrProfValueData &L, const InstrProfValueData &R) {
1197  if (L.Count == R.Count)
1198  return L.Value > R.Value;
1199  else
1200  return L.Count > R.Count;
1201  });
1202  return R;
1203 }
1204 
1205 /// Propagate weights into edges
1206 ///
1207 /// The following rules are applied to every block BB in the CFG:
1208 ///
1209 /// - If BB has a single predecessor/successor, then the weight
1210 /// of that edge is the weight of the block.
1211 ///
1212 /// - If all incoming or outgoing edges are known except one, and the
1213 /// weight of the block is already known, the weight of the unknown
1214 /// edge will be the weight of the block minus the sum of all the known
1215 /// edges. If the sum of all the known edges is larger than BB's weight,
1216 /// we set the unknown edge weight to zero.
1217 ///
1218 /// - If there is a self-referential edge, and the weight of the block is
1219 /// known, the weight for that edge is set to the weight of the block
1220 /// minus the weight of the other incoming edges to that block (if
1221 /// known).
1222 void SampleProfileLoader::propagateWeights(Function &F) {
1223  bool Changed = true;
1224  unsigned I = 0;
1225 
1226  // If BB weight is larger than its corresponding loop's header BB weight,
1227  // use the BB weight to replace the loop header BB weight.
1228  for (auto &BI : F) {
1229  BasicBlock *BB = &BI;
1230  Loop *L = LI->getLoopFor(BB);
1231  if (!L) {
1232  continue;
1233  }
1234  BasicBlock *Header = L->getHeader();
1235  if (Header && BlockWeights[BB] > BlockWeights[Header]) {
1236  BlockWeights[Header] = BlockWeights[BB];
1237  }
1238  }
1239 
1240  // Before propagation starts, build, for each block, a list of
1241  // unique predecessors and successors. This is necessary to handle
1242  // identical edges in multiway branches. Since we visit all blocks and all
1243  // edges of the CFG, it is cleaner to build these lists once at the start
1244  // of the pass.
1245  buildEdges(F);
1246 
1247  // Propagate until we converge or we go past the iteration limit.
1248  while (Changed && I++ < SampleProfileMaxPropagateIterations) {
1249  Changed = propagateThroughEdges(F, false);
1250  }
1251 
1252  // The first propagation propagates BB counts from annotated BBs to unknown
1253  // BBs. The 2nd propagation pass resets edges weights, and use all BB weights
1254  // to propagate edge weights.
1255  VisitedEdges.clear();
1256  Changed = true;
1257  while (Changed && I++ < SampleProfileMaxPropagateIterations) {
1258  Changed = propagateThroughEdges(F, false);
1259  }
1260 
1261  // The 3rd propagation pass allows adjust annotated BB weights that are
1262  // obviously wrong.
1263  Changed = true;
1264  while (Changed && I++ < SampleProfileMaxPropagateIterations) {
1265  Changed = propagateThroughEdges(F, true);
1266  }
1267 
1268  // Generate MD_prof metadata for every branch instruction using the
1269  // edge weights computed during propagation.
1270  LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
1271  LLVMContext &Ctx = F.getContext();
1272  MDBuilder MDB(Ctx);
1273  for (auto &BI : F) {
1274  BasicBlock *BB = &BI;
1275 
1276  if (BlockWeights[BB]) {
1277  for (auto &I : BB->getInstList()) {
1278  if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
1279  continue;
1280  CallSite CS(&I);
1281  if (!CS.getCalledFunction()) {
1282  const DebugLoc &DLoc = I.getDebugLoc();
1283  if (!DLoc)
1284  continue;
1285  const DILocation *DIL = DLoc;
1286  uint32_t LineOffset = FunctionSamples::getOffset(DIL);
1287  uint32_t Discriminator = DIL->getBaseDiscriminator();
1288 
1289  const FunctionSamples *FS = findFunctionSamples(I);
1290  if (!FS)
1291  continue;
1292  auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
1293  if (!T || T.get().empty())
1294  continue;
1295  SmallVector<InstrProfValueData, 2> SortedCallTargets =
1296  SortCallTargets(T.get());
1297  uint64_t Sum;
1298  findIndirectCallFunctionSamples(I, Sum);
1299  annotateValueSite(*I.getParent()->getParent()->getParent(), I,
1300  SortedCallTargets, Sum, IPVK_IndirectCallTarget,
1301  SortedCallTargets.size());
1302  } else if (!dyn_cast<IntrinsicInst>(&I)) {
1303  SmallVector<uint32_t, 1> Weights;
1304  Weights.push_back(BlockWeights[BB]);
1305  I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
1306  }
1307  }
1308  }
1309  Instruction *TI = BB->getTerminator();
1310  if (TI->getNumSuccessors() == 1)
1311  continue;
1312  if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
1313  continue;
1314 
1315  DebugLoc BranchLoc = TI->getDebugLoc();
1316  LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
1317  << ((BranchLoc) ? Twine(BranchLoc.getLine())
1318  : Twine("<UNKNOWN LOCATION>"))
1319  << ".\n");
1320  SmallVector<uint32_t, 4> Weights;
1321  uint32_t MaxWeight = 0;
1322  Instruction *MaxDestInst;
1323  for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1324  BasicBlock *Succ = TI->getSuccessor(I);
1325  Edge E = std::make_pair(BB, Succ);
1326  uint64_t Weight = EdgeWeights[E];
1327  LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
1328  // Use uint32_t saturated arithmetic to adjust the incoming weights,
1329  // if needed. Sample counts in profiles are 64-bit unsigned values,
1330  // but internally branch weights are expressed as 32-bit values.
1331  if (Weight > std::numeric_limits<uint32_t>::max()) {
1332  LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
1334  }
1335  // Weight is added by one to avoid propagation errors introduced by
1336  // 0 weights.
1337  Weights.push_back(static_cast<uint32_t>(Weight + 1));
1338  if (Weight != 0) {
1339  if (Weight > MaxWeight) {
1340  MaxWeight = Weight;
1341  MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
1342  }
1343  }
1344  }
1345 
1346  uint64_t TempWeight;
1347  // Only set weights if there is at least one non-zero weight.
1348  // In any other case, let the analyzer set weights.
1349  // Do not set weights if the weights are present. In ThinLTO, the profile
1350  // annotation is done twice. If the first annotation already set the
1351  // weights, the second pass does not need to set it.
1352  if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
1353  LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1355  MDB.createBranchWeights(Weights));
1356  ORE->emit([&]() {
1357  return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
1358  << "most popular destination for conditional branches at "
1359  << ore::NV("CondBranchesLoc", BranchLoc);
1360  });
1361  } else {
1362  LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1363  }
1364  }
1365 }
1366 
1367 /// Get the line number for the function header.
1368 ///
1369 /// This looks up function \p F in the current compilation unit and
1370 /// retrieves the line number where the function is defined. This is
1371 /// line 0 for all the samples read from the profile file. Every line
1372 /// number is relative to this line.
1373 ///
1374 /// \param F Function object to query.
1375 ///
1376 /// \returns the line number where \p F is defined. If it returns 0,
1377 /// it means that there is no debug information available for \p F.
1378 unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
1379  if (DISubprogram *S = F.getSubprogram())
1380  return S->getLine();
1381 
1382  if (NoWarnSampleUnused)
1383  return 0;
1384 
1385  // If the start of \p F is missing, emit a diagnostic to inform the user
1386  // about the missed opportunity.
1388  "No debug information found in function " + F.getName() +
1389  ": Function profile not used",
1390  DS_Warning));
1391  return 0;
1392 }
1393 
1394 void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) {
1395  DT.reset(new DominatorTree);
1396  DT->recalculate(F);
1397 
1398  PDT.reset(new PostDominatorTree(F));
1399 
1400  LI.reset(new LoopInfo);
1401  LI->analyze(*DT);
1402 }
1403 
1404 /// Generate branch weight metadata for all branches in \p F.
1405 ///
1406 /// Branch weights are computed out of instruction samples using a
1407 /// propagation heuristic. Propagation proceeds in 3 phases:
1408 ///
1409 /// 1- Assignment of block weights. All the basic blocks in the function
1410 /// are initial assigned the same weight as their most frequently
1411 /// executed instruction.
1412 ///
1413 /// 2- Creation of equivalence classes. Since samples may be missing from
1414 /// blocks, we can fill in the gaps by setting the weights of all the
1415 /// blocks in the same equivalence class to the same weight. To compute
1416 /// the concept of equivalence, we use dominance and loop information.
1417 /// Two blocks B1 and B2 are in the same equivalence class if B1
1418 /// dominates B2, B2 post-dominates B1 and both are in the same loop.
1419 ///
1420 /// 3- Propagation of block weights into edges. This uses a simple
1421 /// propagation heuristic. The following rules are applied to every
1422 /// block BB in the CFG:
1423 ///
1424 /// - If BB has a single predecessor/successor, then the weight
1425 /// of that edge is the weight of the block.
1426 ///
1427 /// - If all the edges are known except one, and the weight of the
1428 /// block is already known, the weight of the unknown edge will
1429 /// be the weight of the block minus the sum of all the known
1430 /// edges. If the sum of all the known edges is larger than BB's weight,
1431 /// we set the unknown edge weight to zero.
1432 ///
1433 /// - If there is a self-referential edge, and the weight of the block is
1434 /// known, the weight for that edge is set to the weight of the block
1435 /// minus the weight of the other incoming edges to that block (if
1436 /// known).
1437 ///
1438 /// Since this propagation is not guaranteed to finalize for every CFG, we
1439 /// only allow it to proceed for a limited number of iterations (controlled
1440 /// by -sample-profile-max-propagate-iterations).
1441 ///
1442 /// FIXME: Try to replace this propagation heuristic with a scheme
1443 /// that is guaranteed to finalize. A work-list approach similar to
1444 /// the standard value propagation algorithm used by SSA-CCP might
1445 /// work here.
1446 ///
1447 /// Once all the branch weights are computed, we emit the MD_prof
1448 /// metadata on BB using the computed values for each of its branches.
1449 ///
1450 /// \param F The function to query.
1451 ///
1452 /// \returns true if \p F was modified. Returns false, otherwise.
1453 bool SampleProfileLoader::emitAnnotations(Function &F) {
1454  bool Changed = false;
1455 
1456  if (getFunctionLoc(F) == 0)
1457  return false;
1458 
1459  LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
1460  << F.getName() << ": " << getFunctionLoc(F) << "\n");
1461 
1462  DenseSet<GlobalValue::GUID> InlinedGUIDs;
1463  Changed |= inlineHotFunctions(F, InlinedGUIDs);
1464 
1465  // Compute basic block weights.
1466  Changed |= computeBlockWeights(F);
1467 
1468  if (Changed) {
1469  // Add an entry count to the function using the samples gathered at the
1470  // function entry.
1471  // Sets the GUIDs that are inlined in the profiled binary. This is used
1472  // for ThinLink to make correct liveness analysis, and also make the IR
1473  // match the profiled binary before annotation.
1474  F.setEntryCount(
1475  ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
1476  &InlinedGUIDs);
1477 
1478  // Compute dominance and loop info needed for propagation.
1479  computeDominanceAndLoopInfo(F);
1480 
1481  // Find equivalence classes.
1482  findEquivalenceClasses(F);
1483 
1484  // Propagate weights to all edges.
1485  propagateWeights(F);
1486  }
1487 
1488  // If coverage checking was requested, compute it now.
1490  unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI);
1491  unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI);
1492  unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
1493  if (Coverage < SampleProfileRecordCoverage) {
1495  F.getSubprogram()->getFilename(), getFunctionLoc(F),
1496  Twine(Used) + " of " + Twine(Total) + " available profile records (" +
1497  Twine(Coverage) + "%) were applied",
1498  DS_Warning));
1499  }
1500  }
1501 
1503  uint64_t Used = CoverageTracker.getTotalUsedSamples();
1504  uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI);
1505  unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
1506  if (Coverage < SampleProfileSampleCoverage) {
1508  F.getSubprogram()->getFilename(), getFunctionLoc(F),
1509  Twine(Used) + " of " + Twine(Total) + " available profile samples (" +
1510  Twine(Coverage) + "%) were applied",
1511  DS_Warning));
1512  }
1513  }
1514  return Changed;
1515 }
1516 
1518 
1519 INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
1520  "Sample Profile loader", false, false)
1524 INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
1525  "Sample Profile loader", false, false)
1526 
1527 bool SampleProfileLoader::doInitialization(Module &M) {
1528  auto &Ctx = M.getContext();
1529  auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx);
1530  if (std::error_code EC = ReaderOrErr.getError()) {
1531  std::string Msg = "Could not open profile: " + EC.message();
1532  Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1533  return false;
1534  }
1535  Reader = std::move(ReaderOrErr.get());
1536  Reader->collectFuncsToUse(M);
1537  ProfileIsValid = (Reader->read() == sampleprof_error::success);
1538 
1539  if (!RemappingFilename.empty()) {
1540  // Apply profile remappings to the loaded profile data if requested.
1541  // For now, we only support remapping symbols encoded using the Itanium
1542  // C++ ABI's name mangling scheme.
1544  RemappingFilename, Ctx, std::move(Reader));
1545  if (std::error_code EC = ReaderOrErr.getError()) {
1546  std::string Msg = "Could not open profile remapping file: " + EC.message();
1547  Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1548  return false;
1549  }
1550  Reader = std::move(ReaderOrErr.get());
1551  ProfileIsValid = (Reader->read() == sampleprof_error::success);
1552  }
1553  return true;
1554 }
1555 
1557  return new SampleProfileLoaderLegacyPass();
1558 }
1559 
1561  return new SampleProfileLoaderLegacyPass(Name);
1562 }
1563 
1564 bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
1565  ProfileSummaryInfo *_PSI) {
1567  if (!ProfileIsValid)
1568  return false;
1569 
1570  PSI = _PSI;
1571  if (M.getProfileSummary() == nullptr)
1572  M.setProfileSummary(Reader->getSummary().getMD(M.getContext()));
1573 
1574  // Compute the total number of samples collected in this profile.
1575  for (const auto &I : Reader->getProfiles())
1576  TotalCollectedSamples += I.second.getTotalSamples();
1577 
1578  // Populate the symbol map.
1579  for (const auto &N_F : M.getValueSymbolTable()) {
1580  StringRef OrigName = N_F.getKey();
1581  Function *F = dyn_cast<Function>(N_F.getValue());
1582  if (F == nullptr)
1583  continue;
1584  SymbolMap[OrigName] = F;
1585  auto pos = OrigName.find('.');
1586  if (pos != StringRef::npos) {
1587  StringRef NewName = OrigName.substr(0, pos);
1588  auto r = SymbolMap.insert(std::make_pair(NewName, F));
1589  // Failiing to insert means there is already an entry in SymbolMap,
1590  // thus there are multiple functions that are mapped to the same
1591  // stripped name. In this case of name conflicting, set the value
1592  // to nullptr to avoid confusion.
1593  if (!r.second)
1594  r.first->second = nullptr;
1595  }
1596  }
1597 
1598  bool retval = false;
1599  for (auto &F : M)
1600  if (!F.isDeclaration()) {
1601  clearFunctionData();
1602  retval |= runOnFunction(F, AM);
1603  }
1604  return retval;
1605 }
1606 
1607 bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
1608  ACT = &getAnalysis<AssumptionCacheTracker>();
1609  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
1610  ProfileSummaryInfo *PSI =
1611  &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1612  return SampleLoader.runOnModule(M, nullptr, PSI);
1613 }
1614 
1616 
1617  DILocation2SampleMap.clear();
1618  // By default the entry count is initialized to -1, which will be treated
1619  // conservatively by getEntryCount as the same as unknown (None). This is
1620  // to avoid newly added code to be treated as cold. If we have samples
1621  // this will be overwritten in emitAnnotations.
1622  // If ProfileSampleAccurate is true or F has profile-sample-accurate
1623  // attribute, initialize the entry count to 0 so callsites or functions
1624  // unsampled will be treated as cold.
1625  uint64_t initialEntryCount =
1626  (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate"))
1627  ? 0
1628  : -1;
1629  F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
1630  std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
1631  if (AM) {
1632  auto &FAM =
1634  .getManager();
1635  ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
1636  } else {
1637  OwnedORE = make_unique<OptimizationRemarkEmitter>(&F);
1638  ORE = OwnedORE.get();
1639  }
1640  Samples = Reader->getSamplesFor(F);
1641  if (Samples && !Samples->empty())
1642  return emitAnnotations(F);
1643  return false;
1644 }
1645 
1647  ModuleAnalysisManager &AM) {
1649  AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1650 
1651  auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
1652  return FAM.getResult<AssumptionAnalysis>(F);
1653  };
1654  auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
1655  return FAM.getResult<TargetIRAnalysis>(F);
1656  };
1657 
1658  SampleProfileLoader SampleLoader(
1659  ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
1660  ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
1661  : ProfileRemappingFileName,
1662  IsThinLTOPreLink, GetAssumptionCache, GetTTI);
1663 
1664  SampleLoader.doInitialization(M);
1665 
1667  if (!SampleLoader.runOnModule(M, &AM, PSI))
1668  return PreservedAnalyses::all();
1669 
1670  return PreservedAnalyses::none();
1671 }
const NoneType None
Definition: None.h:24
uint64_t CallInst * C
const FunctionSamplesMap * findFunctionSamplesMapAt(const LineLocation &Loc) const
Returns the FunctionSamplesMap at the given Loc.
Definition: SampleProf.h:284
static uint64_t getGUID(StringRef Name)
Definition: SampleProf.h:490
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:154
Represents either an error or a value T.
Definition: ErrorOr.h:57
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
DiagnosticInfoOptimizationBase::Argument NV
bool isNever() const
Definition: InlineCost.h:105
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:770
This class represents lattice values for constants.
Definition: AllocatorList.h:24
sample profile
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:65
BasicBlock * getSuccessor(unsigned Idx) const
Return the specified successor. This instruction must be a terminator.
Implements a dense probed hash-table based set.
Definition: DenseSet.h:250
const ValueSymbolTable & getValueSymbolTable() const
Get the symbol table of global variable and function identifiers.
Definition: Module.h:565
Analysis providing profile information.
This class represents a function call, abstracting a target machine&#39;s calling convention.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
Analysis pass providing the TargetTransformInfo.
bool isLegalToPromote(CallSite CS, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.h:321
static ErrorOr< std::unique_ptr< SampleProfileReader > > create(const Twine &Filename, LLVMContext &C)
Create a sample profile reader appropriate to the file format.
unsigned getLine() const
Definition: DebugLoc.cpp:26
ErrorOr< uint64_t > findSamplesAt(uint32_t LineOffset, uint32_t Discriminator) const
Return the number of samples collected at the given location.
Definition: SampleProf.h:257
A debug info location.
Definition: DebugLoc.h:34
F(f)
uint64_t getOrCompHotCountThreshold()
Returns HotCountThreshold if set.
InlineResult InlineFunction(CallInst *C, InlineFunctionInfo &IFI, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true)
This function inlines the called function into the basic block of the caller.
void findInlinedFunctions(DenseSet< GlobalValue::GUID > &S, const Module *M, uint64_t Threshold) const
Recursively traverses all children, if the total sample count of the corresponding function is no les...
Definition: SampleProf.h:383
static SmallVector< InstrProfValueData, 2 > SortCallTargets(const SampleRecord::CallTargetMap &M)
Returns the sorted CallTargetMap M by count in descending order.
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:174
bool isHotCount(uint64_t C)
Returns true if count C is considered hot.
Represents the cost of inlining a function.
Definition: InlineCost.h:64
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
Representation of the samples collected for a function.
Definition: SampleProf.h:217
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:221
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:51
amdgpu Simplify well known AMD library false Value Value const Twine & Name
Definition: BitVector.h:938
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
void setEntryCount(ProfileCount Count, const DenseSet< GlobalValue::GUID > *Imports=nullptr)
Set the entry count for this function.
Definition: Function.cpp:1363
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
static cl::opt< std::string > SampleProfileRemappingFile("sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden)
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:244
Diagnostic information for optimization analysis remarks.
static StringRef getName(Value *V)
Metadata * getProfileSummary()
Returns profile summary metadata.
Definition: Module.cpp:540
StringRef getFilename() const
BlockT * getHeader() const
Definition: LoopInfo.h:100
Interval::succ_iterator succ_begin(Interval *I)
succ_begin/succ_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:103
Subprogram description.
const Instruction * getFirstNonPHIOrDbgOrLifetime() const
Returns a pointer to the first instruction in this block that is not a PHINode, a debug intrinsic...
Definition: BasicBlock.cpp:204
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
bool extractProfTotalWeight(uint64_t &TotalVal) const
Retrieve total raw weight values of a branch.
Definition: Metadata.cpp:1340
static cl::opt< std::string > SampleProfileFile("sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden)
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:598
Debug location.
ModulePass * createSampleProfileLoaderPass()
Instruction * promoteIndirectCall(Instruction *Inst, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:145
amdgpu Simplify well known AMD library false Value * Callee
unsigned getNumSuccessors() const
Return the number of successors that this instruction has.
static cl::opt< unsigned > SampleProfileRecordCoverage("sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"), cl::desc("Emit a warning if less than N% of records in the input profile " "are matched to the IR."))
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:157
Interval::succ_iterator succ_end(Interval *I)
Definition: Interval.h:106
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:176
Core dominator tree base class.
Definition: LoopInfo.h:61
const BasicBlock & getEntryBlock() const
Definition: Function.h:640
static bool runOnFunction(Function &F, bool PostInlining)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:423
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Wrapper pass for TargetTransformInfo.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:154
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:69
static cl::opt< unsigned > SampleProfileMaxPropagateIterations("sample-profile-max-propagate-iterations", cl::init(100), cl::desc("Maximum number of iterations to go through when propagating " "sample block/edge weights through the CFG."))
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1508
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
const BodySampleMap & getBodySamples() const
Return all the samples collected in the body of the function.
Definition: SampleProf.h:351
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Definition: SmallVector.h:129
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:371
Diagnostic information for applied optimization remarks.
Interval::pred_iterator pred_begin(Interval *I)
pred_begin/pred_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:113
Function::ProfileCount ProfileCount
Represent the analysis usage information of a pass.
Interval::pred_iterator pred_end(Interval *I)
Definition: Interval.h:116
DenseMap< SymbolStringPtr, JITEvaluatedSymbol > SymbolMap
A map from symbol names (as SymbolStringPtrs) to JITSymbols (address/flags pairs).
Definition: Core.h:48
void initializeSampleProfileLoaderLegacyPassPass(PassRegistry &)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function. ...
Definition: Function.cpp:193
Used in the streaming interface as the general argument type.
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
Definition: InstrProf.cpp:813
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:160
Class to represent profile counts.
Definition: Function.h:261
size_t size() const
Definition: SmallVector.h:53
MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight)
Return metadata containing two branch weights.
Definition: MDBuilder.cpp:38
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1226
StringRef getFuncNameInModule(const Module *M) const
Return the original function name if it exists in Module M.
Definition: SampleProf.h:410
InlineCost getInlineCost(CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI, std::function< AssumptionCache &(Function &)> &GetAssumptionCache, Optional< function_ref< BlockFrequencyInfo &(Function &)>> GetBFI, ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
Optional< bool > ComputeFullInlineCost
Compute inline cost even when the cost has exceeded the threshold.
Definition: InlineCost.h:181
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1116
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
Definition: STLExtras.h:210
const FunctionSamples * findFunctionSamplesAt(const LineLocation &Loc, StringRef CalleeName) const
Returns a pointer to FunctionSamples at the given callsite location Loc with callee CalleeName...
Definition: SampleProf.h:295
A function analysis which provides an AssumptionCache.
const InstListType & getInstList() const
Return the underlying instruction list container.
Definition: BasicBlock.h:334
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:418
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options...
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847
Module.h This file contains the declarations for the Module class.
static unsigned getOffset(const DILocation *DIL)
Returns the line offset to the start line of the subprogram.
Definition: SampleProf.cpp:165
static cl::opt< bool > ProfileSampleAccurate("profile-sample-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. "))
ErrorOr< SampleRecord::CallTargetMap > findCallTargetMapAt(uint32_t LineOffset, uint32_t Discriminator) const
Returns the call target map collected at a given location.
Definition: SampleProf.h:270
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:133
StringMap - This is an unconventional map that is specialized for handling keys that are "strings"...
Definition: StringMap.h:220
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:212
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:478
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void setProfileSummary(Metadata *M)
Attach profile summary metadata to this module.
Definition: Module.cpp:536
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Definition: SmallVector.h:133
unsigned getBaseDiscriminator() const
Returns the base discriminator stored in the discriminator.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:311
#define DEBUG_TYPE
static const size_t npos
Definition: StringRef.h:51
iterator begin()
Definition: StringMap.h:315
Represents the relative location of an instruction.
Definition: SampleProf.h:118
static cl::opt< unsigned > SampleProfileSampleCoverage("sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"), cl::desc("Emit a warning if less than N% of samples in the input profile " "are matched to the IR."))
static bool callsiteIsHot(const FunctionSamples *CallsiteFS, ProfileSummaryInfo *PSI)
Return true if the given callsite is hot wrt to hot cutoff threshold.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:465
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
Establish a view to a call site for examination.
Definition: CallSite.h:711
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
#define I(x, y, z)
Definition: MD5.cpp:58
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:225
iterator end()
Definition: DenseMap.h:109
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
std::map< std::string, FunctionSamples > FunctionSamplesMap
Definition: SampleProf.h:209
StringRef getName() const
Return the function name.
Definition: SampleProf.h:407
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:92
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile", "Sample Profile loader", false, false) INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:206
Provides ErrorOr<T> smart pointer.
FunTy * getCalledFunction() const
Return the function being called if this is a direct call, otherwise return null (if it&#39;s an indirect...
Definition: CallSite.h:107
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:566
uint64_t getTotalSamples() const
Return the total number of samples collected inside the function.
Definition: SampleProf.h:321
sample Sample Profile loader
const CallsiteSampleMap & getCallsiteSamples() const
Return all the callsite samples collected in the body of the function.
Definition: SampleProf.h:354
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:46
print Print MemDeps of function
This file defines a set of templates that efficiently compute a dominator tree over a generic graph...
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
A container for analyses that lazily runs them and caches their results.
This pass exposes codegen information to IR-level passes.
uint64_t getEntrySamples() const
Return the sample count of the first instruction of the function.
Definition: SampleProf.h:332
static cl::opt< bool > NoWarnSampleUnused("no-warn-sample-unused", cl::init(false), cl::Hidden, cl::desc("Use this option to turn off/on warnings about function with " "samples but without debug information to use those samples. "))
This header defines various interfaces for pass management in LLVM.
Diagnostic information for the sample profiler.
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE size_t find(char C, size_t From=0) const
Search for the first character C in the string.
Definition: StringRef.h:298
#define LLVM_DEBUG(X)
Definition: Debug.h:123
The optimization diagnostic interface.
iterator end()
Definition: StringMap.h:318
This file provides the interface for the sampled PGO loader pass.
reference get()
Definition: ErrorOr.h:157
const BasicBlock * getParent() const
Definition: Instruction.h:67
static ErrorOr< std::unique_ptr< SampleProfileReader > > create(const Twine &Filename, LLVMContext &C, std::unique_ptr< SampleProfileReader > Underlying)
Create a remapped sample profile from the given remapping file and underlying samples.
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:1038