/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Transforms/IPO/SampleProfile.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // This file implements the SampleProfileLoader transformation. This pass |
11 | | // reads a profile file generated by a sampling profiler (e.g. Linux Perf - |
12 | | // http://perf.wiki.kernel.org/) and generates IR metadata to reflect the |
13 | | // profile information in the given profile. |
14 | | // |
15 | | // This pass generates branch weight annotations on the IR: |
16 | | // |
17 | | // - prof: Represents branch weights. This annotation is added to branches |
18 | | // to indicate the weights of each edge coming out of the branch. |
19 | | // The weight of each edge is the weight of the target block for |
20 | | // that edge. The weight of a block B is computed as the maximum |
21 | | // number of samples found in B. |
22 | | // |
23 | | //===----------------------------------------------------------------------===// |
24 | | |
25 | | #include "llvm/Transforms/SampleProfile.h" |
26 | | #include "llvm/ADT/DenseMap.h" |
27 | | #include "llvm/ADT/SmallPtrSet.h" |
28 | | #include "llvm/ADT/SmallSet.h" |
29 | | #include "llvm/ADT/StringRef.h" |
30 | | #include "llvm/Analysis/AssumptionCache.h" |
31 | | #include "llvm/Analysis/InlineCost.h" |
32 | | #include "llvm/Analysis/LoopInfo.h" |
33 | | #include "llvm/Analysis/OptimizationDiagnosticInfo.h" |
34 | | #include "llvm/Analysis/PostDominators.h" |
35 | | #include "llvm/Analysis/TargetTransformInfo.h" |
36 | | #include "llvm/IR/Constants.h" |
37 | | #include "llvm/IR/DebugInfo.h" |
38 | | #include "llvm/IR/DiagnosticInfo.h" |
39 | | #include "llvm/IR/Dominators.h" |
40 | | #include "llvm/IR/Function.h" |
41 | | #include "llvm/IR/GlobalValue.h" |
42 | | #include "llvm/IR/InstIterator.h" |
43 | | #include "llvm/IR/Instructions.h" |
44 | | #include "llvm/IR/IntrinsicInst.h" |
45 | | #include "llvm/IR/LLVMContext.h" |
46 | | #include "llvm/IR/MDBuilder.h" |
47 | | #include "llvm/IR/Metadata.h" |
48 | | #include "llvm/IR/Module.h" |
49 | | #include "llvm/IR/ValueSymbolTable.h" |
50 | | #include "llvm/Pass.h" |
51 | | #include "llvm/ProfileData/InstrProf.h" |
52 | | #include "llvm/ProfileData/SampleProfReader.h" |
53 | | #include "llvm/Support/CommandLine.h" |
54 | | #include "llvm/Support/Debug.h" |
55 | | #include "llvm/Support/ErrorOr.h" |
56 | | #include "llvm/Support/Format.h" |
57 | | #include "llvm/Support/raw_ostream.h" |
58 | | #include "llvm/Transforms/IPO.h" |
59 | | #include "llvm/Transforms/Instrumentation.h" |
60 | | #include "llvm/Transforms/Utils/Cloning.h" |
61 | | #include <cctype> |
62 | | |
63 | | using namespace llvm; |
64 | | using namespace sampleprof; |
65 | | |
66 | 262 | #define DEBUG_TYPE "sample-profile" |
67 | | |
68 | | // Command line option to specify the file to read samples from. This is |
69 | | // mainly used for debugging. |
70 | | static cl::opt<std::string> SampleProfileFile( |
71 | | "sample-profile-file", cl::init(""), cl::value_desc("filename"), |
72 | | cl::desc("Profile file loaded by -sample-profile"), cl::Hidden); |
73 | | static cl::opt<unsigned> SampleProfileMaxPropagateIterations( |
74 | | "sample-profile-max-propagate-iterations", cl::init(100), |
75 | | cl::desc("Maximum number of iterations to go through when propagating " |
76 | | "sample block/edge weights through the CFG.")); |
77 | | static cl::opt<unsigned> SampleProfileRecordCoverage( |
78 | | "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"), |
79 | | cl::desc("Emit a warning if less than N% of records in the input profile " |
80 | | "are matched to the IR.")); |
81 | | static cl::opt<unsigned> SampleProfileSampleCoverage( |
82 | | "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"), |
83 | | cl::desc("Emit a warning if less than N% of samples in the input profile " |
84 | | "are matched to the IR.")); |
85 | | static cl::opt<double> SampleProfileHotThreshold( |
86 | | "sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"), |
87 | | cl::desc("Inlined functions that account for more than N% of all samples " |
88 | | "collected in the parent function, will be inlined again.")); |
89 | | |
90 | | namespace { |
91 | | typedef DenseMap<const BasicBlock *, uint64_t> BlockWeightMap; |
92 | | typedef DenseMap<const BasicBlock *, const BasicBlock *> EquivalenceClassMap; |
93 | | typedef std::pair<const BasicBlock *, const BasicBlock *> Edge; |
94 | | typedef DenseMap<Edge, uint64_t> EdgeWeightMap; |
95 | | typedef DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>> |
96 | | BlockEdgeMap; |
97 | | |
98 | | class SampleCoverageTracker { |
99 | | public: |
100 | 71 | SampleCoverageTracker() : SampleCoverage(), TotalUsedSamples(0) {} |
101 | | |
102 | | bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset, |
103 | | uint32_t Discriminator, uint64_t Samples); |
104 | | unsigned computeCoverage(unsigned Used, unsigned Total) const; |
105 | | unsigned countUsedRecords(const FunctionSamples *FS) const; |
106 | | unsigned countBodyRecords(const FunctionSamples *FS) const; |
107 | 4 | uint64_t getTotalUsedSamples() const { return TotalUsedSamples; } |
108 | | uint64_t countBodySamples(const FunctionSamples *FS) const; |
109 | 119 | void clear() { |
110 | 119 | SampleCoverage.clear(); |
111 | 119 | TotalUsedSamples = 0; |
112 | 119 | } |
113 | | |
114 | | private: |
115 | | typedef std::map<LineLocation, unsigned> BodySampleCoverageMap; |
116 | | typedef DenseMap<const FunctionSamples *, BodySampleCoverageMap> |
117 | | FunctionSamplesCoverageMap; |
118 | | |
119 | | /// Coverage map for sampling records. |
120 | | /// |
121 | | /// This map keeps a record of sampling records that have been matched to |
122 | | /// an IR instruction. This is used to detect some form of staleness in |
123 | | /// profiles (see flag -sample-profile-check-coverage). |
124 | | /// |
125 | | /// Each entry in the map corresponds to a FunctionSamples instance. This is |
126 | | /// another map that counts how many times the sample record at the |
127 | | /// given location has been used. |
128 | | FunctionSamplesCoverageMap SampleCoverage; |
129 | | |
130 | | /// Number of samples used from the profile. |
131 | | /// |
132 | | /// When a sampling record is used for the first time, the samples from |
133 | | /// that record are added to this accumulator. Coverage is later computed |
134 | | /// based on the total number of samples available in this function and |
135 | | /// its callsites. |
136 | | /// |
137 | | /// Note that this accumulator tracks samples used from a single function |
138 | | /// and all the inlined callsites. Strictly, we should have a map of counters |
139 | | /// keyed by FunctionSamples pointers, but these stats are cleared after |
140 | | /// every function, so we just need to keep a single counter. |
141 | | uint64_t TotalUsedSamples; |
142 | | }; |
143 | | |
144 | | /// \brief Sample profile pass. |
145 | | /// |
146 | | /// This pass reads profile data from the file specified by |
147 | | /// -sample-profile-file and annotates every affected function with the |
148 | | /// profile information found in that file. |
149 | | class SampleProfileLoader { |
150 | | public: |
151 | | SampleProfileLoader( |
152 | | StringRef Name, |
153 | | std::function<AssumptionCache &(Function &)> GetAssumptionCache, |
154 | | std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo) |
155 | | : DT(nullptr), PDT(nullptr), LI(nullptr), GetAC(GetAssumptionCache), |
156 | | GetTTI(GetTargetTransformInfo), Reader(), Samples(nullptr), |
157 | | Filename(Name), ProfileIsValid(false), TotalCollectedSamples(0), |
158 | 71 | ORE(nullptr) {} |
159 | | |
160 | | bool doInitialization(Module &M); |
161 | | bool runOnModule(Module &M, ModuleAnalysisManager *AM); |
162 | | |
163 | 0 | void dump() { Reader->dump(); } |
164 | | |
165 | | protected: |
166 | | bool runOnFunction(Function &F, ModuleAnalysisManager *AM); |
167 | | unsigned getFunctionLoc(Function &F); |
168 | | bool emitAnnotations(Function &F); |
169 | | ErrorOr<uint64_t> getInstWeight(const Instruction &I); |
170 | | ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB); |
171 | | const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const; |
172 | | std::vector<const FunctionSamples *> |
173 | | findIndirectCallFunctionSamples(const Instruction &I) const; |
174 | | const FunctionSamples *findFunctionSamples(const Instruction &I) const; |
175 | | bool inlineHotFunctions(Function &F, |
176 | | DenseSet<GlobalValue::GUID> &ImportGUIDs); |
177 | | void printEdgeWeight(raw_ostream &OS, Edge E); |
178 | | void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; |
179 | | void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); |
180 | | bool computeBlockWeights(Function &F); |
181 | | void findEquivalenceClasses(Function &F); |
182 | | template <bool IsPostDom> |
183 | | void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants, |
184 | | DominatorTreeBase<BasicBlock, IsPostDom> *DomTree); |
185 | | |
186 | | void propagateWeights(Function &F); |
187 | | uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); |
188 | | void buildEdges(Function &F); |
189 | | bool propagateThroughEdges(Function &F, bool UpdateBlockCount); |
190 | | void computeDominanceAndLoopInfo(Function &F); |
191 | | unsigned getOffset(const DILocation *DIL) const; |
192 | | void clearFunctionData(); |
193 | | |
194 | | /// \brief Map basic blocks to their computed weights. |
195 | | /// |
196 | | /// The weight of a basic block is defined to be the maximum |
197 | | /// of all the instruction weights in that block. |
198 | | BlockWeightMap BlockWeights; |
199 | | |
200 | | /// \brief Map edges to their computed weights. |
201 | | /// |
202 | | /// Edge weights are computed by propagating basic block weights in |
203 | | /// SampleProfile::propagateWeights. |
204 | | EdgeWeightMap EdgeWeights; |
205 | | |
206 | | /// \brief Set of visited blocks during propagation. |
207 | | SmallPtrSet<const BasicBlock *, 32> VisitedBlocks; |
208 | | |
209 | | /// \brief Set of visited edges during propagation. |
210 | | SmallSet<Edge, 32> VisitedEdges; |
211 | | |
212 | | /// \brief Equivalence classes for block weights. |
213 | | /// |
214 | | /// Two blocks BB1 and BB2 are in the same equivalence class if they |
215 | | /// dominate and post-dominate each other, and they are in the same loop |
216 | | /// nest. When this happens, the two blocks are guaranteed to execute |
217 | | /// the same number of times. |
218 | | EquivalenceClassMap EquivalenceClass; |
219 | | |
220 | | /// Map from function name to Function *. Used to find the function from |
221 | | /// the function name. If the function name contains suffix, additional |
222 | | /// entry is added to map from the stripped name to the function if there |
223 | | /// is one-to-one mapping. |
224 | | StringMap<Function *> SymbolMap; |
225 | | |
226 | | /// \brief Dominance, post-dominance and loop information. |
227 | | std::unique_ptr<DominatorTree> DT; |
228 | | std::unique_ptr<PostDomTreeBase<BasicBlock>> PDT; |
229 | | std::unique_ptr<LoopInfo> LI; |
230 | | |
231 | | std::function<AssumptionCache &(Function &)> GetAC; |
232 | | std::function<TargetTransformInfo &(Function &)> GetTTI; |
233 | | |
234 | | /// \brief Predecessors for each basic block in the CFG. |
235 | | BlockEdgeMap Predecessors; |
236 | | |
237 | | /// \brief Successors for each basic block in the CFG. |
238 | | BlockEdgeMap Successors; |
239 | | |
240 | | SampleCoverageTracker CoverageTracker; |
241 | | |
242 | | /// \brief Profile reader object. |
243 | | std::unique_ptr<SampleProfileReader> Reader; |
244 | | |
245 | | /// \brief Samples collected for the body of this function. |
246 | | FunctionSamples *Samples; |
247 | | |
248 | | /// \brief Name of the profile file to load. |
249 | | std::string Filename; |
250 | | |
251 | | /// \brief Flag indicating whether the profile input loaded successfully. |
252 | | bool ProfileIsValid; |
253 | | |
254 | | /// \brief Total number of samples collected in this profile. |
255 | | /// |
256 | | /// This is the sum of all the samples collected in all the functions executed |
257 | | /// at runtime. |
258 | | uint64_t TotalCollectedSamples; |
259 | | |
260 | | /// \brief Optimization Remark Emitter used to emit diagnostic remarks. |
261 | | OptimizationRemarkEmitter *ORE; |
262 | | }; |
263 | | |
264 | | class SampleProfileLoaderLegacyPass : public ModulePass { |
265 | | public: |
266 | | // Class identification, replacement for typeinfo |
267 | | static char ID; |
268 | | |
269 | | SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile) |
270 | | : ModulePass(ID), SampleLoader(Name, |
271 | 18 | [&](Function &F) -> AssumptionCache & { |
272 | 18 | return ACT->getAssumptionCache(F); |
273 | 18 | }, |
274 | 12 | [&](Function &F) -> TargetTransformInfo & { |
275 | 12 | return TTIWP->getTTI(F); |
276 | 12 | }), |
277 | 41 | ACT(nullptr), TTIWP(nullptr) { |
278 | 41 | initializeSampleProfileLoaderLegacyPassPass( |
279 | 41 | *PassRegistry::getPassRegistry()); |
280 | 41 | } |
281 | | |
282 | 0 | void dump() { SampleLoader.dump(); } |
283 | | |
284 | 41 | bool doInitialization(Module &M) override { |
285 | 41 | return SampleLoader.doInitialization(M); |
286 | 41 | } |
287 | 1 | StringRef getPassName() const override { return "Sample profile pass"; } |
288 | | bool runOnModule(Module &M) override; |
289 | | |
290 | 41 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
291 | 41 | AU.addRequired<AssumptionCacheTracker>(); |
292 | 41 | AU.addRequired<TargetTransformInfoWrapperPass>(); |
293 | 41 | } |
294 | | |
295 | | private: |
296 | | SampleProfileLoader SampleLoader; |
297 | | AssumptionCacheTracker *ACT; |
298 | | TargetTransformInfoWrapperPass *TTIWP; |
299 | | }; |
300 | | |
301 | | /// Return true if the given callsite is hot wrt to its caller. |
302 | | /// |
303 | | /// Functions that were inlined in the original binary will be represented |
304 | | /// in the inline stack in the sample profile. If the profile shows that |
305 | | /// the original inline decision was "good" (i.e., the callsite is executed |
306 | | /// frequently), then we will recreate the inline decision and apply the |
307 | | /// profile from the inlined callsite. |
308 | | /// |
309 | | /// To decide whether an inlined callsite is hot, we compute the fraction |
310 | | /// of samples used by the callsite with respect to the total number of samples |
311 | | /// collected in the caller. |
312 | | /// |
313 | | /// If that fraction is larger than the default given by |
314 | | /// SampleProfileHotThreshold, the callsite will be inlined again. |
315 | | bool callsiteIsHot(const FunctionSamples *CallerFS, |
316 | 41 | const FunctionSamples *CallsiteFS) { |
317 | 41 | if (!CallsiteFS) |
318 | 0 | return false; // The callsite was not inlined in the original binary. |
319 | 41 | |
320 | 41 | uint64_t ParentTotalSamples = CallerFS->getTotalSamples(); |
321 | 41 | if (ParentTotalSamples == 0) |
322 | 0 | return false; // Avoid division by zero. |
323 | 41 | |
324 | 41 | uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); |
325 | 41 | if (CallsiteTotalSamples == 0) |
326 | 7 | return false; // Callsite is trivially cold. |
327 | 34 | |
328 | 34 | double PercentSamples = |
329 | 34 | (double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0; |
330 | 34 | return PercentSamples >= SampleProfileHotThreshold; |
331 | 34 | } |
332 | | } |
333 | | |
334 | | /// Mark as used the sample record for the given function samples at |
335 | | /// (LineOffset, Discriminator). |
336 | | /// |
337 | | /// \returns true if this is the first time we mark the given record. |
338 | | bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS, |
339 | | uint32_t LineOffset, |
340 | | uint32_t Discriminator, |
341 | 575 | uint64_t Samples) { |
342 | 575 | LineLocation Loc(LineOffset, Discriminator); |
343 | 575 | unsigned &Count = SampleCoverage[FS][Loc]; |
344 | 575 | bool FirstTime = (++Count == 1); |
345 | 575 | if (FirstTime) |
346 | 186 | TotalUsedSamples += Samples; |
347 | 575 | return FirstTime; |
348 | 575 | } |
349 | | |
350 | | /// Return the number of sample records that were applied from this profile. |
351 | | /// |
352 | | /// This count does not include records from cold inlined callsites. |
353 | | unsigned |
354 | 8 | SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const { |
355 | 8 | auto I = SampleCoverage.find(FS); |
356 | 8 | |
357 | 8 | // The size of the coverage map for FS represents the number of records |
358 | 8 | // that were marked used at least once. |
359 | 8 | unsigned Count = (I != SampleCoverage.end()) ? I->second.size()8 : 00 ; |
360 | 8 | |
361 | 8 | // If there are inlined callsites in this function, count the samples found |
362 | 8 | // in the respective bodies. However, do not bother counting callees with 0 |
363 | 8 | // total samples, these are callees that were never invoked at runtime. |
364 | 8 | for (const auto &I : FS->getCallsiteSamples()) |
365 | 4 | for (const auto &J : I.second) 4 { |
366 | 4 | const FunctionSamples *CalleeSamples = &J.second; |
367 | 4 | if (callsiteIsHot(FS, CalleeSamples)) |
368 | 2 | Count += countUsedRecords(CalleeSamples); |
369 | 4 | } |
370 | 8 | |
371 | 8 | return Count; |
372 | 8 | } |
373 | | |
374 | | /// Return the number of sample records in the body of this profile. |
375 | | /// |
376 | | /// This count does not include records from cold inlined callsites. |
377 | | unsigned |
378 | 8 | SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const { |
379 | 8 | unsigned Count = FS->getBodySamples().size(); |
380 | 8 | |
381 | 8 | // Only count records in hot callsites. |
382 | 8 | for (const auto &I : FS->getCallsiteSamples()) |
383 | 4 | for (const auto &J : I.second) 4 { |
384 | 4 | const FunctionSamples *CalleeSamples = &J.second; |
385 | 4 | if (callsiteIsHot(FS, CalleeSamples)) |
386 | 2 | Count += countBodyRecords(CalleeSamples); |
387 | 4 | } |
388 | 8 | |
389 | 8 | return Count; |
390 | 8 | } |
391 | | |
392 | | /// Return the number of samples collected in the body of this profile. |
393 | | /// |
394 | | /// This count does not include samples from cold inlined callsites. |
395 | | uint64_t |
396 | 6 | SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const { |
397 | 6 | uint64_t Total = 0; |
398 | 6 | for (const auto &I : FS->getBodySamples()) |
399 | 16 | Total += I.second.getSamples(); |
400 | 6 | |
401 | 6 | // Only count samples in hot callsites. |
402 | 6 | for (const auto &I : FS->getCallsiteSamples()) |
403 | 2 | for (const auto &J : I.second) 2 { |
404 | 2 | const FunctionSamples *CalleeSamples = &J.second; |
405 | 2 | if (callsiteIsHot(FS, CalleeSamples)) |
406 | 2 | Total += countBodySamples(CalleeSamples); |
407 | 2 | } |
408 | 6 | |
409 | 6 | return Total; |
410 | 6 | } |
411 | | |
412 | | /// Return the fraction of sample records used in this profile. |
413 | | /// |
414 | | /// The returned value is an unsigned integer in the range 0-100 indicating |
415 | | /// the percentage of sample records that were used while applying this |
416 | | /// profile to the associated function. |
417 | | unsigned SampleCoverageTracker::computeCoverage(unsigned Used, |
418 | 10 | unsigned Total) const { |
419 | 10 | assert(Used <= Total && |
420 | 10 | "number of used records cannot exceed the total number of records"); |
421 | 10 | return Total > 0 ? Used * 100 / Total10 : 1000 ; |
422 | 10 | } |
423 | | |
424 | | /// Clear all the per-function data used to load samples and propagate weights. |
425 | 119 | void SampleProfileLoader::clearFunctionData() { |
426 | 119 | BlockWeights.clear(); |
427 | 119 | EdgeWeights.clear(); |
428 | 119 | VisitedBlocks.clear(); |
429 | 119 | VisitedEdges.clear(); |
430 | 119 | EquivalenceClass.clear(); |
431 | 119 | DT = nullptr; |
432 | 119 | PDT = nullptr; |
433 | 119 | LI = nullptr; |
434 | 119 | Predecessors.clear(); |
435 | 119 | Successors.clear(); |
436 | 119 | CoverageTracker.clear(); |
437 | 119 | } |
438 | | |
439 | | /// Returns the line offset to the start line of the subprogram. |
440 | | /// We assume that a single function will not exceed 65535 LOC. |
441 | 1.04k | unsigned SampleProfileLoader::getOffset(const DILocation *DIL) const { |
442 | 1.04k | return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) & |
443 | 1.04k | 0xffff; |
444 | 1.04k | } |
445 | | |
446 | | #ifndef NDEBUG |
447 | | /// \brief Print the weight of edge \p E on stream \p OS. |
448 | | /// |
449 | | /// \param OS Stream to emit the output to. |
450 | | /// \param E Edge to print. |
451 | | void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) { |
452 | | OS << "weight[" << E.first->getName() << "->" << E.second->getName() |
453 | | << "]: " << EdgeWeights[E] << "\n"; |
454 | | } |
455 | | |
456 | | /// \brief Print the equivalence class of block \p BB on stream \p OS. |
457 | | /// |
458 | | /// \param OS Stream to emit the output to. |
459 | | /// \param BB Block to print. |
460 | | void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS, |
461 | | const BasicBlock *BB) { |
462 | | const BasicBlock *Equiv = EquivalenceClass[BB]; |
463 | | OS << "equivalence[" << BB->getName() |
464 | | << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n"; |
465 | | } |
466 | | |
467 | | /// \brief Print the weight of block \p BB on stream \p OS. |
468 | | /// |
469 | | /// \param OS Stream to emit the output to. |
470 | | /// \param BB Block to print. |
471 | | void SampleProfileLoader::printBlockWeight(raw_ostream &OS, |
472 | | const BasicBlock *BB) const { |
473 | | const auto &I = BlockWeights.find(BB); |
474 | | uint64_t W = (I == BlockWeights.end() ? 0 : I->second); |
475 | | OS << "weight[" << BB->getName() << "]: " << W << "\n"; |
476 | | } |
477 | | #endif |
478 | | |
479 | | /// \brief Get the weight for an instruction. |
480 | | /// |
481 | | /// The "weight" of an instruction \p Inst is the number of samples |
482 | | /// collected on that instruction at runtime. To retrieve it, we |
483 | | /// need to compute the line number of \p Inst relative to the start of its |
484 | | /// function. We use HeaderLineno to compute the offset. We then |
485 | | /// look up the samples collected for \p Inst using BodySamples. |
486 | | /// |
487 | | /// \param Inst Instruction to query. |
488 | | /// |
489 | | /// \returns the weight of \p Inst. |
490 | 1.27k | ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { |
491 | 1.27k | const DebugLoc &DLoc = Inst.getDebugLoc(); |
492 | 1.27k | if (!DLoc) |
493 | 252 | return std::error_code(); |
494 | 1.02k | |
495 | 1.02k | const FunctionSamples *FS = findFunctionSamples(Inst); |
496 | 1.02k | if (!FS) |
497 | 0 | return std::error_code(); |
498 | 1.02k | |
499 | 1.02k | // Ignore all intrinsics and branch instructions. |
500 | 1.02k | // Branch instruction usually contains debug info from sources outside of |
501 | 1.02k | // the residing basic block, thus we ignore them during annotation. |
502 | 1.02k | if (1.02k isa<BranchInst>(Inst) || 1.02k isa<IntrinsicInst>(Inst)828 ) |
503 | 295 | return std::error_code(); |
504 | 728 | |
505 | 728 | // If a call/invoke instruction is inlined in profile, but not inlined here, |
506 | 728 | // it means that the inlined callsite has no sample, thus the call |
507 | 728 | // instruction should have 0 count. |
508 | 728 | if (728 (isa<CallInst>(Inst) || 728 isa<InvokeInst>(Inst)643 ) && |
509 | 85 | findCalleeFunctionSamples(Inst)) |
510 | 17 | return 0; |
511 | 711 | |
512 | 711 | const DILocation *DIL = DLoc; |
513 | 711 | uint32_t LineOffset = getOffset(DIL); |
514 | 711 | uint32_t Discriminator = DIL->getBaseDiscriminator(); |
515 | 711 | ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator); |
516 | 711 | if (R711 ) { |
517 | 575 | bool FirstMark = |
518 | 575 | CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get()); |
519 | 575 | if (FirstMark575 ) { |
520 | 186 | if (Discriminator) |
521 | 32 | ORE->emit(OptimizationRemarkAnalysis(32 DEBUG_TYPE32 , "AppliedSamples", &Inst) |
522 | 32 | << "Applied " << ore::NV("NumSamples", *R) |
523 | 32 | << " samples from profile (offset: " |
524 | 32 | << ore::NV("LineOffset", LineOffset) << "." |
525 | 32 | << ore::NV("Discriminator", Discriminator) << ")"); |
526 | 186 | else |
527 | 154 | ORE->emit(OptimizationRemarkAnalysis(154 DEBUG_TYPE154 , "AppliedSamples", &Inst) |
528 | 154 | << "Applied " << ore::NV("NumSamples", *R) |
529 | 154 | << " samples from profile (offset: " |
530 | 154 | << ore::NV("LineOffset", LineOffset) << ")"); |
531 | 186 | } |
532 | 575 | DEBUG(dbgs() << " " << DLoc.getLine() << "." |
533 | 575 | << DIL->getBaseDiscriminator() << ":" << Inst |
534 | 575 | << " (line offset: " << LineOffset << "." |
535 | 575 | << DIL->getBaseDiscriminator() << " - weight: " << R.get() |
536 | 575 | << ")\n"); |
537 | 575 | } |
538 | 1.27k | return R; |
539 | 1.27k | } |
540 | | |
541 | | /// \brief Compute the weight of a basic block. |
542 | | /// |
543 | | /// The weight of basic block \p BB is the maximum weight of all the |
544 | | /// instructions in BB. |
545 | | /// |
546 | | /// \param BB The basic block to query. |
547 | | /// |
548 | | /// \returns the weight for \p BB. |
549 | 289 | ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) { |
550 | 289 | uint64_t Max = 0; |
551 | 289 | bool HasWeight = false; |
552 | 1.27k | for (auto &I : BB->getInstList()) { |
553 | 1.27k | const ErrorOr<uint64_t> &R = getInstWeight(I); |
554 | 1.27k | if (R1.27k ) { |
555 | 592 | Max = std::max(Max, R.get()); |
556 | 592 | HasWeight = true; |
557 | 592 | } |
558 | 1.27k | } |
559 | 289 | return HasWeight ? ErrorOr<uint64_t>(Max)195 : std::error_code()94 ; |
560 | 289 | } |
561 | | |
562 | | /// \brief Compute and store the weights of every basic block. |
563 | | /// |
564 | | /// This populates the BlockWeights map by computing |
565 | | /// the weights of every basic block in the CFG. |
566 | | /// |
567 | | /// \param F The function to query. |
568 | 71 | bool SampleProfileLoader::computeBlockWeights(Function &F) { |
569 | 71 | bool Changed = false; |
570 | 71 | DEBUG(dbgs() << "Block weights\n"); |
571 | 289 | for (const auto &BB : F) { |
572 | 289 | ErrorOr<uint64_t> Weight = getBlockWeight(&BB); |
573 | 289 | if (Weight289 ) { |
574 | 195 | BlockWeights[&BB] = Weight.get(); |
575 | 195 | VisitedBlocks.insert(&BB); |
576 | 195 | Changed = true; |
577 | 195 | } |
578 | 289 | DEBUG(printBlockWeight(dbgs(), &BB)); |
579 | 289 | } |
580 | 71 | |
581 | 71 | return Changed; |
582 | 71 | } |
583 | | |
584 | | /// \brief Get the FunctionSamples for a call instruction. |
585 | | /// |
586 | | /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined |
587 | | /// instance in which that call instruction is calling to. It contains |
588 | | /// all samples that resides in the inlined instance. We first find the |
589 | | /// inlined instance in which the call instruction is from, then we |
590 | | /// traverse its children to find the callsite with the matching |
591 | | /// location. |
592 | | /// |
593 | | /// \param Inst Call/Invoke instruction to query. |
594 | | /// |
595 | | /// \returns The FunctionSamples pointer to the inlined instance. |
596 | | const FunctionSamples * |
597 | 186 | SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const { |
598 | 186 | const DILocation *DIL = Inst.getDebugLoc(); |
599 | 186 | if (!DIL186 ) { |
600 | 0 | return nullptr; |
601 | 0 | } |
602 | 186 | |
603 | 186 | StringRef CalleeName; |
604 | 186 | if (const CallInst *CI = dyn_cast<CallInst>(&Inst)) |
605 | 185 | if (Function *185 Callee185 = CI->getCalledFunction()) |
606 | 134 | CalleeName = Callee->getName(); |
607 | 186 | |
608 | 186 | const FunctionSamples *FS = findFunctionSamples(Inst); |
609 | 186 | if (FS == nullptr) |
610 | 0 | return nullptr; |
611 | 186 | |
612 | 186 | return FS->findFunctionSamplesAt( |
613 | 186 | LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()), CalleeName); |
614 | 186 | } |
615 | | |
616 | | /// Returns a vector of FunctionSamples that are the indirect call targets |
617 | | /// of \p Inst. The vector is sorted by the total number of samples. |
618 | | std::vector<const FunctionSamples *> |
619 | | SampleProfileLoader::findIndirectCallFunctionSamples( |
620 | 17 | const Instruction &Inst) const { |
621 | 17 | const DILocation *DIL = Inst.getDebugLoc(); |
622 | 17 | std::vector<const FunctionSamples *> R; |
623 | 17 | |
624 | 17 | if (!DIL17 ) { |
625 | 0 | return R; |
626 | 0 | } |
627 | 17 | |
628 | 17 | const FunctionSamples *FS = findFunctionSamples(Inst); |
629 | 17 | if (FS == nullptr) |
630 | 0 | return R; |
631 | 17 | |
632 | 17 | if (const FunctionSamplesMap *17 M17 = FS->findFunctionSamplesMapAt( |
633 | 17 | LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()))) { |
634 | 17 | if (M->size() == 0) |
635 | 0 | return R; |
636 | 17 | for (const auto &NameFS : *M) 17 { |
637 | 22 | R.push_back(&NameFS.second); |
638 | 22 | } |
639 | 17 | std::sort(R.begin(), R.end(), |
640 | 5 | [](const FunctionSamples *L, const FunctionSamples *R) { |
641 | 5 | return L->getTotalSamples() > R->getTotalSamples(); |
642 | 5 | }); |
643 | 17 | } |
644 | 17 | return R; |
645 | 17 | } |
646 | | |
647 | | /// \brief Get the FunctionSamples for an instruction. |
648 | | /// |
649 | | /// The FunctionSamples of an instruction \p Inst is the inlined instance |
650 | | /// in which that instruction is coming from. We traverse the inline stack |
651 | | /// of that instruction, and match it with the tree nodes in the profile. |
652 | | /// |
653 | | /// \param Inst Instruction to query. |
654 | | /// |
655 | | /// \returns the FunctionSamples pointer to the inlined instance. |
656 | | const FunctionSamples * |
657 | 1.24k | SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { |
658 | 1.24k | SmallVector<std::pair<LineLocation, StringRef>, 10> S; |
659 | 1.24k | const DILocation *DIL = Inst.getDebugLoc(); |
660 | 1.24k | if (!DIL) |
661 | 0 | return Samples; |
662 | 1.24k | |
663 | 1.24k | const DILocation *PrevDIL = DIL; |
664 | 1.36k | for (DIL = DIL->getInlinedAt(); DIL1.36k ; DIL = DIL->getInlinedAt()113 ) { |
665 | 113 | S.push_back(std::make_pair( |
666 | 113 | LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()), |
667 | 113 | PrevDIL->getScope()->getSubprogram()->getLinkageName())); |
668 | 113 | PrevDIL = DIL; |
669 | 113 | } |
670 | 1.24k | if (S.size() == 0) |
671 | 1.13k | return Samples; |
672 | 113 | const FunctionSamples *FS = Samples; |
673 | 226 | for (int i = S.size() - 1; i >= 0 && 226 FS != nullptr113 ; i--113 ) { |
674 | 113 | FS = FS->findFunctionSamplesAt(S[i].first, S[i].second); |
675 | 113 | } |
676 | 1.24k | return FS; |
677 | 1.24k | } |
678 | | |
679 | | /// \brief Iteratively inline hot callsites of a function. |
680 | | /// |
681 | | /// Iteratively traverse all callsites of the function \p F, and find if |
682 | | /// the corresponding inlined instance exists and is hot in profile. If |
683 | | /// it is hot enough, inline the callsites and adds new callsites of the |
684 | | /// callee into the caller. If the call is an indirect call, first promote |
685 | | /// it to direct call. Each indirect call is limited with a single target. |
686 | | /// |
687 | | /// \param F function to perform iterative inlining. |
688 | | /// \param ImportGUIDs a set to be updated to include all GUIDs that come |
689 | | /// from a different module but inlined in the profiled binary. |
690 | | /// |
691 | | /// \returns True if there is any inline happened. |
692 | | bool SampleProfileLoader::inlineHotFunctions( |
693 | 71 | Function &F, DenseSet<GlobalValue::GUID> &ImportGUIDs) { |
694 | 71 | DenseSet<Instruction *> PromotedInsns; |
695 | 71 | bool Changed = false; |
696 | 83 | while (true83 ) { |
697 | 83 | bool LocalChanged = false; |
698 | 83 | SmallVector<Instruction *, 10> CIS; |
699 | 326 | for (auto &BB : F) { |
700 | 326 | bool Hot = false; |
701 | 326 | SmallVector<Instruction *, 10> Candidates; |
702 | 1.43k | for (auto &I : BB.getInstList()) { |
703 | 1.43k | const FunctionSamples *FS = nullptr; |
704 | 1.43k | if ((isa<CallInst>(I) || 1.43k isa<InvokeInst>(I)1.20k ) && |
705 | 1.43k | !isa<IntrinsicInst>(I)223 && (FS = findCalleeFunctionSamples(I))101 ) { |
706 | 31 | Candidates.push_back(&I); |
707 | 31 | if (callsiteIsHot(Samples, FS)) |
708 | 28 | Hot = true; |
709 | 31 | } |
710 | 1.43k | } |
711 | 326 | if (Hot326 ) { |
712 | 25 | CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end()); |
713 | 25 | } |
714 | 326 | } |
715 | 29 | for (auto I : CIS) { |
716 | 29 | InlineFunctionInfo IFI(nullptr, &GetAC); |
717 | 29 | Function *CalledFunction = CallSite(I).getCalledFunction(); |
718 | 29 | // Do not inline recursive calls. |
719 | 29 | if (CalledFunction == &F) |
720 | 2 | continue; |
721 | 27 | Instruction *DI = I; |
722 | 27 | if (!CalledFunction && 27 !PromotedInsns.count(I)11 && |
723 | 27 | CallSite(I).isIndirectCall()8 ) { |
724 | 9 | for (const auto *FS : findIndirectCallFunctionSamples(*I)) { |
725 | 9 | auto CalleeFunctionName = FS->getName(); |
726 | 9 | // If it is a recursive call, we do not inline it as it could bloat |
727 | 9 | // the code exponentially. There is way to better handle this, e.g. |
728 | 9 | // clone the caller first, and inline the cloned caller if it is |
729 | 9 | // recursive. As llvm does not inline recursive calls, we will simply |
730 | 9 | // ignore it instead of handling it explicitly. |
731 | 9 | if (CalleeFunctionName == F.getName()) |
732 | 1 | continue; |
733 | 8 | const char *Reason = "Callee function not available"; |
734 | 8 | auto R = SymbolMap.find(CalleeFunctionName); |
735 | 8 | if (R == SymbolMap.end()) |
736 | 2 | continue; |
737 | 6 | CalledFunction = R->getValue(); |
738 | 6 | if (CalledFunction && 6 isLegalToPromote(I, CalledFunction, &Reason)5 ) { |
739 | 4 | // The indirect target was promoted and inlined in the profile, as a |
740 | 4 | // result, we do not have profile info for the branch probability. |
741 | 4 | // We set the probability to 80% taken to indicate that the static |
742 | 4 | // call is likely taken. |
743 | 4 | DI = dyn_cast<Instruction>( |
744 | 4 | promoteIndirectCall(I, CalledFunction, 80, 100, false, ORE) |
745 | 4 | ->stripPointerCasts()); |
746 | 4 | PromotedInsns.insert(I); |
747 | 6 | } else { |
748 | 2 | DEBUG(dbgs() << "\nFailed to promote indirect call to " |
749 | 2 | << CalleeFunctionName << " because " << Reason |
750 | 2 | << "\n"); |
751 | 2 | continue; |
752 | 2 | } |
753 | 7 | } |
754 | 7 | // If there is profile mismatch, we should not attempt to inline DI. |
755 | 7 | if (!isa<CallInst>(DI) && 7 !isa<InvokeInst>(DI)1 ) |
756 | 1 | continue; |
757 | 26 | } |
758 | 26 | if (26 !CalledFunction || 26 !CalledFunction->getSubprogram()19 ) { |
759 | 10 | // Handles functions that are imported from other modules. |
760 | 10 | for (const FunctionSamples *FS : findIndirectCallFunctionSamples(*I)) |
761 | 13 | FS->findImportedFunctions( |
762 | 13 | ImportGUIDs, F.getParent(), |
763 | 13 | Samples->getTotalSamples() * SampleProfileHotThreshold / 100); |
764 | 10 | continue; |
765 | 10 | } |
766 | 26 | assert(isa<CallInst>(DI) || isa<InvokeInst>(DI)); |
767 | 16 | CallSite CS(DI); |
768 | 16 | DebugLoc DLoc = I->getDebugLoc(); |
769 | 16 | BasicBlock *BB = I->getParent(); |
770 | 16 | InlineParams Params = getInlineParams(); |
771 | 16 | Params.ComputeFullInlineCost = true; |
772 | 16 | // Checks if there is anything in the reachable portion of the callee at |
773 | 16 | // this callsite that makes this inlining potentially illegal. Need to |
774 | 16 | // set ComputeFullInlineCost, otherwise getInlineCost may return early |
775 | 16 | // when cost exceeds threshold without checking all IRs in the callee. |
776 | 16 | // The acutal cost does not matter because we only checks isNever() to |
777 | 16 | // see if it is legal to inline the callsite. |
778 | 16 | InlineCost Cost = getInlineCost(CS, Params, GetTTI(*CalledFunction), GetAC, |
779 | 16 | None, nullptr, nullptr); |
780 | 16 | if (Cost.isNever()16 ) { |
781 | 3 | ORE->emit(OptimizationRemark(DEBUG_TYPE, "Not inline", DLoc, BB) |
782 | 3 | << "incompatible inlining"); |
783 | 3 | continue; |
784 | 3 | } |
785 | 13 | if (13 InlineFunction(CS, IFI)13 ) { |
786 | 13 | LocalChanged = true; |
787 | 13 | // The call to InlineFunction erases DI, so we can't pass it here. |
788 | 13 | ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB) |
789 | 13 | << "inlined hot callee '" |
790 | 13 | << ore::NV("Callee", CalledFunction) << "' into '" |
791 | 13 | << ore::NV("Caller", &F) << "'"); |
792 | 13 | } |
793 | 29 | } |
794 | 83 | if (LocalChanged83 ) { |
795 | 12 | Changed = true; |
796 | 83 | } else { |
797 | 71 | break; |
798 | 71 | } |
799 | 83 | } |
800 | 71 | return Changed; |
801 | 71 | } |
802 | | |
803 | | /// \brief Find equivalence classes for the given block. |
804 | | /// |
805 | | /// This finds all the blocks that are guaranteed to execute the same |
806 | | /// number of times as \p BB1. To do this, it traverses all the |
807 | | /// descendants of \p BB1 in the dominator or post-dominator tree. |
808 | | /// |
809 | | /// A block BB2 will be in the same equivalence class as \p BB1 if |
810 | | /// the following holds: |
811 | | /// |
812 | | /// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2 |
813 | | /// is a descendant of \p BB1 in the dominator tree, then BB2 should |
814 | | /// dominate BB1 in the post-dominator tree. |
815 | | /// |
816 | | /// 2- Both BB2 and \p BB1 must be in the same loop. |
817 | | /// |
818 | | /// For every block BB2 that meets those two requirements, we set BB2's |
819 | | /// equivalence class to \p BB1. |
820 | | /// |
821 | | /// \param BB1 Block to check. |
822 | | /// \param Descendants Descendants of \p BB1 in either the dom or pdom tree. |
823 | | /// \param DomTree Opposite dominator tree. If \p Descendants is filled |
824 | | /// with blocks from \p BB1's dominator tree, then |
825 | | /// this is the post-dominator tree, and vice versa. |
826 | | template <bool IsPostDom> |
827 | | void SampleProfileLoader::findEquivalencesFor( |
828 | | BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants, |
829 | 202 | DominatorTreeBase<BasicBlock, IsPostDom> *DomTree) { |
830 | 202 | const BasicBlock *EC = EquivalenceClass[BB1]; |
831 | 202 | uint64_t Weight = BlockWeights[EC]; |
832 | 715 | for (const auto *BB2 : Descendants) { |
833 | 715 | bool IsDomParent = DomTree->dominates(BB2, BB1); |
834 | 715 | bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2); |
835 | 715 | if (BB1 != BB2 && 715 IsDomParent514 && IsInSameLoop178 ) { |
836 | 86 | EquivalenceClass[BB2] = EC; |
837 | 86 | // If BB2 is visited, then the entire EC should be marked as visited. |
838 | 86 | if (VisitedBlocks.count(BB2)86 ) { |
839 | 42 | VisitedBlocks.insert(EC); |
840 | 42 | } |
841 | 86 | |
842 | 86 | // If BB2 is heavier than BB1, make BB2 have the same weight |
843 | 86 | // as BB1. |
844 | 86 | // |
845 | 86 | // Note that we don't worry about the opposite situation here |
846 | 86 | // (when BB2 is lighter than BB1). We will deal with this |
847 | 86 | // during the propagation phase. Right now, we just want to |
848 | 86 | // make sure that BB1 has the largest weight of all the |
849 | 86 | // members of its equivalence set. |
850 | 86 | Weight = std::max(Weight, BlockWeights[BB2]); |
851 | 86 | } |
852 | 715 | } |
853 | 202 | if (EC == &EC->getParent()->getEntryBlock()202 ) { |
854 | 70 | BlockWeights[EC] = Samples->getHeadSamples() + 1; |
855 | 202 | } else { |
856 | 132 | BlockWeights[EC] = Weight; |
857 | 132 | } |
858 | 202 | } |
859 | | |
860 | | /// \brief Find equivalence classes. |
861 | | /// |
862 | | /// Since samples may be missing from blocks, we can fill in the gaps by setting |
863 | | /// the weights of all the blocks in the same equivalence class to the same |
864 | | /// weight. To compute the concept of equivalence, we use dominance and loop |
865 | | /// information. Two blocks B1 and B2 are in the same equivalence class if B1 |
866 | | /// dominates B2, B2 post-dominates B1 and both are in the same loop. |
867 | | /// |
868 | | /// \param F The function to query. |
869 | 70 | void SampleProfileLoader::findEquivalenceClasses(Function &F) { |
870 | 70 | SmallVector<BasicBlock *, 8> DominatedBBs; |
871 | 70 | DEBUG(dbgs() << "\nBlock equivalence classes\n"); |
872 | 70 | // Find equivalence sets based on dominance and post-dominance information. |
873 | 288 | for (auto &BB : F) { |
874 | 288 | BasicBlock *BB1 = &BB; |
875 | 288 | |
876 | 288 | // Compute BB1's equivalence class once. |
877 | 288 | if (EquivalenceClass.count(BB1)288 ) { |
878 | 86 | DEBUG(printBlockEquivalence(dbgs(), BB1)); |
879 | 86 | continue; |
880 | 86 | } |
881 | 202 | |
882 | 202 | // By default, blocks are in their own equivalence class. |
883 | 202 | EquivalenceClass[BB1] = BB1; |
884 | 202 | |
885 | 202 | // Traverse all the blocks dominated by BB1. We are looking for |
886 | 202 | // every basic block BB2 such that: |
887 | 202 | // |
888 | 202 | // 1- BB1 dominates BB2. |
889 | 202 | // 2- BB2 post-dominates BB1. |
890 | 202 | // 3- BB1 and BB2 are in the same loop nest. |
891 | 202 | // |
892 | 202 | // If all those conditions hold, it means that BB2 is executed |
893 | 202 | // as many times as BB1, so they are placed in the same equivalence |
894 | 202 | // class by making BB2's equivalence class be BB1. |
895 | 202 | DominatedBBs.clear(); |
896 | 202 | DT->getDescendants(BB1, DominatedBBs); |
897 | 202 | findEquivalencesFor(BB1, DominatedBBs, PDT.get()); |
898 | 202 | |
899 | 202 | DEBUG(printBlockEquivalence(dbgs(), BB1)); |
900 | 288 | } |
901 | 70 | |
902 | 70 | // Assign weights to equivalence classes. |
903 | 70 | // |
904 | 70 | // All the basic blocks in the same equivalence class will execute |
905 | 70 | // the same number of times. Since we know that the head block in |
906 | 70 | // each equivalence class has the largest weight, assign that weight |
907 | 70 | // to all the blocks in that equivalence class. |
908 | 70 | DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n"); |
909 | 288 | for (auto &BI : F) { |
910 | 288 | const BasicBlock *BB = &BI; |
911 | 288 | const BasicBlock *EquivBB = EquivalenceClass[BB]; |
912 | 288 | if (BB != EquivBB) |
913 | 86 | BlockWeights[BB] = BlockWeights[EquivBB]; |
914 | 288 | DEBUG(printBlockWeight(dbgs(), BB)); |
915 | 288 | } |
916 | 70 | } |
917 | | |
918 | | /// \brief Visit the given edge to decide if it has a valid weight. |
919 | | /// |
920 | | /// If \p E has not been visited before, we copy to \p UnknownEdge |
921 | | /// and increment the count of unknown edges. |
922 | | /// |
923 | | /// \param E Edge to visit. |
924 | | /// \param NumUnknownEdges Current number of unknown edges. |
925 | | /// \param UnknownEdge Set if E has not been visited before. |
926 | | /// |
927 | | /// \returns E's weight, if known. Otherwise, return 0. |
928 | | uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, |
929 | 3.54k | Edge *UnknownEdge) { |
930 | 3.54k | if (!VisitedEdges.count(E)3.54k ) { |
931 | 1.05k | (*NumUnknownEdges)++; |
932 | 1.05k | *UnknownEdge = E; |
933 | 1.05k | return 0; |
934 | 1.05k | } |
935 | 2.49k | |
936 | 2.49k | return EdgeWeights[E]; |
937 | 2.49k | } |
938 | | |
939 | | /// \brief Propagate weights through incoming/outgoing edges. |
940 | | /// |
941 | | /// If the weight of a basic block is known, and there is only one edge |
942 | | /// with an unknown weight, we can calculate the weight of that edge. |
943 | | /// |
944 | | /// Similarly, if all the edges have a known count, we can calculate the |
945 | | /// count of the basic block, if needed. |
946 | | /// |
947 | | /// \param F Function to process. |
948 | | /// \param UpdateBlockCount Whether we should update basic block counts that |
949 | | /// has already been annotated. |
950 | | /// |
951 | | /// \returns True if new weights were assigned to edges or blocks. |
952 | | bool SampleProfileLoader::propagateThroughEdges(Function &F, |
953 | 334 | bool UpdateBlockCount) { |
954 | 334 | bool Changed = false; |
955 | 334 | DEBUG(dbgs() << "\nPropagation through edges\n"); |
956 | 1.67k | for (const auto &BI : F) { |
957 | 1.67k | const BasicBlock *BB = &BI; |
958 | 1.67k | const BasicBlock *EC = EquivalenceClass[BB]; |
959 | 1.67k | |
960 | 1.67k | // Visit all the predecessor and successor edges to determine |
961 | 1.67k | // which ones have a weight assigned already. Note that it doesn't |
962 | 1.67k | // matter that we only keep track of a single unknown edge. The |
963 | 1.67k | // only case we are interested in handling is when only a single |
964 | 1.67k | // edge is unknown (see setEdgeOrBlockWeight). |
965 | 5.03k | for (unsigned i = 0; i < 25.03k ; i++3.35k ) { |
966 | 3.35k | uint64_t TotalWeight = 0; |
967 | 3.35k | unsigned NumUnknownEdges = 0, NumTotalEdges = 0; |
968 | 3.35k | Edge UnknownEdge, SelfReferentialEdge, SingleEdge; |
969 | 3.35k | |
970 | 3.35k | if (i == 03.35k ) { |
971 | 1.67k | // First, visit all predecessor edges. |
972 | 1.67k | NumTotalEdges = Predecessors[BB].size(); |
973 | 1.77k | for (auto *Pred : Predecessors[BB]) { |
974 | 1.77k | Edge E = std::make_pair(Pred, BB); |
975 | 1.77k | TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); |
976 | 1.77k | if (E.first == E.second) |
977 | 0 | SelfReferentialEdge = E; |
978 | 1.77k | } |
979 | 1.67k | if (NumTotalEdges == 11.67k ) { |
980 | 920 | SingleEdge = std::make_pair(Predecessors[BB][0], BB); |
981 | 920 | } |
982 | 3.35k | } else { |
983 | 1.67k | // On the second round, visit all successor edges. |
984 | 1.67k | NumTotalEdges = Successors[BB].size(); |
985 | 1.77k | for (auto *Succ : Successors[BB]) { |
986 | 1.77k | Edge E = std::make_pair(BB, Succ); |
987 | 1.77k | TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); |
988 | 1.77k | } |
989 | 1.67k | if (NumTotalEdges == 11.67k ) { |
990 | 878 | SingleEdge = std::make_pair(BB, Successors[BB][0]); |
991 | 878 | } |
992 | 1.67k | } |
993 | 3.35k | |
994 | 3.35k | // After visiting all the edges, there are three cases that we |
995 | 3.35k | // can handle immediately: |
996 | 3.35k | // |
997 | 3.35k | // - All the edge weights are known (i.e., NumUnknownEdges == 0). |
998 | 3.35k | // In this case, we simply check that the sum of all the edges |
999 | 3.35k | // is the same as BB's weight. If not, we change BB's weight |
1000 | 3.35k | // to match. Additionally, if BB had not been visited before, |
1001 | 3.35k | // we mark it visited. |
1002 | 3.35k | // |
1003 | 3.35k | // - Only one edge is unknown and BB has already been visited. |
1004 | 3.35k | // In this case, we can compute the weight of the edge by |
1005 | 3.35k | // subtracting the total block weight from all the known |
1006 | 3.35k | // edge weights. If the edges weight more than BB, then the |
1007 | 3.35k | // edge of the last remaining edge is set to zero. |
1008 | 3.35k | // |
1009 | 3.35k | // - There exists a self-referential edge and the weight of BB is |
1010 | 3.35k | // known. In this case, this edge can be based on BB's weight. |
1011 | 3.35k | // We add up all the other known edges and set the weight on |
1012 | 3.35k | // the self-referential edge as we did in the previous case. |
1013 | 3.35k | // |
1014 | 3.35k | // In any other case, we must continue iterating. Eventually, |
1015 | 3.35k | // all edges will get a weight, or iteration will stop when |
1016 | 3.35k | // it reaches SampleProfileMaxPropagateIterations. |
1017 | 3.35k | if (NumUnknownEdges <= 13.35k ) { |
1018 | 3.19k | uint64_t &BBWeight = BlockWeights[EC]; |
1019 | 3.19k | if (NumUnknownEdges == 03.19k ) { |
1020 | 2.46k | if (!VisitedBlocks.count(EC)2.46k ) { |
1021 | 422 | // If we already know the weight of all edges, the weight of the |
1022 | 422 | // basic block can be computed. It should be no larger than the sum |
1023 | 422 | // of all edge weights. |
1024 | 422 | if (TotalWeight > BBWeight422 ) { |
1025 | 12 | BBWeight = TotalWeight; |
1026 | 12 | Changed = true; |
1027 | 12 | DEBUG(dbgs() << "All edge weights for " << BB->getName() |
1028 | 12 | << " known. Set weight for block: "; |
1029 | 12 | printBlockWeight(dbgs(), BB);); |
1030 | 12 | } |
1031 | 2.46k | } else if (2.04k NumTotalEdges == 1 && |
1032 | 2.04k | EdgeWeights[SingleEdge] < BlockWeights[EC]1.09k ) { |
1033 | 80 | // If there is only one edge for the visited basic block, use the |
1034 | 80 | // block weight to adjust edge weight if edge weight is smaller. |
1035 | 80 | EdgeWeights[SingleEdge] = BlockWeights[EC]; |
1036 | 80 | Changed = true; |
1037 | 80 | } |
1038 | 3.19k | } else if (722 NumUnknownEdges == 1 && 722 VisitedBlocks.count(EC)722 ) { |
1039 | 520 | // If there is a single unknown edge and the block has been |
1040 | 520 | // visited, then we can compute E's weight. |
1041 | 520 | if (BBWeight >= TotalWeight) |
1042 | 520 | EdgeWeights[UnknownEdge] = BBWeight - TotalWeight; |
1043 | 520 | else |
1044 | 0 | EdgeWeights[UnknownEdge] = 0; |
1045 | 520 | const BasicBlock *OtherEC; |
1046 | 520 | if (i == 0) |
1047 | 276 | OtherEC = EquivalenceClass[UnknownEdge.first]; |
1048 | 520 | else |
1049 | 244 | OtherEC = EquivalenceClass[UnknownEdge.second]; |
1050 | 520 | // Edge weights should never exceed the BB weights it connects. |
1051 | 520 | if (VisitedBlocks.count(OtherEC) && |
1052 | 402 | EdgeWeights[UnknownEdge] > BlockWeights[OtherEC]) |
1053 | 60 | EdgeWeights[UnknownEdge] = BlockWeights[OtherEC]; |
1054 | 520 | VisitedEdges.insert(UnknownEdge); |
1055 | 520 | Changed = true; |
1056 | 520 | DEBUG(dbgs() << "Set weight for edge: "; |
1057 | 722 | printEdgeWeight(dbgs(), UnknownEdge)); |
1058 | 722 | } |
1059 | 3.35k | } else if (166 VisitedBlocks.count(EC) && 166 BlockWeights[EC] == 0126 ) { |
1060 | 12 | // If a block Weights 0, all its in/out edges should weight 0. |
1061 | 12 | if (i == 012 ) { |
1062 | 16 | for (auto *Pred : Predecessors[BB]) { |
1063 | 16 | Edge E = std::make_pair(Pred, BB); |
1064 | 16 | EdgeWeights[E] = 0; |
1065 | 16 | VisitedEdges.insert(E); |
1066 | 16 | } |
1067 | 12 | } else { |
1068 | 8 | for (auto *Succ : Successors[BB]) { |
1069 | 8 | Edge E = std::make_pair(BB, Succ); |
1070 | 8 | EdgeWeights[E] = 0; |
1071 | 8 | VisitedEdges.insert(E); |
1072 | 8 | } |
1073 | 4 | } |
1074 | 166 | } else if (154 SelfReferentialEdge.first && 154 VisitedBlocks.count(EC)0 ) { |
1075 | 0 | uint64_t &BBWeight = BlockWeights[BB]; |
1076 | 0 | // We have a self-referential edge and the weight of BB is known. |
1077 | 0 | if (BBWeight >= TotalWeight) |
1078 | 0 | EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight; |
1079 | 0 | else |
1080 | 0 | EdgeWeights[SelfReferentialEdge] = 0; |
1081 | 0 | VisitedEdges.insert(SelfReferentialEdge); |
1082 | 0 | Changed = true; |
1083 | 0 | DEBUG(dbgs() << "Set self-referential edge weight to: "; |
1084 | 166 | printEdgeWeight(dbgs(), SelfReferentialEdge)); |
1085 | 166 | } |
1086 | 3.35k | if (UpdateBlockCount && 3.35k !VisitedBlocks.count(EC)780 && TotalWeight > 0140 ) { |
1087 | 18 | BlockWeights[EC] = TotalWeight; |
1088 | 18 | VisitedBlocks.insert(EC); |
1089 | 18 | Changed = true; |
1090 | 18 | } |
1091 | 3.35k | } |
1092 | 1.67k | } |
1093 | 334 | |
1094 | 334 | return Changed; |
1095 | 334 | } |
1096 | | |
1097 | | /// \brief Build in/out edge lists for each basic block in the CFG. |
1098 | | /// |
1099 | | /// We are interested in unique edges. If a block B1 has multiple |
1100 | | /// edges to another block B2, we only add a single B1->B2 edge. |
1101 | 70 | void SampleProfileLoader::buildEdges(Function &F) { |
1102 | 288 | for (auto &BI : F) { |
1103 | 288 | BasicBlock *B1 = &BI; |
1104 | 288 | |
1105 | 288 | // Add predecessors for B1. |
1106 | 288 | SmallPtrSet<BasicBlock *, 16> Visited; |
1107 | 288 | if (!Predecessors[B1].empty()) |
1108 | 0 | llvm_unreachable("Found a stale predecessors list in a basic block."); |
1109 | 576 | for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); 288 PI != PE576 ; ++PI288 ) { |
1110 | 288 | BasicBlock *B2 = *PI; |
1111 | 288 | if (Visited.insert(B2).second) |
1112 | 288 | Predecessors[B1].push_back(B2); |
1113 | 288 | } |
1114 | 288 | |
1115 | 288 | // Add successors for B1. |
1116 | 288 | Visited.clear(); |
1117 | 288 | if (!Successors[B1].empty()) |
1118 | 0 | llvm_unreachable("Found a stale successors list in a basic block."); |
1119 | 576 | for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); 288 SI != SE576 ; ++SI288 ) { |
1120 | 288 | BasicBlock *B2 = *SI; |
1121 | 288 | if (Visited.insert(B2).second) |
1122 | 288 | Successors[B1].push_back(B2); |
1123 | 288 | } |
1124 | 288 | } |
1125 | 70 | } |
1126 | | |
1127 | | /// Sorts the CallTargetMap \p M by count in descending order and stores the |
1128 | | /// sorted result in \p Sorted. Returns the total counts. |
1129 | | static uint64_t SortCallTargets(SmallVector<InstrProfValueData, 2> &Sorted, |
1130 | 12 | const SampleRecord::CallTargetMap &M) { |
1131 | 12 | Sorted.clear(); |
1132 | 12 | uint64_t Sum = 0; |
1133 | 30 | for (auto I = M.begin(); I != M.end()30 ; ++I18 ) { |
1134 | 18 | Sum += I->getValue(); |
1135 | 18 | Sorted.push_back({Function::getGUID(I->getKey()), I->getValue()}); |
1136 | 18 | } |
1137 | 12 | std::sort(Sorted.begin(), Sorted.end(), |
1138 | 6 | [](const InstrProfValueData &L, const InstrProfValueData &R) { |
1139 | 6 | if (L.Count == R.Count) |
1140 | 0 | return L.Value > R.Value; |
1141 | 6 | else |
1142 | 6 | return L.Count > R.Count; |
1143 | 0 | }); |
1144 | 12 | return Sum; |
1145 | 12 | } |
1146 | | |
1147 | | /// \brief Propagate weights into edges |
1148 | | /// |
1149 | | /// The following rules are applied to every block BB in the CFG: |
1150 | | /// |
1151 | | /// - If BB has a single predecessor/successor, then the weight |
1152 | | /// of that edge is the weight of the block. |
1153 | | /// |
1154 | | /// - If all incoming or outgoing edges are known except one, and the |
1155 | | /// weight of the block is already known, the weight of the unknown |
1156 | | /// edge will be the weight of the block minus the sum of all the known |
1157 | | /// edges. If the sum of all the known edges is larger than BB's weight, |
1158 | | /// we set the unknown edge weight to zero. |
1159 | | /// |
1160 | | /// - If there is a self-referential edge, and the weight of the block is |
1161 | | /// known, the weight for that edge is set to the weight of the block |
1162 | | /// minus the weight of the other incoming edges to that block (if |
1163 | | /// known). |
1164 | 70 | void SampleProfileLoader::propagateWeights(Function &F) { |
1165 | 70 | bool Changed = true; |
1166 | 70 | unsigned I = 0; |
1167 | 70 | |
1168 | 70 | // If BB weight is larger than its corresponding loop's header BB weight, |
1169 | 70 | // use the BB weight to replace the loop header BB weight. |
1170 | 288 | for (auto &BI : F) { |
1171 | 288 | BasicBlock *BB = &BI; |
1172 | 288 | Loop *L = LI->getLoopFor(BB); |
1173 | 288 | if (!L288 ) { |
1174 | 158 | continue; |
1175 | 158 | } |
1176 | 130 | BasicBlock *Header = L->getHeader(); |
1177 | 130 | if (Header && 130 BlockWeights[BB] > BlockWeights[Header]130 ) { |
1178 | 20 | BlockWeights[Header] = BlockWeights[BB]; |
1179 | 20 | } |
1180 | 288 | } |
1181 | 70 | |
1182 | 70 | // Before propagation starts, build, for each block, a list of |
1183 | 70 | // unique predecessors and successors. This is necessary to handle |
1184 | 70 | // identical edges in multiway branches. Since we visit all blocks and all |
1185 | 70 | // edges of the CFG, it is cleaner to build these lists once at the start |
1186 | 70 | // of the pass. |
1187 | 70 | buildEdges(F); |
1188 | 70 | |
1189 | 70 | // Propagate until we converge or we go past the iteration limit. |
1190 | 192 | while (Changed && 192 I++ < SampleProfileMaxPropagateIterations122 ) { |
1191 | 122 | Changed = propagateThroughEdges(F, false); |
1192 | 122 | } |
1193 | 70 | |
1194 | 70 | // The first propagation propagates BB counts from annotated BBs to unknown |
1195 | 70 | // BBs. The 2nd propagation pass resets edges weights, and use all BB weights |
1196 | 70 | // to propagate edge weights. |
1197 | 70 | VisitedEdges.clear(); |
1198 | 70 | Changed = true; |
1199 | 192 | while (Changed && 192 I++ < SampleProfileMaxPropagateIterations122 ) { |
1200 | 122 | Changed = propagateThroughEdges(F, false); |
1201 | 122 | } |
1202 | 70 | |
1203 | 70 | // The 3rd propagation pass allows adjust annotated BB weights that are |
1204 | 70 | // obviously wrong. |
1205 | 70 | Changed = true; |
1206 | 160 | while (Changed && 160 I++ < SampleProfileMaxPropagateIterations90 ) { |
1207 | 90 | Changed = propagateThroughEdges(F, true); |
1208 | 90 | } |
1209 | 70 | |
1210 | 70 | // Generate MD_prof metadata for every branch instruction using the |
1211 | 70 | // edge weights computed during propagation. |
1212 | 70 | DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n"); |
1213 | 70 | LLVMContext &Ctx = F.getContext(); |
1214 | 70 | MDBuilder MDB(Ctx); |
1215 | 288 | for (auto &BI : F) { |
1216 | 288 | BasicBlock *BB = &BI; |
1217 | 288 | |
1218 | 288 | if (BlockWeights[BB]288 ) { |
1219 | 1.12k | for (auto &I : BB->getInstList()) { |
1220 | 1.12k | if (!isa<CallInst>(I) && 1.12k !isa<InvokeInst>(I)950 ) |
1221 | 950 | continue; |
1222 | 172 | CallSite CS(&I); |
1223 | 172 | if (!CS.getCalledFunction()172 ) { |
1224 | 21 | const DebugLoc &DLoc = I.getDebugLoc(); |
1225 | 21 | if (!DLoc) |
1226 | 0 | continue; |
1227 | 21 | const DILocation *DIL = DLoc; |
1228 | 21 | uint32_t LineOffset = getOffset(DIL); |
1229 | 21 | uint32_t Discriminator = DIL->getBaseDiscriminator(); |
1230 | 21 | |
1231 | 21 | const FunctionSamples *FS = findFunctionSamples(I); |
1232 | 21 | if (!FS) |
1233 | 0 | continue; |
1234 | 21 | auto T = FS->findCallTargetMapAt(LineOffset, Discriminator); |
1235 | 21 | if (!T || 21 T.get().size() == 016 ) |
1236 | 9 | continue; |
1237 | 12 | SmallVector<InstrProfValueData, 2> SortedCallTargets; |
1238 | 12 | uint64_t Sum = SortCallTargets(SortedCallTargets, T.get()); |
1239 | 12 | annotateValueSite(*I.getParent()->getParent()->getParent(), I, |
1240 | 12 | SortedCallTargets, Sum, IPVK_IndirectCallTarget, |
1241 | 12 | SortedCallTargets.size()); |
1242 | 172 | } else if (151 !dyn_cast<IntrinsicInst>(&I)151 ) { |
1243 | 57 | SmallVector<uint32_t, 1> Weights; |
1244 | 57 | Weights.push_back(BlockWeights[BB]); |
1245 | 57 | I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); |
1246 | 57 | } |
1247 | 1.12k | } |
1248 | 245 | } |
1249 | 288 | TerminatorInst *TI = BB->getTerminator(); |
1250 | 288 | if (TI->getNumSuccessors() == 1) |
1251 | 142 | continue; |
1252 | 146 | if (146 !isa<BranchInst>(TI) && 146 !isa<SwitchInst>(TI)73 ) |
1253 | 73 | continue; |
1254 | 73 | |
1255 | 73 | DebugLoc BranchLoc = TI->getDebugLoc(); |
1256 | 73 | DEBUG(dbgs() << "\nGetting weights for branch at line " |
1257 | 73 | << ((BranchLoc) ? Twine(BranchLoc.getLine()) |
1258 | 73 | : Twine("<UNKNOWN LOCATION>")) |
1259 | 73 | << ".\n"); |
1260 | 73 | SmallVector<uint32_t, 4> Weights; |
1261 | 73 | uint32_t MaxWeight = 0; |
1262 | 73 | Instruction *MaxDestInst; |
1263 | 219 | for (unsigned I = 0; I < TI->getNumSuccessors()219 ; ++I146 ) { |
1264 | 146 | BasicBlock *Succ = TI->getSuccessor(I); |
1265 | 146 | Edge E = std::make_pair(BB, Succ); |
1266 | 146 | uint64_t Weight = EdgeWeights[E]; |
1267 | 146 | DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E)); |
1268 | 146 | // Use uint32_t saturated arithmetic to adjust the incoming weights, |
1269 | 146 | // if needed. Sample counts in profiles are 64-bit unsigned values, |
1270 | 146 | // but internally branch weights are expressed as 32-bit values. |
1271 | 146 | if (Weight > std::numeric_limits<uint32_t>::max()146 ) { |
1272 | 0 | DEBUG(dbgs() << " (saturated due to uint32_t overflow)"); |
1273 | 0 | Weight = std::numeric_limits<uint32_t>::max(); |
1274 | 0 | } |
1275 | 146 | // Weight is added by one to avoid propagation errors introduced by |
1276 | 146 | // 0 weights. |
1277 | 146 | Weights.push_back(static_cast<uint32_t>(Weight + 1)); |
1278 | 146 | if (Weight != 0146 ) { |
1279 | 102 | if (Weight > MaxWeight102 ) { |
1280 | 70 | MaxWeight = Weight; |
1281 | 70 | MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime(); |
1282 | 70 | } |
1283 | 102 | } |
1284 | 146 | } |
1285 | 73 | |
1286 | 73 | uint64_t TempWeight; |
1287 | 73 | // Only set weights if there is at least one non-zero weight. |
1288 | 73 | // In any other case, let the analyzer set weights. |
1289 | 73 | // Do not set weights if the weights are present. In ThinLTO, the profile |
1290 | 73 | // annotation is done twice. If the first annotation already set the |
1291 | 73 | // weights, the second pass does not need to set it. |
1292 | 73 | if (MaxWeight > 0 && 73 !TI->extractProfTotalWeight(TempWeight)62 ) { |
1293 | 60 | DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n"); |
1294 | 60 | TI->setMetadata(llvm::LLVMContext::MD_prof, |
1295 | 60 | MDB.createBranchWeights(Weights)); |
1296 | 60 | ORE->emit(OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst) |
1297 | 60 | << "most popular destination for conditional branches at " |
1298 | 60 | << ore::NV("CondBranchesLoc", BranchLoc)); |
1299 | 73 | } else { |
1300 | 13 | DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n"); |
1301 | 13 | } |
1302 | 288 | } |
1303 | 70 | } |
1304 | | |
1305 | | /// \brief Get the line number for the function header. |
1306 | | /// |
1307 | | /// This looks up function \p F in the current compilation unit and |
1308 | | /// retrieves the line number where the function is defined. This is |
1309 | | /// line 0 for all the samples read from the profile file. Every line |
1310 | | /// number is relative to this line. |
1311 | | /// |
1312 | | /// \param F Function object to query. |
1313 | | /// |
1314 | | /// \returns the line number where \p F is defined. If it returns 0, |
1315 | | /// it means that there is no debug information available for \p F. |
1316 | 84 | unsigned SampleProfileLoader::getFunctionLoc(Function &F) { |
1317 | 84 | if (DISubprogram *S = F.getSubprogram()) |
1318 | 79 | return S->getLine(); |
1319 | 5 | |
1320 | 5 | // If the start of \p F is missing, emit a diagnostic to inform the user |
1321 | 5 | // about the missed opportunity. |
1322 | 5 | F.getContext().diagnose(DiagnosticInfoSampleProfile( |
1323 | 5 | "No debug information found in function " + F.getName() + |
1324 | 5 | ": Function profile not used", |
1325 | 5 | DS_Warning)); |
1326 | 5 | return 0; |
1327 | 5 | } |
1328 | | |
1329 | 70 | void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) { |
1330 | 70 | DT.reset(new DominatorTree); |
1331 | 70 | DT->recalculate(F); |
1332 | 70 | |
1333 | 70 | PDT.reset(new PostDomTreeBase<BasicBlock>()); |
1334 | 70 | PDT->recalculate(F); |
1335 | 70 | |
1336 | 70 | LI.reset(new LoopInfo); |
1337 | 70 | LI->analyze(*DT); |
1338 | 70 | } |
1339 | | |
1340 | | /// \brief Generate branch weight metadata for all branches in \p F. |
1341 | | /// |
1342 | | /// Branch weights are computed out of instruction samples using a |
1343 | | /// propagation heuristic. Propagation proceeds in 3 phases: |
1344 | | /// |
1345 | | /// 1- Assignment of block weights. All the basic blocks in the function |
1346 | | /// are initial assigned the same weight as their most frequently |
1347 | | /// executed instruction. |
1348 | | /// |
1349 | | /// 2- Creation of equivalence classes. Since samples may be missing from |
1350 | | /// blocks, we can fill in the gaps by setting the weights of all the |
1351 | | /// blocks in the same equivalence class to the same weight. To compute |
1352 | | /// the concept of equivalence, we use dominance and loop information. |
1353 | | /// Two blocks B1 and B2 are in the same equivalence class if B1 |
1354 | | /// dominates B2, B2 post-dominates B1 and both are in the same loop. |
1355 | | /// |
1356 | | /// 3- Propagation of block weights into edges. This uses a simple |
1357 | | /// propagation heuristic. The following rules are applied to every |
1358 | | /// block BB in the CFG: |
1359 | | /// |
1360 | | /// - If BB has a single predecessor/successor, then the weight |
1361 | | /// of that edge is the weight of the block. |
1362 | | /// |
1363 | | /// - If all the edges are known except one, and the weight of the |
1364 | | /// block is already known, the weight of the unknown edge will |
1365 | | /// be the weight of the block minus the sum of all the known |
1366 | | /// edges. If the sum of all the known edges is larger than BB's weight, |
1367 | | /// we set the unknown edge weight to zero. |
1368 | | /// |
1369 | | /// - If there is a self-referential edge, and the weight of the block is |
1370 | | /// known, the weight for that edge is set to the weight of the block |
1371 | | /// minus the weight of the other incoming edges to that block (if |
1372 | | /// known). |
1373 | | /// |
1374 | | /// Since this propagation is not guaranteed to finalize for every CFG, we |
1375 | | /// only allow it to proceed for a limited number of iterations (controlled |
1376 | | /// by -sample-profile-max-propagate-iterations). |
1377 | | /// |
1378 | | /// FIXME: Try to replace this propagation heuristic with a scheme |
1379 | | /// that is guaranteed to finalize. A work-list approach similar to |
1380 | | /// the standard value propagation algorithm used by SSA-CCP might |
1381 | | /// work here. |
1382 | | /// |
1383 | | /// Once all the branch weights are computed, we emit the MD_prof |
1384 | | /// metadata on BB using the computed values for each of its branches. |
1385 | | /// |
1386 | | /// \param F The function to query. |
1387 | | /// |
1388 | | /// \returns true if \p F was modified. Returns false, otherwise. |
1389 | 76 | bool SampleProfileLoader::emitAnnotations(Function &F) { |
1390 | 76 | bool Changed = false; |
1391 | 76 | |
1392 | 76 | if (getFunctionLoc(F) == 0) |
1393 | 5 | return false; |
1394 | 71 | |
1395 | 71 | DEBUG71 (dbgs() << "Line number for the first instruction in " << F.getName() |
1396 | 71 | << ": " << getFunctionLoc(F) << "\n"); |
1397 | 71 | |
1398 | 71 | DenseSet<GlobalValue::GUID> ImportGUIDs; |
1399 | 71 | Changed |= inlineHotFunctions(F, ImportGUIDs); |
1400 | 71 | |
1401 | 71 | // Compute basic block weights. |
1402 | 71 | Changed |= computeBlockWeights(F); |
1403 | 71 | |
1404 | 71 | if (Changed71 ) { |
1405 | 70 | // Add an entry count to the function using the samples gathered at the |
1406 | 70 | // function entry. Also sets the GUIDs that comes from a different |
1407 | 70 | // module but inlined in the profiled binary. This is aiming at making |
1408 | 70 | // the IR match the profiled binary before annotation. |
1409 | 70 | F.setEntryCount(Samples->getHeadSamples() + 1, &ImportGUIDs); |
1410 | 70 | |
1411 | 70 | // Compute dominance and loop info needed for propagation. |
1412 | 70 | computeDominanceAndLoopInfo(F); |
1413 | 70 | |
1414 | 70 | // Find equivalence classes. |
1415 | 70 | findEquivalenceClasses(F); |
1416 | 70 | |
1417 | 70 | // Propagate weights to all edges. |
1418 | 70 | propagateWeights(F); |
1419 | 70 | } |
1420 | 71 | |
1421 | 71 | // If coverage checking was requested, compute it now. |
1422 | 71 | if (SampleProfileRecordCoverage71 ) { |
1423 | 6 | unsigned Used = CoverageTracker.countUsedRecords(Samples); |
1424 | 6 | unsigned Total = CoverageTracker.countBodyRecords(Samples); |
1425 | 6 | unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); |
1426 | 6 | if (Coverage < SampleProfileRecordCoverage6 ) { |
1427 | 4 | F.getContext().diagnose(DiagnosticInfoSampleProfile( |
1428 | 4 | F.getSubprogram()->getFilename(), getFunctionLoc(F), |
1429 | 4 | Twine(Used) + " of " + Twine(Total) + " available profile records (" + |
1430 | 4 | Twine(Coverage) + "%) were applied", |
1431 | 4 | DS_Warning)); |
1432 | 4 | } |
1433 | 6 | } |
1434 | 71 | |
1435 | 71 | if (SampleProfileSampleCoverage71 ) { |
1436 | 4 | uint64_t Used = CoverageTracker.getTotalUsedSamples(); |
1437 | 4 | uint64_t Total = CoverageTracker.countBodySamples(Samples); |
1438 | 4 | unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); |
1439 | 4 | if (Coverage < SampleProfileSampleCoverage4 ) { |
1440 | 4 | F.getContext().diagnose(DiagnosticInfoSampleProfile( |
1441 | 4 | F.getSubprogram()->getFilename(), getFunctionLoc(F), |
1442 | 4 | Twine(Used) + " of " + Twine(Total) + " available profile samples (" + |
1443 | 4 | Twine(Coverage) + "%) were applied", |
1444 | 4 | DS_Warning)); |
1445 | 4 | } |
1446 | 4 | } |
1447 | 76 | return Changed; |
1448 | 76 | } |
1449 | | |
1450 | | char SampleProfileLoaderLegacyPass::ID = 0; |
1451 | 7.92k | INITIALIZE_PASS_BEGIN7.92k (SampleProfileLoaderLegacyPass, "sample-profile",
|
1452 | 7.92k | "Sample Profile loader", false, false) |
1453 | 7.92k | INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) |
1454 | 7.92k | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) |
1455 | 7.92k | INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", |
1456 | | "Sample Profile loader", false, false) |
1457 | | |
1458 | 71 | bool SampleProfileLoader::doInitialization(Module &M) { |
1459 | 71 | auto &Ctx = M.getContext(); |
1460 | 71 | auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx); |
1461 | 71 | if (std::error_code EC71 = ReaderOrErr.getError()) { |
1462 | 4 | std::string Msg = "Could not open profile: " + EC.message(); |
1463 | 4 | Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); |
1464 | 4 | return false; |
1465 | 4 | } |
1466 | 67 | Reader = std::move(ReaderOrErr.get()); |
1467 | 67 | ProfileIsValid = (Reader->read() == sampleprof_error::success); |
1468 | 67 | return true; |
1469 | 67 | } |
1470 | | |
1471 | 0 | ModulePass *llvm::createSampleProfileLoaderPass() { |
1472 | 0 | return new SampleProfileLoaderLegacyPass(SampleProfileFile); |
1473 | 0 | } |
1474 | | |
1475 | 8 | ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) { |
1476 | 8 | return new SampleProfileLoaderLegacyPass(Name); |
1477 | 8 | } |
1478 | | |
1479 | 59 | bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM) { |
1480 | 59 | if (!ProfileIsValid) |
1481 | 0 | return false; |
1482 | 59 | |
1483 | 59 | // Compute the total number of samples collected in this profile. |
1484 | 59 | for (const auto &I : Reader->getProfiles()) |
1485 | 88 | TotalCollectedSamples += I.second.getTotalSamples(); |
1486 | 59 | |
1487 | 59 | // Populate the symbol map. |
1488 | 221 | for (const auto &N_F : M.getValueSymbolTable()) { |
1489 | 221 | std::string OrigName = N_F.getKey(); |
1490 | 221 | Function *F = dyn_cast<Function>(N_F.getValue()); |
1491 | 221 | if (F == nullptr) |
1492 | 26 | continue; |
1493 | 195 | SymbolMap[OrigName] = F; |
1494 | 195 | auto pos = OrigName.find('.'); |
1495 | 195 | if (pos != std::string::npos195 ) { |
1496 | 36 | std::string NewName = OrigName.substr(0, pos); |
1497 | 36 | auto r = SymbolMap.insert(std::make_pair(NewName, F)); |
1498 | 36 | // Failiing to insert means there is already an entry in SymbolMap, |
1499 | 36 | // thus there are multiple functions that are mapped to the same |
1500 | 36 | // stripped name. In this case of name conflicting, set the value |
1501 | 36 | // to nullptr to avoid confusion. |
1502 | 36 | if (!r.second) |
1503 | 15 | r.first->second = nullptr; |
1504 | 36 | } |
1505 | 221 | } |
1506 | 59 | |
1507 | 59 | bool retval = false; |
1508 | 59 | for (auto &F : M) |
1509 | 201 | if (201 !F.isDeclaration()201 ) { |
1510 | 119 | clearFunctionData(); |
1511 | 119 | retval |= runOnFunction(F, AM); |
1512 | 119 | } |
1513 | 59 | if (M.getProfileSummary() == nullptr) |
1514 | 58 | M.setProfileSummary(Reader->getSummary().getMD(M.getContext())); |
1515 | 59 | return retval; |
1516 | 59 | } |
1517 | | |
1518 | 35 | bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { |
1519 | 35 | ACT = &getAnalysis<AssumptionCacheTracker>(); |
1520 | 35 | TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); |
1521 | 35 | return SampleLoader.runOnModule(M, nullptr); |
1522 | 35 | } |
1523 | | |
1524 | 119 | bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { |
1525 | 119 | F.setEntryCount(0); |
1526 | 119 | std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; |
1527 | 119 | if (AM119 ) { |
1528 | 43 | auto &FAM = |
1529 | 43 | AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent()) |
1530 | 43 | .getManager(); |
1531 | 43 | ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); |
1532 | 119 | } else { |
1533 | 76 | OwnedORE = make_unique<OptimizationRemarkEmitter>(&F); |
1534 | 76 | ORE = OwnedORE.get(); |
1535 | 76 | } |
1536 | 119 | Samples = Reader->getSamplesFor(F); |
1537 | 119 | if (Samples && 119 !Samples->empty()81 ) |
1538 | 76 | return emitAnnotations(F); |
1539 | 43 | return false; |
1540 | 43 | } |
1541 | | |
1542 | | PreservedAnalyses SampleProfileLoaderPass::run(Module &M, |
1543 | 30 | ModuleAnalysisManager &AM) { |
1544 | 30 | FunctionAnalysisManager &FAM = |
1545 | 30 | AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); |
1546 | 30 | |
1547 | 8 | auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { |
1548 | 8 | return FAM.getResult<AssumptionAnalysis>(F); |
1549 | 8 | }; |
1550 | 4 | auto GetTTI = [&](Function &F) -> TargetTransformInfo & { |
1551 | 4 | return FAM.getResult<TargetIRAnalysis>(F); |
1552 | 4 | }; |
1553 | 30 | |
1554 | 24 | SampleProfileLoader SampleLoader(ProfileFileName.empty() ? SampleProfileFile |
1555 | 6 | : ProfileFileName, |
1556 | 30 | GetAssumptionCache, GetTTI); |
1557 | 30 | |
1558 | 30 | SampleLoader.doInitialization(M); |
1559 | 30 | |
1560 | 30 | if (!SampleLoader.runOnModule(M, &AM)) |
1561 | 5 | return PreservedAnalyses::all(); |
1562 | 25 | |
1563 | 25 | return PreservedAnalyses::none(); |
1564 | 25 | } |