/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/IPO/SampleProfile.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This file implements the SampleProfileLoader transformation. This pass |
10 | | // reads a profile file generated by a sampling profiler (e.g. Linux Perf - |
11 | | // http://perf.wiki.kernel.org/) and generates IR metadata to reflect the |
12 | | // profile information in the given profile. |
13 | | // |
14 | | // This pass generates branch weight annotations on the IR: |
15 | | // |
16 | | // - prof: Represents branch weights. This annotation is added to branches |
17 | | // to indicate the weights of each edge coming out of the branch. |
18 | | // The weight of each edge is the weight of the target block for |
19 | | // that edge. The weight of a block B is computed as the maximum |
20 | | // number of samples found in B. |
21 | | // |
22 | | //===----------------------------------------------------------------------===// |
23 | | |
24 | | #include "llvm/Transforms/IPO/SampleProfile.h" |
25 | | #include "llvm/ADT/ArrayRef.h" |
26 | | #include "llvm/ADT/DenseMap.h" |
27 | | #include "llvm/ADT/DenseSet.h" |
28 | | #include "llvm/ADT/None.h" |
29 | | #include "llvm/ADT/SmallPtrSet.h" |
30 | | #include "llvm/ADT/SmallSet.h" |
31 | | #include "llvm/ADT/SmallVector.h" |
32 | | #include "llvm/ADT/StringMap.h" |
33 | | #include "llvm/ADT/StringRef.h" |
34 | | #include "llvm/ADT/Twine.h" |
35 | | #include "llvm/Analysis/AssumptionCache.h" |
36 | | #include "llvm/Analysis/InlineCost.h" |
37 | | #include "llvm/Analysis/LoopInfo.h" |
38 | | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
39 | | #include "llvm/Analysis/PostDominators.h" |
40 | | #include "llvm/Analysis/ProfileSummaryInfo.h" |
41 | | #include "llvm/Analysis/TargetTransformInfo.h" |
42 | | #include "llvm/IR/BasicBlock.h" |
43 | | #include "llvm/IR/CFG.h" |
44 | | #include "llvm/IR/CallSite.h" |
45 | | #include "llvm/IR/DebugInfoMetadata.h" |
46 | | #include "llvm/IR/DebugLoc.h" |
47 | | #include "llvm/IR/DiagnosticInfo.h" |
48 | | #include "llvm/IR/Dominators.h" |
49 | | #include "llvm/IR/Function.h" |
50 | | #include "llvm/IR/GlobalValue.h" |
51 | | #include "llvm/IR/InstrTypes.h" |
52 | | #include "llvm/IR/Instruction.h" |
53 | | #include "llvm/IR/Instructions.h" |
54 | | #include "llvm/IR/IntrinsicInst.h" |
55 | | #include "llvm/IR/LLVMContext.h" |
56 | | #include "llvm/IR/MDBuilder.h" |
57 | | #include "llvm/IR/Module.h" |
58 | | #include "llvm/IR/PassManager.h" |
59 | | #include "llvm/IR/ValueSymbolTable.h" |
60 | | #include "llvm/Pass.h" |
61 | | #include "llvm/ProfileData/InstrProf.h" |
62 | | #include "llvm/ProfileData/SampleProf.h" |
63 | | #include "llvm/ProfileData/SampleProfReader.h" |
64 | | #include "llvm/Support/Casting.h" |
65 | | #include "llvm/Support/CommandLine.h" |
66 | | #include "llvm/Support/Debug.h" |
67 | | #include "llvm/Support/ErrorHandling.h" |
68 | | #include "llvm/Support/ErrorOr.h" |
69 | | #include "llvm/Support/GenericDomTree.h" |
70 | | #include "llvm/Support/raw_ostream.h" |
71 | | #include "llvm/Transforms/IPO.h" |
72 | | #include "llvm/Transforms/Instrumentation.h" |
73 | | #include "llvm/Transforms/Utils/CallPromotionUtils.h" |
74 | | #include "llvm/Transforms/Utils/Cloning.h" |
75 | | #include <algorithm> |
76 | | #include <cassert> |
77 | | #include <cstdint> |
78 | | #include <functional> |
79 | | #include <limits> |
80 | | #include <map> |
81 | | #include <memory> |
82 | | #include <string> |
83 | | #include <system_error> |
84 | | #include <utility> |
85 | | #include <vector> |
86 | | |
87 | | using namespace llvm; |
88 | | using namespace sampleprof; |
89 | | using ProfileCount = Function::ProfileCount; |
90 | 86 | #define DEBUG_TYPE "sample-profile" |
91 | | |
92 | | // Command line option to specify the file to read samples from. This is |
93 | | // mainly used for debugging. |
94 | | static cl::opt<std::string> SampleProfileFile( |
95 | | "sample-profile-file", cl::init(""), cl::value_desc("filename"), |
96 | | cl::desc("Profile file loaded by -sample-profile"), cl::Hidden); |
97 | | |
98 | | // The named file contains a set of transformations that may have been applied |
99 | | // to the symbol names between the program from which the sample data was |
100 | | // collected and the current program's symbols. |
101 | | static cl::opt<std::string> SampleProfileRemappingFile( |
102 | | "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), |
103 | | cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden); |
104 | | |
105 | | static cl::opt<unsigned> SampleProfileMaxPropagateIterations( |
106 | | "sample-profile-max-propagate-iterations", cl::init(100), |
107 | | cl::desc("Maximum number of iterations to go through when propagating " |
108 | | "sample block/edge weights through the CFG.")); |
109 | | |
110 | | static cl::opt<unsigned> SampleProfileRecordCoverage( |
111 | | "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"), |
112 | | cl::desc("Emit a warning if less than N% of records in the input profile " |
113 | | "are matched to the IR.")); |
114 | | |
115 | | static cl::opt<unsigned> SampleProfileSampleCoverage( |
116 | | "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"), |
117 | | cl::desc("Emit a warning if less than N% of samples in the input profile " |
118 | | "are matched to the IR.")); |
119 | | |
120 | | static cl::opt<bool> NoWarnSampleUnused( |
121 | | "no-warn-sample-unused", cl::init(false), cl::Hidden, |
122 | | cl::desc("Use this option to turn off/on warnings about function with " |
123 | | "samples but without debug information to use those samples. ")); |
124 | | |
125 | | static cl::opt<bool> ProfileSampleAccurate( |
126 | | "profile-sample-accurate", cl::Hidden, cl::init(false), |
127 | | cl::desc("If the sample profile is accurate, we will mark all un-sampled " |
128 | | "callsite and function as having 0 samples. Otherwise, treat " |
129 | | "un-sampled callsites and functions conservatively as unknown. ")); |
130 | | |
131 | | namespace { |
132 | | |
133 | | using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>; |
134 | | using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>; |
135 | | using Edge = std::pair<const BasicBlock *, const BasicBlock *>; |
136 | | using EdgeWeightMap = DenseMap<Edge, uint64_t>; |
137 | | using BlockEdgeMap = |
138 | | DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>; |
139 | | |
140 | | class SampleCoverageTracker { |
141 | | public: |
142 | 106 | SampleCoverageTracker() = default; |
143 | | |
144 | | bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset, |
145 | | uint32_t Discriminator, uint64_t Samples); |
146 | | unsigned computeCoverage(unsigned Used, unsigned Total) const; |
147 | | unsigned countUsedRecords(const FunctionSamples *FS, |
148 | | ProfileSummaryInfo *PSI) const; |
149 | | unsigned countBodyRecords(const FunctionSamples *FS, |
150 | | ProfileSummaryInfo *PSI) const; |
151 | 4 | uint64_t getTotalUsedSamples() const { return TotalUsedSamples; } |
152 | | uint64_t countBodySamples(const FunctionSamples *FS, |
153 | | ProfileSummaryInfo *PSI) const; |
154 | | |
155 | 203 | void clear() { |
156 | 203 | SampleCoverage.clear(); |
157 | 203 | TotalUsedSamples = 0; |
158 | 203 | } |
159 | | |
160 | | private: |
161 | | using BodySampleCoverageMap = std::map<LineLocation, unsigned>; |
162 | | using FunctionSamplesCoverageMap = |
163 | | DenseMap<const FunctionSamples *, BodySampleCoverageMap>; |
164 | | |
165 | | /// Coverage map for sampling records. |
166 | | /// |
167 | | /// This map keeps a record of sampling records that have been matched to |
168 | | /// an IR instruction. This is used to detect some form of staleness in |
169 | | /// profiles (see flag -sample-profile-check-coverage). |
170 | | /// |
171 | | /// Each entry in the map corresponds to a FunctionSamples instance. This is |
172 | | /// another map that counts how many times the sample record at the |
173 | | /// given location has been used. |
174 | | FunctionSamplesCoverageMap SampleCoverage; |
175 | | |
176 | | /// Number of samples used from the profile. |
177 | | /// |
178 | | /// When a sampling record is used for the first time, the samples from |
179 | | /// that record are added to this accumulator. Coverage is later computed |
180 | | /// based on the total number of samples available in this function and |
181 | | /// its callsites. |
182 | | /// |
183 | | /// Note that this accumulator tracks samples used from a single function |
184 | | /// and all the inlined callsites. Strictly, we should have a map of counters |
185 | | /// keyed by FunctionSamples pointers, but these stats are cleared after |
186 | | /// every function, so we just need to keep a single counter. |
187 | | uint64_t TotalUsedSamples = 0; |
188 | | }; |
189 | | |
190 | | /// Sample profile pass. |
191 | | /// |
192 | | /// This pass reads profile data from the file specified by |
193 | | /// -sample-profile-file and annotates every affected function with the |
194 | | /// profile information found in that file. |
195 | | class SampleProfileLoader { |
196 | | public: |
197 | | SampleProfileLoader( |
198 | | StringRef Name, StringRef RemapName, bool IsThinLTOPreLink, |
199 | | std::function<AssumptionCache &(Function &)> GetAssumptionCache, |
200 | | std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo) |
201 | | : GetAC(std::move(GetAssumptionCache)), |
202 | | GetTTI(std::move(GetTargetTransformInfo)), Filename(Name), |
203 | 106 | RemappingFilename(RemapName), IsThinLTOPreLink(IsThinLTOPreLink) {} |
204 | | |
205 | | bool doInitialization(Module &M); |
206 | | bool runOnModule(Module &M, ModuleAnalysisManager *AM, |
207 | | ProfileSummaryInfo *_PSI); |
208 | | |
209 | 0 | void dump() { Reader->dump(); } |
210 | | |
211 | | protected: |
212 | | bool runOnFunction(Function &F, ModuleAnalysisManager *AM); |
213 | | unsigned getFunctionLoc(Function &F); |
214 | | bool emitAnnotations(Function &F); |
215 | | ErrorOr<uint64_t> getInstWeight(const Instruction &I); |
216 | | ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB); |
217 | | const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const; |
218 | | std::vector<const FunctionSamples *> |
219 | | findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const; |
220 | | mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap; |
221 | | const FunctionSamples *findFunctionSamples(const Instruction &I) const; |
222 | | bool inlineCallInstruction(Instruction *I); |
223 | | bool inlineHotFunctions(Function &F, |
224 | | DenseSet<GlobalValue::GUID> &InlinedGUIDs); |
225 | | void printEdgeWeight(raw_ostream &OS, Edge E); |
226 | | void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; |
227 | | void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); |
228 | | bool computeBlockWeights(Function &F); |
229 | | void findEquivalenceClasses(Function &F); |
230 | | template <bool IsPostDom> |
231 | | void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants, |
232 | | DominatorTreeBase<BasicBlock, IsPostDom> *DomTree); |
233 | | |
234 | | void propagateWeights(Function &F); |
235 | | uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); |
236 | | void buildEdges(Function &F); |
237 | | bool propagateThroughEdges(Function &F, bool UpdateBlockCount); |
238 | | void computeDominanceAndLoopInfo(Function &F); |
239 | | void clearFunctionData(); |
240 | | |
241 | | /// Map basic blocks to their computed weights. |
242 | | /// |
243 | | /// The weight of a basic block is defined to be the maximum |
244 | | /// of all the instruction weights in that block. |
245 | | BlockWeightMap BlockWeights; |
246 | | |
247 | | /// Map edges to their computed weights. |
248 | | /// |
249 | | /// Edge weights are computed by propagating basic block weights in |
250 | | /// SampleProfile::propagateWeights. |
251 | | EdgeWeightMap EdgeWeights; |
252 | | |
253 | | /// Set of visited blocks during propagation. |
254 | | SmallPtrSet<const BasicBlock *, 32> VisitedBlocks; |
255 | | |
256 | | /// Set of visited edges during propagation. |
257 | | SmallSet<Edge, 32> VisitedEdges; |
258 | | |
259 | | /// Equivalence classes for block weights. |
260 | | /// |
261 | | /// Two blocks BB1 and BB2 are in the same equivalence class if they |
262 | | /// dominate and post-dominate each other, and they are in the same loop |
263 | | /// nest. When this happens, the two blocks are guaranteed to execute |
264 | | /// the same number of times. |
265 | | EquivalenceClassMap EquivalenceClass; |
266 | | |
267 | | /// Map from function name to Function *. Used to find the function from |
268 | | /// the function name. If the function name contains suffix, additional |
269 | | /// entry is added to map from the stripped name to the function if there |
270 | | /// is one-to-one mapping. |
271 | | StringMap<Function *> SymbolMap; |
272 | | |
273 | | /// Dominance, post-dominance and loop information. |
274 | | std::unique_ptr<DominatorTree> DT; |
275 | | std::unique_ptr<PostDominatorTree> PDT; |
276 | | std::unique_ptr<LoopInfo> LI; |
277 | | |
278 | | std::function<AssumptionCache &(Function &)> GetAC; |
279 | | std::function<TargetTransformInfo &(Function &)> GetTTI; |
280 | | |
281 | | /// Predecessors for each basic block in the CFG. |
282 | | BlockEdgeMap Predecessors; |
283 | | |
284 | | /// Successors for each basic block in the CFG. |
285 | | BlockEdgeMap Successors; |
286 | | |
287 | | SampleCoverageTracker CoverageTracker; |
288 | | |
289 | | /// Profile reader object. |
290 | | std::unique_ptr<SampleProfileReader> Reader; |
291 | | |
292 | | /// Samples collected for the body of this function. |
293 | | FunctionSamples *Samples = nullptr; |
294 | | |
295 | | /// Name of the profile file to load. |
296 | | std::string Filename; |
297 | | |
298 | | /// Name of the profile remapping file to load. |
299 | | std::string RemappingFilename; |
300 | | |
301 | | /// Flag indicating whether the profile input loaded successfully. |
302 | | bool ProfileIsValid = false; |
303 | | |
304 | | /// Flag indicating if the pass is invoked in ThinLTO compile phase. |
305 | | /// |
306 | | /// In this phase, in annotation, we should not promote indirect calls. |
307 | | /// Instead, we will mark GUIDs that needs to be annotated to the function. |
308 | | bool IsThinLTOPreLink; |
309 | | |
310 | | /// Profile Summary Info computed from sample profile. |
311 | | ProfileSummaryInfo *PSI = nullptr; |
312 | | |
313 | | /// Total number of samples collected in this profile. |
314 | | /// |
315 | | /// This is the sum of all the samples collected in all the functions executed |
316 | | /// at runtime. |
317 | | uint64_t TotalCollectedSamples = 0; |
318 | | |
319 | | /// Optimization Remark Emitter used to emit diagnostic remarks. |
320 | | OptimizationRemarkEmitter *ORE = nullptr; |
321 | | |
322 | | // Information recorded when we declined to inline a call site |
323 | | // because we have determined it is too cold is accumulated for |
324 | | // each callee function. Initially this is just the entry count. |
325 | | struct NotInlinedProfileInfo { |
326 | | uint64_t entryCount; |
327 | | }; |
328 | | DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo; |
329 | | }; |
330 | | |
331 | | class SampleProfileLoaderLegacyPass : public ModulePass { |
332 | | public: |
333 | | // Class identification, replacement for typeinfo |
334 | | static char ID; |
335 | | |
336 | | SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile, |
337 | | bool IsThinLTOPreLink = false) |
338 | | : ModulePass(ID), |
339 | | SampleLoader(Name, SampleProfileRemappingFile, IsThinLTOPreLink, |
340 | 48 | [&](Function &F) -> AssumptionCache & { |
341 | 48 | return ACT->getAssumptionCache(F); |
342 | 48 | }, |
343 | 26 | [&](Function &F) -> TargetTransformInfo & { |
344 | 26 | return TTIWP->getTTI(F); |
345 | 62 | }) { |
346 | 62 | initializeSampleProfileLoaderLegacyPassPass( |
347 | 62 | *PassRegistry::getPassRegistry()); |
348 | 62 | } |
349 | | |
350 | 0 | void dump() { SampleLoader.dump(); } |
351 | | |
352 | 62 | bool doInitialization(Module &M) override { |
353 | 62 | return SampleLoader.doInitialization(M); |
354 | 62 | } |
355 | | |
356 | 1 | StringRef getPassName() const override { return "Sample profile pass"; } |
357 | | bool runOnModule(Module &M) override; |
358 | | |
359 | 62 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
360 | 62 | AU.addRequired<AssumptionCacheTracker>(); |
361 | 62 | AU.addRequired<TargetTransformInfoWrapperPass>(); |
362 | 62 | AU.addRequired<ProfileSummaryInfoWrapperPass>(); |
363 | 62 | } |
364 | | |
365 | | private: |
366 | | SampleProfileLoader SampleLoader; |
367 | | AssumptionCacheTracker *ACT = nullptr; |
368 | | TargetTransformInfoWrapperPass *TTIWP = nullptr; |
369 | | }; |
370 | | |
371 | | } // end anonymous namespace |
372 | | |
373 | | /// Return true if the given callsite is hot wrt to hot cutoff threshold. |
374 | | /// |
375 | | /// Functions that were inlined in the original binary will be represented |
376 | | /// in the inline stack in the sample profile. If the profile shows that |
377 | | /// the original inline decision was "good" (i.e., the callsite is executed |
378 | | /// frequently), then we will recreate the inline decision and apply the |
379 | | /// profile from the inlined callsite. |
380 | | /// |
381 | | /// To decide whether an inlined callsite is hot, we compare the callsite |
382 | | /// sample count with the hot cutoff computed by ProfileSummaryInfo, it is |
383 | | /// regarded as hot if the count is above the cutoff value. |
384 | | static bool callsiteIsHot(const FunctionSamples *CallsiteFS, |
385 | 98 | ProfileSummaryInfo *PSI) { |
386 | 98 | if (!CallsiteFS) |
387 | 0 | return false; // The callsite was not inlined in the original binary. |
388 | 98 | |
389 | 98 | assert(PSI && "PSI is expected to be non null"); |
390 | 98 | uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); |
391 | 98 | return PSI->isHotCount(CallsiteTotalSamples); |
392 | 98 | } |
393 | | |
394 | | /// Mark as used the sample record for the given function samples at |
395 | | /// (LineOffset, Discriminator). |
396 | | /// |
397 | | /// \returns true if this is the first time we mark the given record. |
398 | | bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS, |
399 | | uint32_t LineOffset, |
400 | | uint32_t Discriminator, |
401 | 763 | uint64_t Samples) { |
402 | 763 | LineLocation Loc(LineOffset, Discriminator); |
403 | 763 | unsigned &Count = SampleCoverage[FS][Loc]; |
404 | 763 | bool FirstTime = (++Count == 1); |
405 | 763 | if (FirstTime) |
406 | 249 | TotalUsedSamples += Samples; |
407 | 763 | return FirstTime; |
408 | 763 | } |
409 | | |
410 | | /// Return the number of sample records that were applied from this profile. |
411 | | /// |
412 | | /// This count does not include records from cold inlined callsites. |
413 | | unsigned |
414 | | SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS, |
415 | 8 | ProfileSummaryInfo *PSI) const { |
416 | 8 | auto I = SampleCoverage.find(FS); |
417 | 8 | |
418 | 8 | // The size of the coverage map for FS represents the number of records |
419 | 8 | // that were marked used at least once. |
420 | 8 | unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 00 ; |
421 | 8 | |
422 | 8 | // If there are inlined callsites in this function, count the samples found |
423 | 8 | // in the respective bodies. However, do not bother counting callees with 0 |
424 | 8 | // total samples, these are callees that were never invoked at runtime. |
425 | 8 | for (const auto &I : FS->getCallsiteSamples()) |
426 | 4 | for (const auto &J : I.second) { |
427 | 4 | const FunctionSamples *CalleeSamples = &J.second; |
428 | 4 | if (callsiteIsHot(CalleeSamples, PSI)) |
429 | 2 | Count += countUsedRecords(CalleeSamples, PSI); |
430 | 4 | } |
431 | 8 | |
432 | 8 | return Count; |
433 | 8 | } |
434 | | |
435 | | /// Return the number of sample records in the body of this profile. |
436 | | /// |
437 | | /// This count does not include records from cold inlined callsites. |
438 | | unsigned |
439 | | SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS, |
440 | 8 | ProfileSummaryInfo *PSI) const { |
441 | 8 | unsigned Count = FS->getBodySamples().size(); |
442 | 8 | |
443 | 8 | // Only count records in hot callsites. |
444 | 8 | for (const auto &I : FS->getCallsiteSamples()) |
445 | 4 | for (const auto &J : I.second) { |
446 | 4 | const FunctionSamples *CalleeSamples = &J.second; |
447 | 4 | if (callsiteIsHot(CalleeSamples, PSI)) |
448 | 2 | Count += countBodyRecords(CalleeSamples, PSI); |
449 | 4 | } |
450 | 8 | |
451 | 8 | return Count; |
452 | 8 | } |
453 | | |
454 | | /// Return the number of samples collected in the body of this profile. |
455 | | /// |
456 | | /// This count does not include samples from cold inlined callsites. |
457 | | uint64_t |
458 | | SampleCoverageTracker::countBodySamples(const FunctionSamples *FS, |
459 | 6 | ProfileSummaryInfo *PSI) const { |
460 | 6 | uint64_t Total = 0; |
461 | 6 | for (const auto &I : FS->getBodySamples()) |
462 | 16 | Total += I.second.getSamples(); |
463 | 6 | |
464 | 6 | // Only count samples in hot callsites. |
465 | 6 | for (const auto &I : FS->getCallsiteSamples()) |
466 | 2 | for (const auto &J : I.second) { |
467 | 2 | const FunctionSamples *CalleeSamples = &J.second; |
468 | 2 | if (callsiteIsHot(CalleeSamples, PSI)) |
469 | 2 | Total += countBodySamples(CalleeSamples, PSI); |
470 | 2 | } |
471 | 6 | |
472 | 6 | return Total; |
473 | 6 | } |
474 | | |
475 | | /// Return the fraction of sample records used in this profile. |
476 | | /// |
477 | | /// The returned value is an unsigned integer in the range 0-100 indicating |
478 | | /// the percentage of sample records that were used while applying this |
479 | | /// profile to the associated function. |
480 | | unsigned SampleCoverageTracker::computeCoverage(unsigned Used, |
481 | 10 | unsigned Total) const { |
482 | 10 | assert(Used <= Total && |
483 | 10 | "number of used records cannot exceed the total number of records"); |
484 | 10 | return Total > 0 ? Used * 100 / Total : 1000 ; |
485 | 10 | } |
486 | | |
487 | | /// Clear all the per-function data used to load samples and propagate weights. |
488 | 203 | void SampleProfileLoader::clearFunctionData() { |
489 | 203 | BlockWeights.clear(); |
490 | 203 | EdgeWeights.clear(); |
491 | 203 | VisitedBlocks.clear(); |
492 | 203 | VisitedEdges.clear(); |
493 | 203 | EquivalenceClass.clear(); |
494 | 203 | DT = nullptr; |
495 | 203 | PDT = nullptr; |
496 | 203 | LI = nullptr; |
497 | 203 | Predecessors.clear(); |
498 | 203 | Successors.clear(); |
499 | 203 | CoverageTracker.clear(); |
500 | 203 | } |
501 | | |
502 | | #ifndef NDEBUG |
503 | | /// Print the weight of edge \p E on stream \p OS. |
504 | | /// |
505 | | /// \param OS Stream to emit the output to. |
506 | | /// \param E Edge to print. |
507 | | void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) { |
508 | | OS << "weight[" << E.first->getName() << "->" << E.second->getName() |
509 | | << "]: " << EdgeWeights[E] << "\n"; |
510 | | } |
511 | | |
512 | | /// Print the equivalence class of block \p BB on stream \p OS. |
513 | | /// |
514 | | /// \param OS Stream to emit the output to. |
515 | | /// \param BB Block to print. |
516 | | void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS, |
517 | | const BasicBlock *BB) { |
518 | | const BasicBlock *Equiv = EquivalenceClass[BB]; |
519 | | OS << "equivalence[" << BB->getName() |
520 | | << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n"; |
521 | | } |
522 | | |
523 | | /// Print the weight of block \p BB on stream \p OS. |
524 | | /// |
525 | | /// \param OS Stream to emit the output to. |
526 | | /// \param BB Block to print. |
527 | | void SampleProfileLoader::printBlockWeight(raw_ostream &OS, |
528 | | const BasicBlock *BB) const { |
529 | | const auto &I = BlockWeights.find(BB); |
530 | | uint64_t W = (I == BlockWeights.end() ? 0 : I->second); |
531 | | OS << "weight[" << BB->getName() << "]: " << W << "\n"; |
532 | | } |
533 | | #endif |
534 | | |
535 | | /// Get the weight for an instruction. |
536 | | /// |
537 | | /// The "weight" of an instruction \p Inst is the number of samples |
538 | | /// collected on that instruction at runtime. To retrieve it, we |
539 | | /// need to compute the line number of \p Inst relative to the start of its |
540 | | /// function. We use HeaderLineno to compute the offset. We then |
541 | | /// look up the samples collected for \p Inst using BodySamples. |
542 | | /// |
543 | | /// \param Inst Instruction to query. |
544 | | /// |
545 | | /// \returns the weight of \p Inst. |
546 | 1.81k | ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { |
547 | 1.81k | const DebugLoc &DLoc = Inst.getDebugLoc(); |
548 | 1.81k | if (!DLoc) |
549 | 404 | return std::error_code(); |
550 | 1.40k | |
551 | 1.40k | const FunctionSamples *FS = findFunctionSamples(Inst); |
552 | 1.40k | if (!FS) |
553 | 0 | return std::error_code(); |
554 | 1.40k | |
555 | 1.40k | // Ignore all intrinsics, phinodes and branch instructions. |
556 | 1.40k | // Branch and phinodes instruction usually contains debug info from sources outside of |
557 | 1.40k | // the residing basic block, thus we ignore them during annotation. |
558 | 1.40k | if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst)1.14k || isa<PHINode>(Inst)1.01k ) |
559 | 406 | return std::error_code(); |
560 | 1.00k | |
561 | 1.00k | // If a direct call/invoke instruction is inlined in profile |
562 | 1.00k | // (findCalleeFunctionSamples returns non-empty result), but not inlined here, |
563 | 1.00k | // it means that the inlined callsite has no sample, thus the call |
564 | 1.00k | // instruction should have 0 count. |
565 | 1.00k | if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)874 ) && |
566 | 1.00k | !ImmutableCallSite(&Inst).isIndirectCall()126 && |
567 | 1.00k | findCalleeFunctionSamples(Inst)91 ) |
568 | 17 | return 0; |
569 | 983 | |
570 | 983 | const DILocation *DIL = DLoc; |
571 | 983 | uint32_t LineOffset = FunctionSamples::getOffset(DIL); |
572 | 983 | uint32_t Discriminator = DIL->getBaseDiscriminator(); |
573 | 983 | ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator); |
574 | 983 | if (R) { |
575 | 763 | bool FirstMark = |
576 | 763 | CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get()); |
577 | 763 | if (FirstMark) { |
578 | 249 | ORE->emit([&]() { |
579 | 38 | OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst); |
580 | 38 | Remark << "Applied " << ore::NV("NumSamples", *R); |
581 | 38 | Remark << " samples from profile (offset: "; |
582 | 38 | Remark << ore::NV("LineOffset", LineOffset); |
583 | 38 | if (Discriminator) { |
584 | 6 | Remark << "."; |
585 | 6 | Remark << ore::NV("Discriminator", Discriminator); |
586 | 6 | } |
587 | 38 | Remark << ")"; |
588 | 38 | return Remark; |
589 | 38 | }); |
590 | 249 | } |
591 | 763 | LLVM_DEBUG(dbgs() << " " << DLoc.getLine() << "." |
592 | 763 | << DIL->getBaseDiscriminator() << ":" << Inst |
593 | 763 | << " (line offset: " << LineOffset << "." |
594 | 763 | << DIL->getBaseDiscriminator() << " - weight: " << R.get() |
595 | 763 | << ")\n"); |
596 | 763 | } |
597 | 983 | return R; |
598 | 983 | } |
599 | | |
600 | | /// Compute the weight of a basic block. |
601 | | /// |
602 | | /// The weight of basic block \p BB is the maximum weight of all the |
603 | | /// instructions in BB. |
604 | | /// |
605 | | /// \param BB The basic block to query. |
606 | | /// |
607 | | /// \returns the weight for \p BB. |
608 | 414 | ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) { |
609 | 414 | uint64_t Max = 0; |
610 | 414 | bool HasWeight = false; |
611 | 1.81k | for (auto &I : BB->getInstList()) { |
612 | 1.81k | const ErrorOr<uint64_t> &R = getInstWeight(I); |
613 | 1.81k | if (R) { |
614 | 780 | Max = std::max(Max, R.get()); |
615 | 780 | HasWeight = true; |
616 | 780 | } |
617 | 1.81k | } |
618 | 414 | return HasWeight ? ErrorOr<uint64_t>(Max)249 : std::error_code()165 ; |
619 | 414 | } |
620 | | |
621 | | /// Compute and store the weights of every basic block. |
622 | | /// |
623 | | /// This populates the BlockWeights map by computing |
624 | | /// the weights of every basic block in the CFG. |
625 | | /// |
626 | | /// \param F The function to query. |
627 | 110 | bool SampleProfileLoader::computeBlockWeights(Function &F) { |
628 | 110 | bool Changed = false; |
629 | 110 | LLVM_DEBUG(dbgs() << "Block weights\n"); |
630 | 414 | for (const auto &BB : F) { |
631 | 414 | ErrorOr<uint64_t> Weight = getBlockWeight(&BB); |
632 | 414 | if (Weight) { |
633 | 249 | BlockWeights[&BB] = Weight.get(); |
634 | 249 | VisitedBlocks.insert(&BB); |
635 | 249 | Changed = true; |
636 | 249 | } |
637 | 414 | LLVM_DEBUG(printBlockWeight(dbgs(), &BB)); |
638 | 414 | } |
639 | 110 | |
640 | 110 | return Changed; |
641 | 110 | } |
642 | | |
643 | | /// Get the FunctionSamples for a call instruction. |
644 | | /// |
645 | | /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined |
646 | | /// instance in which that call instruction is calling to. It contains |
647 | | /// all samples that resides in the inlined instance. We first find the |
648 | | /// inlined instance in which the call instruction is from, then we |
649 | | /// traverse its children to find the callsite with the matching |
650 | | /// location. |
651 | | /// |
652 | | /// \param Inst Call/Invoke instruction to query. |
653 | | /// |
654 | | /// \returns The FunctionSamples pointer to the inlined instance. |
655 | | const FunctionSamples * |
656 | 265 | SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const { |
657 | 265 | const DILocation *DIL = Inst.getDebugLoc(); |
658 | 265 | if (!DIL) { |
659 | 1 | return nullptr; |
660 | 1 | } |
661 | 264 | |
662 | 264 | StringRef CalleeName; |
663 | 264 | if (const CallInst *CI = dyn_cast<CallInst>(&Inst)) |
664 | 263 | if (Function *Callee = CI->getCalledFunction()) |
665 | 214 | CalleeName = Callee->getName(); |
666 | 264 | |
667 | 264 | const FunctionSamples *FS = findFunctionSamples(Inst); |
668 | 264 | if (FS == nullptr) |
669 | 0 | return nullptr; |
670 | 264 | |
671 | 264 | return FS->findFunctionSamplesAt(LineLocation(FunctionSamples::getOffset(DIL), |
672 | 264 | DIL->getBaseDiscriminator()), |
673 | 264 | CalleeName); |
674 | 264 | } |
675 | | |
676 | | /// Returns a vector of FunctionSamples that are the indirect call targets |
677 | | /// of \p Inst. The vector is sorted by the total number of samples. Stores |
678 | | /// the total call count of the indirect call in \p Sum. |
679 | | std::vector<const FunctionSamples *> |
680 | | SampleProfileLoader::findIndirectCallFunctionSamples( |
681 | 35 | const Instruction &Inst, uint64_t &Sum) const { |
682 | 35 | const DILocation *DIL = Inst.getDebugLoc(); |
683 | 35 | std::vector<const FunctionSamples *> R; |
684 | 35 | |
685 | 35 | if (!DIL) { |
686 | 0 | return R; |
687 | 0 | } |
688 | 35 | |
689 | 35 | const FunctionSamples *FS = findFunctionSamples(Inst); |
690 | 35 | if (FS == nullptr) |
691 | 0 | return R; |
692 | 35 | |
693 | 35 | uint32_t LineOffset = FunctionSamples::getOffset(DIL); |
694 | 35 | uint32_t Discriminator = DIL->getBaseDiscriminator(); |
695 | 35 | |
696 | 35 | auto T = FS->findCallTargetMapAt(LineOffset, Discriminator); |
697 | 35 | Sum = 0; |
698 | 35 | if (T) |
699 | 21 | for (const auto &T_C : T.get()) |
700 | 28 | Sum += T_C.second; |
701 | 35 | if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(LineLocation( |
702 | 22 | FunctionSamples::getOffset(DIL), DIL->getBaseDiscriminator()))) { |
703 | 22 | if (M->empty()) |
704 | 0 | return R; |
705 | 32 | for (const auto &NameFS : *M)22 { |
706 | 32 | Sum += NameFS.second.getEntrySamples(); |
707 | 32 | R.push_back(&NameFS.second); |
708 | 32 | } |
709 | 22 | llvm::sort(R, [](const FunctionSamples *L, const FunctionSamples *R) { |
710 | 10 | if (L->getEntrySamples() != R->getEntrySamples()) |
711 | 8 | return L->getEntrySamples() > R->getEntrySamples(); |
712 | 2 | return FunctionSamples::getGUID(L->getName()) < |
713 | 2 | FunctionSamples::getGUID(R->getName()); |
714 | 2 | }); |
715 | 22 | } |
716 | 35 | return R; |
717 | 35 | } |
718 | | |
719 | | /// Get the FunctionSamples for an instruction. |
720 | | /// |
721 | | /// The FunctionSamples of an instruction \p Inst is the inlined instance |
722 | | /// in which that instruction is coming from. We traverse the inline stack |
723 | | /// of that instruction, and match it with the tree nodes in the profile. |
724 | | /// |
725 | | /// \param Inst Instruction to query. |
726 | | /// |
727 | | /// \returns the FunctionSamples pointer to the inlined instance. |
728 | | const FunctionSamples * |
729 | 1.73k | SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { |
730 | 1.73k | const DILocation *DIL = Inst.getDebugLoc(); |
731 | 1.73k | if (!DIL) |
732 | 0 | return Samples; |
733 | 1.73k | |
734 | 1.73k | auto it = DILocation2SampleMap.try_emplace(DIL,nullptr); |
735 | 1.73k | if (it.second) |
736 | 792 | it.first->second = Samples->findFunctionSamples(DIL); |
737 | 1.73k | return it.first->second; |
738 | 1.73k | } |
739 | | |
740 | 34 | bool SampleProfileLoader::inlineCallInstruction(Instruction *I) { |
741 | 34 | assert(isa<CallInst>(I) || isa<InvokeInst>(I)); |
742 | 34 | CallSite CS(I); |
743 | 34 | Function *CalledFunction = CS.getCalledFunction(); |
744 | 34 | assert(CalledFunction); |
745 | 34 | DebugLoc DLoc = I->getDebugLoc(); |
746 | 34 | BasicBlock *BB = I->getParent(); |
747 | 34 | InlineParams Params = getInlineParams(); |
748 | 34 | Params.ComputeFullInlineCost = true; |
749 | 34 | // Checks if there is anything in the reachable portion of the callee at |
750 | 34 | // this callsite that makes this inlining potentially illegal. Need to |
751 | 34 | // set ComputeFullInlineCost, otherwise getInlineCost may return early |
752 | 34 | // when cost exceeds threshold without checking all IRs in the callee. |
753 | 34 | // The acutal cost does not matter because we only checks isNever() to |
754 | 34 | // see if it is legal to inline the callsite. |
755 | 34 | InlineCost Cost = |
756 | 34 | getInlineCost(cast<CallBase>(*I), Params, GetTTI(*CalledFunction), GetAC, |
757 | 34 | None, nullptr, nullptr); |
758 | 34 | if (Cost.isNever()) { |
759 | 2 | ORE->emit(OptimizationRemark(DEBUG_TYPE, "Not inline", DLoc, BB) |
760 | 2 | << "incompatible inlining"); |
761 | 2 | return false; |
762 | 2 | } |
763 | 32 | InlineFunctionInfo IFI(nullptr, &GetAC); |
764 | 32 | if (InlineFunction(CS, IFI)) { |
765 | 32 | // The call to InlineFunction erases I, so we can't pass it here. |
766 | 32 | ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB) |
767 | 32 | << "inlined hot callee '" << ore::NV("Callee", CalledFunction) |
768 | 32 | << "' into '" << ore::NV("Caller", BB->getParent()) << "'"); |
769 | 32 | return true; |
770 | 32 | } |
771 | 0 | return false; |
772 | 0 | } |
773 | | |
774 | | /// Iteratively inline hot callsites of a function. |
775 | | /// |
776 | | /// Iteratively traverse all callsites of the function \p F, and find if |
777 | | /// the corresponding inlined instance exists and is hot in profile. If |
778 | | /// it is hot enough, inline the callsites and adds new callsites of the |
779 | | /// callee into the caller. If the call is an indirect call, first promote |
780 | | /// it to direct call. Each indirect call is limited with a single target. |
781 | | /// |
782 | | /// \param F function to perform iterative inlining. |
783 | | /// \param InlinedGUIDs a set to be updated to include all GUIDs that are |
784 | | /// inlined in the profiled binary. |
785 | | /// |
786 | | /// \returns True if there is any inline happened. |
787 | | bool SampleProfileLoader::inlineHotFunctions( |
788 | 110 | Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { |
789 | 110 | DenseSet<Instruction *> PromotedInsns; |
790 | 110 | |
791 | 110 | DenseMap<Instruction *, const FunctionSamples *> localNotInlinedCallSites; |
792 | 110 | bool Changed = false; |
793 | 137 | while (true) { |
794 | 137 | bool LocalChanged = false; |
795 | 137 | SmallVector<Instruction *, 10> CIS; |
796 | 507 | for (auto &BB : F) { |
797 | 507 | bool Hot = false; |
798 | 507 | SmallVector<Instruction *, 10> Candidates; |
799 | 2.17k | for (auto &I : BB.getInstList()) { |
800 | 2.17k | const FunctionSamples *FS = nullptr; |
801 | 2.17k | if ((isa<CallInst>(I) || isa<InvokeInst>(I)1.83k ) && |
802 | 2.17k | !isa<IntrinsicInst>(I)341 && (FS = findCalleeFunctionSamples(I))170 ) { |
803 | 70 | Candidates.push_back(&I); |
804 | 70 | if (FS->getEntrySamples() > 0) |
805 | 59 | localNotInlinedCallSites.try_emplace(&I, FS); |
806 | 70 | if (callsiteIsHot(FS, PSI)) |
807 | 60 | Hot = true; |
808 | 70 | } |
809 | 2.17k | } |
810 | 507 | if (Hot) { |
811 | 55 | CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end()); |
812 | 55 | } |
813 | 507 | } |
814 | 137 | for (auto I : CIS) { |
815 | 60 | Function *CalledFunction = CallSite(I).getCalledFunction(); |
816 | 60 | // Do not inline recursive calls. |
817 | 60 | if (CalledFunction == &F) |
818 | 2 | continue; |
819 | 58 | if (CallSite(I).isIndirectCall()) { |
820 | 28 | if (PromotedInsns.count(I)) |
821 | 10 | continue; |
822 | 18 | uint64_t Sum; |
823 | 24 | for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) { |
824 | 24 | if (IsThinLTOPreLink) { |
825 | 4 | FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), |
826 | 4 | PSI->getOrCompHotCountThreshold()); |
827 | 4 | continue; |
828 | 4 | } |
829 | 20 | auto CalleeFunctionName = FS->getFuncNameInModule(F.getParent()); |
830 | 20 | // If it is a recursive call, we do not inline it as it could bloat |
831 | 20 | // the code exponentially. There is way to better handle this, e.g. |
832 | 20 | // clone the caller first, and inline the cloned caller if it is |
833 | 20 | // recursive. As llvm does not inline recursive calls, we will |
834 | 20 | // simply ignore it instead of handling it explicitly. |
835 | 20 | if (CalleeFunctionName == F.getName()) |
836 | 2 | continue; |
837 | 18 | |
838 | 18 | if (!callsiteIsHot(FS, PSI)) |
839 | 2 | continue; |
840 | 16 | |
841 | 16 | const char *Reason = "Callee function not available"; |
842 | 16 | auto R = SymbolMap.find(CalleeFunctionName); |
843 | 16 | if (R != SymbolMap.end() && R->getValue() && |
844 | 16 | !R->getValue()->isDeclaration()14 && |
845 | 16 | R->getValue()->getSubprogram()14 && |
846 | 16 | isLegalToPromote(CallSite(I), R->getValue(), &Reason)14 ) { |
847 | 12 | uint64_t C = FS->getEntrySamples(); |
848 | 12 | Instruction *DI = |
849 | 12 | pgo::promoteIndirectCall(I, R->getValue(), C, Sum, false, ORE); |
850 | 12 | Sum -= C; |
851 | 12 | PromotedInsns.insert(I); |
852 | 12 | // If profile mismatches, we should not attempt to inline DI. |
853 | 12 | if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)0 ) && |
854 | 12 | inlineCallInstruction(DI)) { |
855 | 12 | localNotInlinedCallSites.erase(I); |
856 | 12 | LocalChanged = true; |
857 | 12 | } |
858 | 12 | } else { |
859 | 4 | LLVM_DEBUG(dbgs() |
860 | 4 | << "\nFailed to promote indirect call to " |
861 | 4 | << CalleeFunctionName << " because " << Reason << "\n"); |
862 | 4 | } |
863 | 16 | } |
864 | 30 | } else if (CalledFunction && CalledFunction->getSubprogram()28 && |
865 | 30 | !CalledFunction->isDeclaration()22 ) { |
866 | 22 | if (inlineCallInstruction(I)) { |
867 | 20 | localNotInlinedCallSites.erase(I); |
868 | 20 | LocalChanged = true; |
869 | 20 | } |
870 | 22 | } else if (8 IsThinLTOPreLink8 ) { |
871 | 4 | findCalleeFunctionSamples(*I)->findInlinedFunctions( |
872 | 4 | InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold()); |
873 | 4 | } |
874 | 58 | } |
875 | 137 | if (LocalChanged) { |
876 | 27 | Changed = true; |
877 | 110 | } else { |
878 | 110 | break; |
879 | 110 | } |
880 | 137 | } |
881 | 110 | |
882 | 110 | // Accumulate not inlined callsite information into notInlinedSamples |
883 | 110 | for (const auto &Pair : localNotInlinedCallSites) { |
884 | 29 | Instruction *I = Pair.getFirst(); |
885 | 29 | Function *Callee = CallSite(I).getCalledFunction(); |
886 | 29 | if (!Callee || Callee->isDeclaration()9 ) |
887 | 24 | continue; |
888 | 5 | const FunctionSamples *FS = Pair.getSecond(); |
889 | 5 | auto pair = |
890 | 5 | notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); |
891 | 5 | pair.first->second.entryCount += FS->getEntrySamples(); |
892 | 5 | } |
893 | 110 | return Changed; |
894 | 110 | } |
895 | | |
896 | | /// Find equivalence classes for the given block. |
897 | | /// |
898 | | /// This finds all the blocks that are guaranteed to execute the same |
899 | | /// number of times as \p BB1. To do this, it traverses all the |
900 | | /// descendants of \p BB1 in the dominator or post-dominator tree. |
901 | | /// |
902 | | /// A block BB2 will be in the same equivalence class as \p BB1 if |
903 | | /// the following holds: |
904 | | /// |
905 | | /// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2 |
906 | | /// is a descendant of \p BB1 in the dominator tree, then BB2 should |
907 | | /// dominate BB1 in the post-dominator tree. |
908 | | /// |
909 | | /// 2- Both BB2 and \p BB1 must be in the same loop. |
910 | | /// |
911 | | /// For every block BB2 that meets those two requirements, we set BB2's |
912 | | /// equivalence class to \p BB1. |
913 | | /// |
914 | | /// \param BB1 Block to check. |
915 | | /// \param Descendants Descendants of \p BB1 in either the dom or pdom tree. |
916 | | /// \param DomTree Opposite dominator tree. If \p Descendants is filled |
917 | | /// with blocks from \p BB1's dominator tree, then |
918 | | /// this is the post-dominator tree, and vice versa. |
919 | | template <bool IsPostDom> |
920 | | void SampleProfileLoader::findEquivalencesFor( |
921 | | BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants, |
922 | 294 | DominatorTreeBase<BasicBlock, IsPostDom> *DomTree) { |
923 | 294 | const BasicBlock *EC = EquivalenceClass[BB1]; |
924 | 294 | uint64_t Weight = BlockWeights[EC]; |
925 | 966 | for (const auto *BB2 : Descendants) { |
926 | 966 | bool IsDomParent = DomTree->dominates(BB2, BB1); |
927 | 966 | bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2); |
928 | 966 | if (BB1 != BB2 && IsDomParent673 && IsInSameLoop224 ) { |
929 | 114 | EquivalenceClass[BB2] = EC; |
930 | 114 | // If BB2 is visited, then the entire EC should be marked as visited. |
931 | 114 | if (VisitedBlocks.count(BB2)) { |
932 | 44 | VisitedBlocks.insert(EC); |
933 | 44 | } |
934 | 114 | |
935 | 114 | // If BB2 is heavier than BB1, make BB2 have the same weight |
936 | 114 | // as BB1. |
937 | 114 | // |
938 | 114 | // Note that we don't worry about the opposite situation here |
939 | 114 | // (when BB2 is lighter than BB1). We will deal with this |
940 | 114 | // during the propagation phase. Right now, we just want to |
941 | 114 | // make sure that BB1 has the largest weight of all the |
942 | 114 | // members of its equivalence set. |
943 | 114 | Weight = std::max(Weight, BlockWeights[BB2]); |
944 | 114 | } |
945 | 966 | } |
946 | 294 | if (EC == &EC->getParent()->getEntryBlock()) { |
947 | 104 | BlockWeights[EC] = Samples->getHeadSamples() + 1; |
948 | 190 | } else { |
949 | 190 | BlockWeights[EC] = Weight; |
950 | 190 | } |
951 | 294 | } |
952 | | |
953 | | /// Find equivalence classes. |
954 | | /// |
955 | | /// Since samples may be missing from blocks, we can fill in the gaps by setting |
956 | | /// the weights of all the blocks in the same equivalence class to the same |
957 | | /// weight. To compute the concept of equivalence, we use dominance and loop |
958 | | /// information. Two blocks B1 and B2 are in the same equivalence class if B1 |
959 | | /// dominates B2, B2 post-dominates B1 and both are in the same loop. |
960 | | /// |
961 | | /// \param F The function to query. |
962 | 104 | void SampleProfileLoader::findEquivalenceClasses(Function &F) { |
963 | 104 | SmallVector<BasicBlock *, 8> DominatedBBs; |
964 | 104 | LLVM_DEBUG(dbgs() << "\nBlock equivalence classes\n"); |
965 | 104 | // Find equivalence sets based on dominance and post-dominance information. |
966 | 408 | for (auto &BB : F) { |
967 | 408 | BasicBlock *BB1 = &BB; |
968 | 408 | |
969 | 408 | // Compute BB1's equivalence class once. |
970 | 408 | if (EquivalenceClass.count(BB1)) { |
971 | 114 | LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1)); |
972 | 114 | continue; |
973 | 114 | } |
974 | 294 | |
975 | 294 | // By default, blocks are in their own equivalence class. |
976 | 294 | EquivalenceClass[BB1] = BB1; |
977 | 294 | |
978 | 294 | // Traverse all the blocks dominated by BB1. We are looking for |
979 | 294 | // every basic block BB2 such that: |
980 | 294 | // |
981 | 294 | // 1- BB1 dominates BB2. |
982 | 294 | // 2- BB2 post-dominates BB1. |
983 | 294 | // 3- BB1 and BB2 are in the same loop nest. |
984 | 294 | // |
985 | 294 | // If all those conditions hold, it means that BB2 is executed |
986 | 294 | // as many times as BB1, so they are placed in the same equivalence |
987 | 294 | // class by making BB2's equivalence class be BB1. |
988 | 294 | DominatedBBs.clear(); |
989 | 294 | DT->getDescendants(BB1, DominatedBBs); |
990 | 294 | findEquivalencesFor(BB1, DominatedBBs, PDT.get()); |
991 | 294 | |
992 | 294 | LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1)); |
993 | 294 | } |
994 | 104 | |
995 | 104 | // Assign weights to equivalence classes. |
996 | 104 | // |
997 | 104 | // All the basic blocks in the same equivalence class will execute |
998 | 104 | // the same number of times. Since we know that the head block in |
999 | 104 | // each equivalence class has the largest weight, assign that weight |
1000 | 104 | // to all the blocks in that equivalence class. |
1001 | 104 | LLVM_DEBUG( |
1002 | 104 | dbgs() << "\nAssign the same weight to all blocks in the same class\n"); |
1003 | 408 | for (auto &BI : F) { |
1004 | 408 | const BasicBlock *BB = &BI; |
1005 | 408 | const BasicBlock *EquivBB = EquivalenceClass[BB]; |
1006 | 408 | if (BB != EquivBB) |
1007 | 114 | BlockWeights[BB] = BlockWeights[EquivBB]; |
1008 | 408 | LLVM_DEBUG(printBlockWeight(dbgs(), BB)); |
1009 | 408 | } |
1010 | 104 | } |
1011 | | |
1012 | | /// Visit the given edge to decide if it has a valid weight. |
1013 | | /// |
1014 | | /// If \p E has not been visited before, we copy to \p UnknownEdge |
1015 | | /// and increment the count of unknown edges. |
1016 | | /// |
1017 | | /// \param E Edge to visit. |
1018 | | /// \param NumUnknownEdges Current number of unknown edges. |
1019 | | /// \param UnknownEdge Set if E has not been visited before. |
1020 | | /// |
1021 | | /// \returns E's weight, if known. Otherwise, return 0. |
1022 | | uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, |
1023 | 5.08k | Edge *UnknownEdge) { |
1024 | 5.08k | if (!VisitedEdges.count(E)) { |
1025 | 1.60k | (*NumUnknownEdges)++; |
1026 | 1.60k | *UnknownEdge = E; |
1027 | 1.60k | return 0; |
1028 | 1.60k | } |
1029 | 3.48k | |
1030 | 3.48k | return EdgeWeights[E]; |
1031 | 3.48k | } |
1032 | | |
1033 | | /// Propagate weights through incoming/outgoing edges. |
1034 | | /// |
1035 | | /// If the weight of a basic block is known, and there is only one edge |
1036 | | /// with an unknown weight, we can calculate the weight of that edge. |
1037 | | /// |
1038 | | /// Similarly, if all the edges have a known count, we can calculate the |
1039 | | /// count of the basic block, if needed. |
1040 | | /// |
1041 | | /// \param F Function to process. |
1042 | | /// \param UpdateBlockCount Whether we should update basic block counts that |
1043 | | /// has already been annotated. |
1044 | | /// |
1045 | | /// \returns True if new weights were assigned to edges or blocks. |
1046 | | bool SampleProfileLoader::propagateThroughEdges(Function &F, |
1047 | 497 | bool UpdateBlockCount) { |
1048 | 497 | bool Changed = false; |
1049 | 497 | LLVM_DEBUG(dbgs() << "\nPropagation through edges\n"); |
1050 | 2.42k | for (const auto &BI : F) { |
1051 | 2.42k | const BasicBlock *BB = &BI; |
1052 | 2.42k | const BasicBlock *EC = EquivalenceClass[BB]; |
1053 | 2.42k | |
1054 | 2.42k | // Visit all the predecessor and successor edges to determine |
1055 | 2.42k | // which ones have a weight assigned already. Note that it doesn't |
1056 | 2.42k | // matter that we only keep track of a single unknown edge. The |
1057 | 2.42k | // only case we are interested in handling is when only a single |
1058 | 2.42k | // edge is unknown (see setEdgeOrBlockWeight). |
1059 | 7.26k | for (unsigned i = 0; i < 2; i++4.84k ) { |
1060 | 4.84k | uint64_t TotalWeight = 0; |
1061 | 4.84k | unsigned NumUnknownEdges = 0, NumTotalEdges = 0; |
1062 | 4.84k | Edge UnknownEdge, SelfReferentialEdge, SingleEdge; |
1063 | 4.84k | |
1064 | 4.84k | if (i == 0) { |
1065 | 2.42k | // First, visit all predecessor edges. |
1066 | 2.42k | NumTotalEdges = Predecessors[BB].size(); |
1067 | 2.54k | for (auto *Pred : Predecessors[BB]) { |
1068 | 2.54k | Edge E = std::make_pair(Pred, BB); |
1069 | 2.54k | TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); |
1070 | 2.54k | if (E.first == E.second) |
1071 | 0 | SelfReferentialEdge = E; |
1072 | 2.54k | } |
1073 | 2.42k | if (NumTotalEdges == 1) { |
1074 | 1.31k | SingleEdge = std::make_pair(Predecessors[BB][0], BB); |
1075 | 1.31k | } |
1076 | 2.42k | } else { |
1077 | 2.42k | // On the second round, visit all successor edges. |
1078 | 2.42k | NumTotalEdges = Successors[BB].size(); |
1079 | 2.54k | for (auto *Succ : Successors[BB]) { |
1080 | 2.54k | Edge E = std::make_pair(BB, Succ); |
1081 | 2.54k | TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); |
1082 | 2.54k | } |
1083 | 2.42k | if (NumTotalEdges == 1) { |
1084 | 1.25k | SingleEdge = std::make_pair(BB, Successors[BB][0]); |
1085 | 1.25k | } |
1086 | 2.42k | } |
1087 | 4.84k | |
1088 | 4.84k | // After visiting all the edges, there are three cases that we |
1089 | 4.84k | // can handle immediately: |
1090 | 4.84k | // |
1091 | 4.84k | // - All the edge weights are known (i.e., NumUnknownEdges == 0). |
1092 | 4.84k | // In this case, we simply check that the sum of all the edges |
1093 | 4.84k | // is the same as BB's weight. If not, we change BB's weight |
1094 | 4.84k | // to match. Additionally, if BB had not been visited before, |
1095 | 4.84k | // we mark it visited. |
1096 | 4.84k | // |
1097 | 4.84k | // - Only one edge is unknown and BB has already been visited. |
1098 | 4.84k | // In this case, we can compute the weight of the edge by |
1099 | 4.84k | // subtracting the total block weight from all the known |
1100 | 4.84k | // edge weights. If the edges weight more than BB, then the |
1101 | 4.84k | // edge of the last remaining edge is set to zero. |
1102 | 4.84k | // |
1103 | 4.84k | // - There exists a self-referential edge and the weight of BB is |
1104 | 4.84k | // known. In this case, this edge can be based on BB's weight. |
1105 | 4.84k | // We add up all the other known edges and set the weight on |
1106 | 4.84k | // the self-referential edge as we did in the previous case. |
1107 | 4.84k | // |
1108 | 4.84k | // In any other case, we must continue iterating. Eventually, |
1109 | 4.84k | // all edges will get a weight, or iteration will stop when |
1110 | 4.84k | // it reaches SampleProfileMaxPropagateIterations. |
1111 | 4.84k | if (NumUnknownEdges <= 1) { |
1112 | 4.57k | uint64_t &BBWeight = BlockWeights[EC]; |
1113 | 4.57k | if (NumUnknownEdges == 0) { |
1114 | 3.49k | if (!VisitedBlocks.count(EC)) { |
1115 | 709 | // If we already know the weight of all edges, the weight of the |
1116 | 709 | // basic block can be computed. It should be no larger than the sum |
1117 | 709 | // of all edge weights. |
1118 | 709 | if (TotalWeight > BBWeight) { |
1119 | 25 | BBWeight = TotalWeight; |
1120 | 25 | Changed = true; |
1121 | 25 | LLVM_DEBUG(dbgs() << "All edge weights for " << BB->getName() |
1122 | 25 | << " known. Set weight for block: "; |
1123 | 25 | printBlockWeight(dbgs(), BB);); |
1124 | 25 | } |
1125 | 2.78k | } else if (NumTotalEdges == 1 && |
1126 | 2.78k | EdgeWeights[SingleEdge] < BlockWeights[EC]1.42k ) { |
1127 | 96 | // If there is only one edge for the visited basic block, use the |
1128 | 96 | // block weight to adjust edge weight if edge weight is smaller. |
1129 | 96 | EdgeWeights[SingleEdge] = BlockWeights[EC]; |
1130 | 96 | Changed = true; |
1131 | 96 | } |
1132 | 3.49k | } else if (1.07k NumUnknownEdges == 11.07k && VisitedBlocks.count(EC)1.07k ) { |
1133 | 700 | // If there is a single unknown edge and the block has been |
1134 | 700 | // visited, then we can compute E's weight. |
1135 | 700 | if (BBWeight >= TotalWeight) |
1136 | 700 | EdgeWeights[UnknownEdge] = BBWeight - TotalWeight; |
1137 | 0 | else |
1138 | 0 | EdgeWeights[UnknownEdge] = 0; |
1139 | 700 | const BasicBlock *OtherEC; |
1140 | 700 | if (i == 0) |
1141 | 365 | OtherEC = EquivalenceClass[UnknownEdge.first]; |
1142 | 335 | else |
1143 | 335 | OtherEC = EquivalenceClass[UnknownEdge.second]; |
1144 | 700 | // Edge weights should never exceed the BB weights it connects. |
1145 | 700 | if (VisitedBlocks.count(OtherEC) && |
1146 | 700 | EdgeWeights[UnknownEdge] > BlockWeights[OtherEC]504 ) |
1147 | 76 | EdgeWeights[UnknownEdge] = BlockWeights[OtherEC]; |
1148 | 700 | VisitedEdges.insert(UnknownEdge); |
1149 | 700 | Changed = true; |
1150 | 700 | LLVM_DEBUG(dbgs() << "Set weight for edge: "; |
1151 | 700 | printEdgeWeight(dbgs(), UnknownEdge)); |
1152 | 700 | } |
1153 | 4.57k | } else if (266 VisitedBlocks.count(EC)266 && BlockWeights[EC] == 0188 ) { |
1154 | 12 | // If a block Weights 0, all its in/out edges should weight 0. |
1155 | 12 | if (i == 0) { |
1156 | 16 | for (auto *Pred : Predecessors[BB]) { |
1157 | 16 | Edge E = std::make_pair(Pred, BB); |
1158 | 16 | EdgeWeights[E] = 0; |
1159 | 16 | VisitedEdges.insert(E); |
1160 | 16 | } |
1161 | 8 | } else { |
1162 | 8 | for (auto *Succ : Successors[BB]) { |
1163 | 8 | Edge E = std::make_pair(BB, Succ); |
1164 | 8 | EdgeWeights[E] = 0; |
1165 | 8 | VisitedEdges.insert(E); |
1166 | 8 | } |
1167 | 4 | } |
1168 | 254 | } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)0 ) { |
1169 | 0 | uint64_t &BBWeight = BlockWeights[BB]; |
1170 | 0 | // We have a self-referential edge and the weight of BB is known. |
1171 | 0 | if (BBWeight >= TotalWeight) |
1172 | 0 | EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight; |
1173 | 0 | else |
1174 | 0 | EdgeWeights[SelfReferentialEdge] = 0; |
1175 | 0 | VisitedEdges.insert(SelfReferentialEdge); |
1176 | 0 | Changed = true; |
1177 | 0 | LLVM_DEBUG(dbgs() << "Set self-referential edge weight to: "; |
1178 | 0 | printEdgeWeight(dbgs(), SelfReferentialEdge)); |
1179 | 0 | } |
1180 | 4.84k | if (UpdateBlockCount && !VisitedBlocks.count(EC)1.18k && TotalWeight > 0239 ) { |
1181 | 34 | BlockWeights[EC] = TotalWeight; |
1182 | 34 | VisitedBlocks.insert(EC); |
1183 | 34 | Changed = true; |
1184 | 34 | } |
1185 | 4.84k | } |
1186 | 2.42k | } |
1187 | 497 | |
1188 | 497 | return Changed; |
1189 | 497 | } |
1190 | | |
1191 | | /// Build in/out edge lists for each basic block in the CFG. |
1192 | | /// |
1193 | | /// We are interested in unique edges. If a block B1 has multiple |
1194 | | /// edges to another block B2, we only add a single B1->B2 edge. |
1195 | 104 | void SampleProfileLoader::buildEdges(Function &F) { |
1196 | 408 | for (auto &BI : F) { |
1197 | 408 | BasicBlock *B1 = &BI; |
1198 | 408 | |
1199 | 408 | // Add predecessors for B1. |
1200 | 408 | SmallPtrSet<BasicBlock *, 16> Visited; |
1201 | 408 | if (!Predecessors[B1].empty()) |
1202 | 408 | llvm_unreachable0 ("Found a stale predecessors list in a basic block."); |
1203 | 810 | for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); 408 PI != PE; ++PI402 ) { |
1204 | 402 | BasicBlock *B2 = *PI; |
1205 | 402 | if (Visited.insert(B2).second) |
1206 | 402 | Predecessors[B1].push_back(B2); |
1207 | 402 | } |
1208 | 408 | |
1209 | 408 | // Add successors for B1. |
1210 | 408 | Visited.clear(); |
1211 | 408 | if (!Successors[B1].empty()) |
1212 | 408 | llvm_unreachable0 ("Found a stale successors list in a basic block."); |
1213 | 810 | for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); 408 SI != SE; ++SI402 ) { |
1214 | 402 | BasicBlock *B2 = *SI; |
1215 | 402 | if (Visited.insert(B2).second) |
1216 | 402 | Successors[B1].push_back(B2); |
1217 | 402 | } |
1218 | 408 | } |
1219 | 104 | } |
1220 | | |
1221 | | /// Returns the sorted CallTargetMap \p M by count in descending order. |
1222 | | static SmallVector<InstrProfValueData, 2> SortCallTargets( |
1223 | 17 | const SampleRecord::CallTargetMap &M) { |
1224 | 17 | SmallVector<InstrProfValueData, 2> R; |
1225 | 41 | for (auto I = M.begin(); I != M.end(); ++I24 ) |
1226 | 24 | R.push_back({FunctionSamples::getGUID(I->getKey()), I->getValue()}); |
1227 | 17 | llvm::sort(R, [](const InstrProfValueData &L, const InstrProfValueData &R) { |
1228 | 7 | if (L.Count == R.Count) |
1229 | 0 | return L.Value > R.Value; |
1230 | 7 | else |
1231 | 7 | return L.Count > R.Count; |
1232 | 7 | }); |
1233 | 17 | return R; |
1234 | 17 | } |
1235 | | |
1236 | | /// Propagate weights into edges |
1237 | | /// |
1238 | | /// The following rules are applied to every block BB in the CFG: |
1239 | | /// |
1240 | | /// - If BB has a single predecessor/successor, then the weight |
1241 | | /// of that edge is the weight of the block. |
1242 | | /// |
1243 | | /// - If all incoming or outgoing edges are known except one, and the |
1244 | | /// weight of the block is already known, the weight of the unknown |
1245 | | /// edge will be the weight of the block minus the sum of all the known |
1246 | | /// edges. If the sum of all the known edges is larger than BB's weight, |
1247 | | /// we set the unknown edge weight to zero. |
1248 | | /// |
1249 | | /// - If there is a self-referential edge, and the weight of the block is |
1250 | | /// known, the weight for that edge is set to the weight of the block |
1251 | | /// minus the weight of the other incoming edges to that block (if |
1252 | | /// known). |
1253 | 104 | void SampleProfileLoader::propagateWeights(Function &F) { |
1254 | 104 | bool Changed = true; |
1255 | 104 | unsigned I = 0; |
1256 | 104 | |
1257 | 104 | // If BB weight is larger than its corresponding loop's header BB weight, |
1258 | 104 | // use the BB weight to replace the loop header BB weight. |
1259 | 408 | for (auto &BI : F) { |
1260 | 408 | BasicBlock *BB = &BI; |
1261 | 408 | Loop *L = LI->getLoopFor(BB); |
1262 | 408 | if (!L) { |
1263 | 241 | continue; |
1264 | 241 | } |
1265 | 167 | BasicBlock *Header = L->getHeader(); |
1266 | 167 | if (Header && BlockWeights[BB] > BlockWeights[Header]) { |
1267 | 22 | BlockWeights[Header] = BlockWeights[BB]; |
1268 | 22 | } |
1269 | 167 | } |
1270 | 104 | |
1271 | 104 | // Before propagation starts, build, for each block, a list of |
1272 | 104 | // unique predecessors and successors. This is necessary to handle |
1273 | 104 | // identical edges in multiway branches. Since we visit all blocks and all |
1274 | 104 | // edges of the CFG, it is cleaner to build these lists once at the start |
1275 | 104 | // of the pass. |
1276 | 104 | buildEdges(F); |
1277 | 104 | |
1278 | 104 | // Propagate until we converge or we go past the iteration limit. |
1279 | 284 | while (Changed && I++ < SampleProfileMaxPropagateIterations180 ) { |
1280 | 180 | Changed = propagateThroughEdges(F, false); |
1281 | 180 | } |
1282 | 104 | |
1283 | 104 | // The first propagation propagates BB counts from annotated BBs to unknown |
1284 | 104 | // BBs. The 2nd propagation pass resets edges weights, and use all BB weights |
1285 | 104 | // to propagate edge weights. |
1286 | 104 | VisitedEdges.clear(); |
1287 | 104 | Changed = true; |
1288 | 284 | while (Changed && I++ < SampleProfileMaxPropagateIterations180 ) { |
1289 | 180 | Changed = propagateThroughEdges(F, false); |
1290 | 180 | } |
1291 | 104 | |
1292 | 104 | // The 3rd propagation pass allows adjust annotated BB weights that are |
1293 | 104 | // obviously wrong. |
1294 | 104 | Changed = true; |
1295 | 241 | while (Changed && I++ < SampleProfileMaxPropagateIterations137 ) { |
1296 | 137 | Changed = propagateThroughEdges(F, true); |
1297 | 137 | } |
1298 | 104 | |
1299 | 104 | // Generate MD_prof metadata for every branch instruction using the |
1300 | 104 | // edge weights computed during propagation. |
1301 | 104 | LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n"); |
1302 | 104 | LLVMContext &Ctx = F.getContext(); |
1303 | 104 | MDBuilder MDB(Ctx); |
1304 | 408 | for (auto &BI : F) { |
1305 | 408 | BasicBlock *BB = &BI; |
1306 | 408 | |
1307 | 408 | if (BlockWeights[BB]) { |
1308 | 1.61k | for (auto &I : BB->getInstList()) { |
1309 | 1.61k | if (!isa<CallInst>(I) && !isa<InvokeInst>(I)1.35k ) |
1310 | 1.35k | continue; |
1311 | 252 | CallSite CS(&I); |
1312 | 252 | if (!CS.getCalledFunction()) { |
1313 | 25 | const DebugLoc &DLoc = I.getDebugLoc(); |
1314 | 25 | if (!DLoc) |
1315 | 0 | continue; |
1316 | 25 | const DILocation *DIL = DLoc; |
1317 | 25 | uint32_t LineOffset = FunctionSamples::getOffset(DIL); |
1318 | 25 | uint32_t Discriminator = DIL->getBaseDiscriminator(); |
1319 | 25 | |
1320 | 25 | const FunctionSamples *FS = findFunctionSamples(I); |
1321 | 25 | if (!FS) |
1322 | 0 | continue; |
1323 | 25 | auto T = FS->findCallTargetMapAt(LineOffset, Discriminator); |
1324 | 25 | if (!T || T.get().empty()21 ) |
1325 | 8 | continue; |
1326 | 17 | SmallVector<InstrProfValueData, 2> SortedCallTargets = |
1327 | 17 | SortCallTargets(T.get()); |
1328 | 17 | uint64_t Sum; |
1329 | 17 | findIndirectCallFunctionSamples(I, Sum); |
1330 | 17 | annotateValueSite(*I.getParent()->getParent()->getParent(), I, |
1331 | 17 | SortedCallTargets, Sum, IPVK_IndirectCallTarget, |
1332 | 17 | SortedCallTargets.size()); |
1333 | 227 | } else if (!isa<IntrinsicInst>(&I)) { |
1334 | 86 | I.setMetadata(LLVMContext::MD_prof, |
1335 | 86 | MDB.createBranchWeights( |
1336 | 86 | {static_cast<uint32_t>(BlockWeights[BB])})); |
1337 | 86 | } |
1338 | 252 | } |
1339 | 347 | } |
1340 | 408 | Instruction *TI = BB->getTerminator(); |
1341 | 408 | if (TI->getNumSuccessors() == 1) |
1342 | 196 | continue; |
1343 | 212 | if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)109 ) |
1344 | 109 | continue; |
1345 | 103 | |
1346 | 103 | DebugLoc BranchLoc = TI->getDebugLoc(); |
1347 | 103 | LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line " |
1348 | 103 | << ((BranchLoc) ? Twine(BranchLoc.getLine()) |
1349 | 103 | : Twine("<UNKNOWN LOCATION>")) |
1350 | 103 | << ".\n"); |
1351 | 103 | SmallVector<uint32_t, 4> Weights; |
1352 | 103 | uint32_t MaxWeight = 0; |
1353 | 103 | Instruction *MaxDestInst; |
1354 | 309 | for (unsigned I = 0; I < TI->getNumSuccessors(); ++I206 ) { |
1355 | 206 | BasicBlock *Succ = TI->getSuccessor(I); |
1356 | 206 | Edge E = std::make_pair(BB, Succ); |
1357 | 206 | uint64_t Weight = EdgeWeights[E]; |
1358 | 206 | LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E)); |
1359 | 206 | // Use uint32_t saturated arithmetic to adjust the incoming weights, |
1360 | 206 | // if needed. Sample counts in profiles are 64-bit unsigned values, |
1361 | 206 | // but internally branch weights are expressed as 32-bit values. |
1362 | 206 | if (Weight > std::numeric_limits<uint32_t>::max()) { |
1363 | 0 | LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)"); |
1364 | 0 | Weight = std::numeric_limits<uint32_t>::max(); |
1365 | 0 | } |
1366 | 206 | // Weight is added by one to avoid propagation errors introduced by |
1367 | 206 | // 0 weights. |
1368 | 206 | Weights.push_back(static_cast<uint32_t>(Weight + 1)); |
1369 | 206 | if (Weight != 0) { |
1370 | 143 | if (Weight > MaxWeight) { |
1371 | 99 | MaxWeight = Weight; |
1372 | 99 | MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime(); |
1373 | 99 | } |
1374 | 143 | } |
1375 | 206 | } |
1376 | 103 | |
1377 | 103 | uint64_t TempWeight; |
1378 | 103 | // Only set weights if there is at least one non-zero weight. |
1379 | 103 | // In any other case, let the analyzer set weights. |
1380 | 103 | // Do not set weights if the weights are present. In ThinLTO, the profile |
1381 | 103 | // annotation is done twice. If the first annotation already set the |
1382 | 103 | // weights, the second pass does not need to set it. |
1383 | 103 | if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)88 ) { |
1384 | 79 | LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n"); |
1385 | 79 | TI->setMetadata(LLVMContext::MD_prof, |
1386 | 79 | MDB.createBranchWeights(Weights)); |
1387 | 79 | ORE->emit([&]() { |
1388 | 14 | return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst) |
1389 | 14 | << "most popular destination for conditional branches at " |
1390 | 14 | << ore::NV("CondBranchesLoc", BranchLoc); |
1391 | 14 | }); |
1392 | 79 | } else { |
1393 | 24 | LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n"); |
1394 | 24 | } |
1395 | 103 | } |
1396 | 104 | } |
1397 | | |
1398 | | /// Get the line number for the function header. |
1399 | | /// |
1400 | | /// This looks up function \p F in the current compilation unit and |
1401 | | /// retrieves the line number where the function is defined. This is |
1402 | | /// line 0 for all the samples read from the profile file. Every line |
1403 | | /// number is relative to this line. |
1404 | | /// |
1405 | | /// \param F Function object to query. |
1406 | | /// |
1407 | | /// \returns the line number where \p F is defined. If it returns 0, |
1408 | | /// it means that there is no debug information available for \p F. |
1409 | 127 | unsigned SampleProfileLoader::getFunctionLoc(Function &F) { |
1410 | 127 | if (DISubprogram *S = F.getSubprogram()) |
1411 | 118 | return S->getLine(); |
1412 | 9 | |
1413 | 9 | if (NoWarnSampleUnused) |
1414 | 0 | return 0; |
1415 | 9 | |
1416 | 9 | // If the start of \p F is missing, emit a diagnostic to inform the user |
1417 | 9 | // about the missed opportunity. |
1418 | 9 | F.getContext().diagnose(DiagnosticInfoSampleProfile( |
1419 | 9 | "No debug information found in function " + F.getName() + |
1420 | 9 | ": Function profile not used", |
1421 | 9 | DS_Warning)); |
1422 | 9 | return 0; |
1423 | 9 | } |
1424 | | |
1425 | 104 | void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) { |
1426 | 104 | DT.reset(new DominatorTree); |
1427 | 104 | DT->recalculate(F); |
1428 | 104 | |
1429 | 104 | PDT.reset(new PostDominatorTree(F)); |
1430 | 104 | |
1431 | 104 | LI.reset(new LoopInfo); |
1432 | 104 | LI->analyze(*DT); |
1433 | 104 | } |
1434 | | |
1435 | | /// Generate branch weight metadata for all branches in \p F. |
1436 | | /// |
1437 | | /// Branch weights are computed out of instruction samples using a |
1438 | | /// propagation heuristic. Propagation proceeds in 3 phases: |
1439 | | /// |
1440 | | /// 1- Assignment of block weights. All the basic blocks in the function |
1441 | | /// are initial assigned the same weight as their most frequently |
1442 | | /// executed instruction. |
1443 | | /// |
1444 | | /// 2- Creation of equivalence classes. Since samples may be missing from |
1445 | | /// blocks, we can fill in the gaps by setting the weights of all the |
1446 | | /// blocks in the same equivalence class to the same weight. To compute |
1447 | | /// the concept of equivalence, we use dominance and loop information. |
1448 | | /// Two blocks B1 and B2 are in the same equivalence class if B1 |
1449 | | /// dominates B2, B2 post-dominates B1 and both are in the same loop. |
1450 | | /// |
1451 | | /// 3- Propagation of block weights into edges. This uses a simple |
1452 | | /// propagation heuristic. The following rules are applied to every |
1453 | | /// block BB in the CFG: |
1454 | | /// |
1455 | | /// - If BB has a single predecessor/successor, then the weight |
1456 | | /// of that edge is the weight of the block. |
1457 | | /// |
1458 | | /// - If all the edges are known except one, and the weight of the |
1459 | | /// block is already known, the weight of the unknown edge will |
1460 | | /// be the weight of the block minus the sum of all the known |
1461 | | /// edges. If the sum of all the known edges is larger than BB's weight, |
1462 | | /// we set the unknown edge weight to zero. |
1463 | | /// |
1464 | | /// - If there is a self-referential edge, and the weight of the block is |
1465 | | /// known, the weight for that edge is set to the weight of the block |
1466 | | /// minus the weight of the other incoming edges to that block (if |
1467 | | /// known). |
1468 | | /// |
1469 | | /// Since this propagation is not guaranteed to finalize for every CFG, we |
1470 | | /// only allow it to proceed for a limited number of iterations (controlled |
1471 | | /// by -sample-profile-max-propagate-iterations). |
1472 | | /// |
1473 | | /// FIXME: Try to replace this propagation heuristic with a scheme |
1474 | | /// that is guaranteed to finalize. A work-list approach similar to |
1475 | | /// the standard value propagation algorithm used by SSA-CCP might |
1476 | | /// work here. |
1477 | | /// |
1478 | | /// Once all the branch weights are computed, we emit the MD_prof |
1479 | | /// metadata on BB using the computed values for each of its branches. |
1480 | | /// |
1481 | | /// \param F The function to query. |
1482 | | /// |
1483 | | /// \returns true if \p F was modified. Returns false, otherwise. |
1484 | 119 | bool SampleProfileLoader::emitAnnotations(Function &F) { |
1485 | 119 | bool Changed = false; |
1486 | 119 | |
1487 | 119 | if (getFunctionLoc(F) == 0) |
1488 | 9 | return false; |
1489 | 110 | |
1490 | 110 | LLVM_DEBUG(dbgs() << "Line number for the first instruction in " |
1491 | 110 | << F.getName() << ": " << getFunctionLoc(F) << "\n"); |
1492 | 110 | |
1493 | 110 | DenseSet<GlobalValue::GUID> InlinedGUIDs; |
1494 | 110 | Changed |= inlineHotFunctions(F, InlinedGUIDs); |
1495 | 110 | |
1496 | 110 | // Compute basic block weights. |
1497 | 110 | Changed |= computeBlockWeights(F); |
1498 | 110 | |
1499 | 110 | if (Changed) { |
1500 | 104 | // Add an entry count to the function using the samples gathered at the |
1501 | 104 | // function entry. |
1502 | 104 | // Sets the GUIDs that are inlined in the profiled binary. This is used |
1503 | 104 | // for ThinLink to make correct liveness analysis, and also make the IR |
1504 | 104 | // match the profiled binary before annotation. |
1505 | 104 | F.setEntryCount( |
1506 | 104 | ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real), |
1507 | 104 | &InlinedGUIDs); |
1508 | 104 | |
1509 | 104 | // Compute dominance and loop info needed for propagation. |
1510 | 104 | computeDominanceAndLoopInfo(F); |
1511 | 104 | |
1512 | 104 | // Find equivalence classes. |
1513 | 104 | findEquivalenceClasses(F); |
1514 | 104 | |
1515 | 104 | // Propagate weights to all edges. |
1516 | 104 | propagateWeights(F); |
1517 | 104 | } |
1518 | 110 | |
1519 | 110 | // If coverage checking was requested, compute it now. |
1520 | 110 | if (SampleProfileRecordCoverage) { |
1521 | 6 | unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI); |
1522 | 6 | unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI); |
1523 | 6 | unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); |
1524 | 6 | if (Coverage < SampleProfileRecordCoverage) { |
1525 | 4 | F.getContext().diagnose(DiagnosticInfoSampleProfile( |
1526 | 4 | F.getSubprogram()->getFilename(), getFunctionLoc(F), |
1527 | 4 | Twine(Used) + " of " + Twine(Total) + " available profile records (" + |
1528 | 4 | Twine(Coverage) + "%) were applied", |
1529 | 4 | DS_Warning)); |
1530 | 4 | } |
1531 | 6 | } |
1532 | 110 | |
1533 | 110 | if (SampleProfileSampleCoverage) { |
1534 | 4 | uint64_t Used = CoverageTracker.getTotalUsedSamples(); |
1535 | 4 | uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI); |
1536 | 4 | unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); |
1537 | 4 | if (Coverage < SampleProfileSampleCoverage) { |
1538 | 4 | F.getContext().diagnose(DiagnosticInfoSampleProfile( |
1539 | 4 | F.getSubprogram()->getFilename(), getFunctionLoc(F), |
1540 | 4 | Twine(Used) + " of " + Twine(Total) + " available profile samples (" + |
1541 | 4 | Twine(Coverage) + "%) were applied", |
1542 | 4 | DS_Warning)); |
1543 | 4 | } |
1544 | 4 | } |
1545 | 110 | return Changed; |
1546 | 110 | } |
1547 | | |
1548 | | char SampleProfileLoaderLegacyPass::ID = 0; |
1549 | | |
1550 | 11.0k | INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile", |
1551 | 11.0k | "Sample Profile loader", false, false) |
1552 | 11.0k | INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) |
1553 | 11.0k | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) |
1554 | 11.0k | INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) |
1555 | 11.0k | INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", |
1556 | | "Sample Profile loader", false, false) |
1557 | | |
1558 | 106 | bool SampleProfileLoader::doInitialization(Module &M) { |
1559 | 106 | auto &Ctx = M.getContext(); |
1560 | 106 | auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx); |
1561 | 106 | if (std::error_code EC = ReaderOrErr.getError()) { |
1562 | 4 | std::string Msg = "Could not open profile: " + EC.message(); |
1563 | 4 | Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); |
1564 | 4 | return false; |
1565 | 4 | } |
1566 | 102 | Reader = std::move(ReaderOrErr.get()); |
1567 | 102 | Reader->collectFuncsToUse(M); |
1568 | 102 | ProfileIsValid = (Reader->read() == sampleprof_error::success); |
1569 | 102 | |
1570 | 102 | if (!RemappingFilename.empty()) { |
1571 | 2 | // Apply profile remappings to the loaded profile data if requested. |
1572 | 2 | // For now, we only support remapping symbols encoded using the Itanium |
1573 | 2 | // C++ ABI's name mangling scheme. |
1574 | 2 | ReaderOrErr = SampleProfileReaderItaniumRemapper::create( |
1575 | 2 | RemappingFilename, Ctx, std::move(Reader)); |
1576 | 2 | if (std::error_code EC = ReaderOrErr.getError()) { |
1577 | 0 | std::string Msg = "Could not open profile remapping file: " + EC.message(); |
1578 | 0 | Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); |
1579 | 0 | return false; |
1580 | 0 | } |
1581 | 2 | Reader = std::move(ReaderOrErr.get()); |
1582 | 2 | ProfileIsValid = (Reader->read() == sampleprof_error::success); |
1583 | 2 | } |
1584 | 102 | return true; |
1585 | 102 | } |
1586 | | |
1587 | 0 | ModulePass *llvm::createSampleProfileLoaderPass() { |
1588 | 0 | return new SampleProfileLoaderLegacyPass(); |
1589 | 0 | } |
1590 | | |
1591 | 19 | ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) { |
1592 | 19 | return new SampleProfileLoaderLegacyPass(Name); |
1593 | 19 | } |
1594 | | |
1595 | | bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, |
1596 | 94 | ProfileSummaryInfo *_PSI) { |
1597 | 94 | FunctionSamples::GUIDToFuncNameMapper Mapper(M); |
1598 | 94 | if (!ProfileIsValid) |
1599 | 0 | return false; |
1600 | 94 | |
1601 | 94 | PSI = _PSI; |
1602 | 94 | if (M.getProfileSummary(/* IsCS */ false) == nullptr) |
1603 | 91 | M.setProfileSummary(Reader->getSummary().getMD(M.getContext()), |
1604 | 91 | ProfileSummary::PSK_Sample); |
1605 | 94 | |
1606 | 94 | // Compute the total number of samples collected in this profile. |
1607 | 94 | for (const auto &I : Reader->getProfiles()) |
1608 | 143 | TotalCollectedSamples += I.second.getTotalSamples(); |
1609 | 94 | |
1610 | 94 | // Populate the symbol map. |
1611 | 344 | for (const auto &N_F : M.getValueSymbolTable()) { |
1612 | 344 | StringRef OrigName = N_F.getKey(); |
1613 | 344 | Function *F = dyn_cast<Function>(N_F.getValue()); |
1614 | 344 | if (F == nullptr) |
1615 | 36 | continue; |
1616 | 308 | SymbolMap[OrigName] = F; |
1617 | 308 | auto pos = OrigName.find('.'); |
1618 | 308 | if (pos != StringRef::npos) { |
1619 | 48 | StringRef NewName = OrigName.substr(0, pos); |
1620 | 48 | auto r = SymbolMap.insert(std::make_pair(NewName, F)); |
1621 | 48 | // Failiing to insert means there is already an entry in SymbolMap, |
1622 | 48 | // thus there are multiple functions that are mapped to the same |
1623 | 48 | // stripped name. In this case of name conflicting, set the value |
1624 | 48 | // to nullptr to avoid confusion. |
1625 | 48 | if (!r.second) |
1626 | 22 | r.first->second = nullptr; |
1627 | 48 | } |
1628 | 308 | } |
1629 | 94 | |
1630 | 94 | bool retval = false; |
1631 | 94 | for (auto &F : M) |
1632 | 326 | if (!F.isDeclaration()) { |
1633 | 203 | clearFunctionData(); |
1634 | 203 | retval |= runOnFunction(F, AM); |
1635 | 203 | } |
1636 | 94 | |
1637 | 94 | // Account for cold calls not inlined.... |
1638 | 94 | for (const std::pair<Function *, NotInlinedProfileInfo> &pair : |
1639 | 94 | notInlinedCallInfo) |
1640 | 5 | updateProfileCallee(pair.first, pair.second.entryCount); |
1641 | 94 | |
1642 | 94 | return retval; |
1643 | 94 | } |
1644 | | |
1645 | 56 | bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { |
1646 | 56 | ACT = &getAnalysis<AssumptionCacheTracker>(); |
1647 | 56 | TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); |
1648 | 56 | ProfileSummaryInfo *PSI = |
1649 | 56 | &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); |
1650 | 56 | return SampleLoader.runOnModule(M, nullptr, PSI); |
1651 | 56 | } |
1652 | | |
1653 | 203 | bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { |
1654 | 203 | |
1655 | 203 | DILocation2SampleMap.clear(); |
1656 | 203 | // By default the entry count is initialized to -1, which will be treated |
1657 | 203 | // conservatively by getEntryCount as the same as unknown (None). This is |
1658 | 203 | // to avoid newly added code to be treated as cold. If we have samples |
1659 | 203 | // this will be overwritten in emitAnnotations. |
1660 | 203 | // If ProfileSampleAccurate is true or F has profile-sample-accurate |
1661 | 203 | // attribute, initialize the entry count to 0 so callsites or functions |
1662 | 203 | // unsampled will be treated as cold. |
1663 | 203 | uint64_t initialEntryCount = |
1664 | 203 | (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")195 ) |
1665 | 203 | ? 010 |
1666 | 203 | : -1193 ; |
1667 | 203 | F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real)); |
1668 | 203 | std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; |
1669 | 203 | if (AM) { |
1670 | 68 | auto &FAM = |
1671 | 68 | AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent()) |
1672 | 68 | .getManager(); |
1673 | 68 | ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); |
1674 | 135 | } else { |
1675 | 135 | OwnedORE = make_unique<OptimizationRemarkEmitter>(&F); |
1676 | 135 | ORE = OwnedORE.get(); |
1677 | 135 | } |
1678 | 203 | Samples = Reader->getSamplesFor(F); |
1679 | 203 | if (Samples && !Samples->empty()124 ) |
1680 | 119 | return emitAnnotations(F); |
1681 | 84 | return false; |
1682 | 84 | } |
1683 | | |
1684 | | PreservedAnalyses SampleProfileLoaderPass::run(Module &M, |
1685 | 44 | ModuleAnalysisManager &AM) { |
1686 | 44 | FunctionAnalysisManager &FAM = |
1687 | 44 | AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); |
1688 | 44 | |
1689 | 44 | auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { |
1690 | 16 | return FAM.getResult<AssumptionAnalysis>(F); |
1691 | 16 | }; |
1692 | 44 | auto GetTTI = [&](Function &F) -> TargetTransformInfo & { |
1693 | 8 | return FAM.getResult<TargetIRAnalysis>(F); |
1694 | 8 | }; |
1695 | 44 | |
1696 | 44 | SampleProfileLoader SampleLoader( |
1697 | 44 | ProfileFileName.empty() ? SampleProfileFile29 : ProfileFileName15 , |
1698 | 44 | ProfileRemappingFileName.empty() ? SampleProfileRemappingFile43 |
1699 | 44 | : ProfileRemappingFileName1 , |
1700 | 44 | IsThinLTOPreLink, GetAssumptionCache, GetTTI); |
1701 | 44 | |
1702 | 44 | SampleLoader.doInitialization(M); |
1703 | 44 | |
1704 | 44 | ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); |
1705 | 44 | if (!SampleLoader.runOnModule(M, &AM, PSI)) |
1706 | 7 | return PreservedAnalyses::all(); |
1707 | 37 | |
1708 | 37 | return PreservedAnalyses::none(); |
1709 | 37 | } |