/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Transforms/IPO/Inliner.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- Inliner.cpp - Code common to all inliners --------------------------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // This file implements the mechanics required to implement inlining without |
11 | | // missing any calls and updating the call graph. The decisions of which calls |
12 | | // are profitable to inline are implemented elsewhere. |
13 | | // |
14 | | //===----------------------------------------------------------------------===// |
15 | | |
16 | | #include "llvm/Transforms/IPO/Inliner.h" |
17 | | #include "llvm/ADT/SmallPtrSet.h" |
18 | | #include "llvm/ADT/Statistic.h" |
19 | | #include "llvm/Analysis/AliasAnalysis.h" |
20 | | #include "llvm/Analysis/AssumptionCache.h" |
21 | | #include "llvm/Analysis/BasicAliasAnalysis.h" |
22 | | #include "llvm/Analysis/BlockFrequencyInfo.h" |
23 | | #include "llvm/Analysis/CallGraph.h" |
24 | | #include "llvm/Analysis/InlineCost.h" |
25 | | #include "llvm/Analysis/OptimizationDiagnosticInfo.h" |
26 | | #include "llvm/Analysis/ProfileSummaryInfo.h" |
27 | | #include "llvm/Analysis/TargetLibraryInfo.h" |
28 | | #include "llvm/IR/CallSite.h" |
29 | | #include "llvm/IR/DataLayout.h" |
30 | | #include "llvm/IR/DiagnosticInfo.h" |
31 | | #include "llvm/IR/InstIterator.h" |
32 | | #include "llvm/IR/Instructions.h" |
33 | | #include "llvm/IR/IntrinsicInst.h" |
34 | | #include "llvm/IR/Module.h" |
35 | | #include "llvm/Support/Debug.h" |
36 | | #include "llvm/Support/raw_ostream.h" |
37 | | #include "llvm/Transforms/Utils/Cloning.h" |
38 | | #include "llvm/Transforms/Utils/Local.h" |
39 | | #include "llvm/Transforms/Utils/ModuleUtils.h" |
40 | | using namespace llvm; |
41 | | |
42 | 726k | #define DEBUG_TYPE "inline" |
43 | | |
44 | | STATISTIC(NumInlined, "Number of functions inlined"); |
45 | | STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined"); |
46 | | STATISTIC(NumDeleted, "Number of functions deleted because all callers found"); |
47 | | STATISTIC(NumMergedAllocas, "Number of allocas merged together"); |
48 | | |
49 | | // This weirdly named statistic tracks the number of times that, when attempting |
50 | | // to inline a function A into B, we analyze the callers of B in order to see |
51 | | // if those would be more profitable and blocked inline steps. |
52 | | STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed"); |
53 | | |
54 | | /// Flag to disable manual alloca merging. |
55 | | /// |
56 | | /// Merging of allocas was originally done as a stack-size saving technique |
57 | | /// prior to LLVM's code generator having support for stack coloring based on |
58 | | /// lifetime markers. It is now in the process of being removed. To experiment |
59 | | /// with disabling it and relying fully on lifetime marker based stack |
60 | | /// coloring, you can pass this flag to LLVM. |
61 | | static cl::opt<bool> |
62 | | DisableInlinedAllocaMerging("disable-inlined-alloca-merging", |
63 | | cl::init(false), cl::Hidden); |
64 | | |
65 | | namespace { |
66 | | enum class InlinerFunctionImportStatsOpts { |
67 | | No = 0, |
68 | | Basic = 1, |
69 | | Verbose = 2, |
70 | | }; |
71 | | |
72 | | cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats( |
73 | | "inliner-function-import-stats", |
74 | | cl::init(InlinerFunctionImportStatsOpts::No), |
75 | | cl::values(clEnumValN(InlinerFunctionImportStatsOpts::Basic, "basic", |
76 | | "basic statistics"), |
77 | | clEnumValN(InlinerFunctionImportStatsOpts::Verbose, "verbose", |
78 | | "printing of statistics for each inlined function")), |
79 | | cl::Hidden, cl::desc("Enable inliner stats for imported functions")); |
80 | | } // namespace |
81 | | |
82 | | LegacyInlinerBase::LegacyInlinerBase(char &ID) |
83 | 17.5k | : CallGraphSCCPass(ID), InsertLifetime(true) {} |
84 | | |
85 | | LegacyInlinerBase::LegacyInlinerBase(char &ID, bool InsertLifetime) |
86 | 6.73k | : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {} |
87 | | |
88 | | /// For this class, we declare that we require and preserve the call graph. |
89 | | /// If the derived class implements this method, it should |
90 | | /// always explicitly call the implementation here. |
91 | 24.2k | void LegacyInlinerBase::getAnalysisUsage(AnalysisUsage &AU) const { |
92 | 24.2k | AU.addRequired<AssumptionCacheTracker>(); |
93 | 24.2k | AU.addRequired<ProfileSummaryInfoWrapperPass>(); |
94 | 24.2k | AU.addRequired<TargetLibraryInfoWrapperPass>(); |
95 | 24.2k | getAAResultsAnalysisUsage(AU); |
96 | 24.2k | CallGraphSCCPass::getAnalysisUsage(AU); |
97 | 24.2k | } |
98 | | |
99 | | typedef DenseMap<ArrayType *, std::vector<AllocaInst *>> InlinedArrayAllocasTy; |
100 | | |
101 | | /// Look at all of the allocas that we inlined through this call site. If we |
102 | | /// have already inlined other allocas through other calls into this function, |
103 | | /// then we know that they have disjoint lifetimes and that we can merge them. |
104 | | /// |
105 | | /// There are many heuristics possible for merging these allocas, and the |
106 | | /// different options have different tradeoffs. One thing that we *really* |
107 | | /// don't want to hurt is SRoA: once inlining happens, often allocas are no |
108 | | /// longer address taken and so they can be promoted. |
109 | | /// |
110 | | /// Our "solution" for that is to only merge allocas whose outermost type is an |
111 | | /// array type. These are usually not promoted because someone is using a |
112 | | /// variable index into them. These are also often the most important ones to |
113 | | /// merge. |
114 | | /// |
115 | | /// A better solution would be to have real memory lifetime markers in the IR |
116 | | /// and not have the inliner do any merging of allocas at all. This would |
117 | | /// allow the backend to do proper stack slot coloring of all allocas that |
118 | | /// *actually make it to the backend*, which is really what we want. |
119 | | /// |
120 | | /// Because we don't have this information, we do this simple and useful hack. |
121 | | static void mergeInlinedArrayAllocas( |
122 | | Function *Caller, InlineFunctionInfo &IFI, |
123 | 251k | InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory) { |
124 | 251k | SmallPtrSet<AllocaInst *, 16> UsedAllocas; |
125 | 251k | |
126 | 251k | // When processing our SCC, check to see if CS was inlined from some other |
127 | 251k | // call site. For example, if we're processing "A" in this code: |
128 | 251k | // A() { B() } |
129 | 251k | // B() { x = alloca ... C() } |
130 | 251k | // C() { y = alloca ... } |
131 | 251k | // Assume that C was not inlined into B initially, and so we're processing A |
132 | 251k | // and decide to inline B into A. Doing this makes an alloca available for |
133 | 251k | // reuse and makes a callsite (C) available for inlining. When we process |
134 | 251k | // the C call site we don't want to do any alloca merging between X and Y |
135 | 251k | // because their scopes are not disjoint. We could make this smarter by |
136 | 251k | // keeping track of the inline history for each alloca in the |
137 | 251k | // InlinedArrayAllocas but this isn't likely to be a significant win. |
138 | 251k | if (InlineHistory != -1) // Only do merging for top-level call sites in SCC. |
139 | 6.03k | return; |
140 | 245k | |
141 | 245k | // Loop over all the allocas we have so far and see if they can be merged with |
142 | 245k | // a previously inlined alloca. If not, remember that we had it. |
143 | 325k | for (unsigned AllocaNo = 0, e = IFI.StaticAllocas.size(); 245k AllocaNo != e325k ; |
144 | 245k | ++AllocaNo80.1k ) { |
145 | 80.1k | AllocaInst *AI = IFI.StaticAllocas[AllocaNo]; |
146 | 80.1k | |
147 | 80.1k | // Don't bother trying to merge array allocations (they will usually be |
148 | 80.1k | // canonicalized to be an allocation *of* an array), or allocations whose |
149 | 80.1k | // type is not itself an array (because we're afraid of pessimizing SRoA). |
150 | 80.1k | ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType()); |
151 | 80.1k | if (!ATy || 80.1k AI->isArrayAllocation()1.20k ) |
152 | 78.8k | continue; |
153 | 1.20k | |
154 | 1.20k | // Get the list of all available allocas for this array type. |
155 | 1.20k | std::vector<AllocaInst *> &AllocasForType = InlinedArrayAllocas[ATy]; |
156 | 1.20k | |
157 | 1.20k | // Loop over the allocas in AllocasForType to see if we can reuse one. Note |
158 | 1.20k | // that we have to be careful not to reuse the same "available" alloca for |
159 | 1.20k | // multiple different allocas that we just inlined, we use the 'UsedAllocas' |
160 | 1.20k | // set to keep track of which "available" allocas are being used by this |
161 | 1.20k | // function. Also, AllocasForType can be empty of course! |
162 | 1.20k | bool MergedAwayAlloca = false; |
163 | 549 | for (AllocaInst *AvailableAlloca : AllocasForType) { |
164 | 549 | |
165 | 549 | unsigned Align1 = AI->getAlignment(), |
166 | 549 | Align2 = AvailableAlloca->getAlignment(); |
167 | 549 | |
168 | 549 | // The available alloca has to be in the right function, not in some other |
169 | 549 | // function in this SCC. |
170 | 549 | if (AvailableAlloca->getParent() != AI->getParent()) |
171 | 8 | continue; |
172 | 541 | |
173 | 541 | // If the inlined function already uses this alloca then we can't reuse |
174 | 541 | // it. |
175 | 541 | if (541 !UsedAllocas.insert(AvailableAlloca).second541 ) |
176 | 268 | continue; |
177 | 273 | |
178 | 273 | // Otherwise, we *can* reuse it, RAUW AI into AvailableAlloca and declare |
179 | 273 | // success! |
180 | 273 | DEBUG273 (dbgs() << " ***MERGED ALLOCA: " << *AI |
181 | 273 | << "\n\t\tINTO: " << *AvailableAlloca << '\n'); |
182 | 273 | |
183 | 273 | // Move affected dbg.declare calls immediately after the new alloca to |
184 | 273 | // avoid the situation when a dbg.declare precedes its alloca. |
185 | 273 | if (auto *L = LocalAsMetadata::getIfExists(AI)) |
186 | 1 | if (auto *1 MDV1 = MetadataAsValue::getIfExists(AI->getContext(), L)) |
187 | 1 | for (User *U : MDV->users()) |
188 | 1 | if (DbgDeclareInst *1 DDI1 = dyn_cast<DbgDeclareInst>(U)) |
189 | 1 | DDI->moveBefore(AvailableAlloca->getNextNode()); |
190 | 273 | |
191 | 273 | AI->replaceAllUsesWith(AvailableAlloca); |
192 | 273 | |
193 | 273 | if (Align1 != Align2273 ) { |
194 | 2 | if (!Align1 || 2 !Align22 ) { |
195 | 1 | const DataLayout &DL = Caller->getParent()->getDataLayout(); |
196 | 1 | unsigned TypeAlign = DL.getABITypeAlignment(AI->getAllocatedType()); |
197 | 1 | |
198 | 1 | Align1 = Align1 ? Align11 : TypeAlign0 ; |
199 | 1 | Align2 = Align2 ? Align20 : TypeAlign1 ; |
200 | 1 | } |
201 | 2 | |
202 | 2 | if (Align1 > Align2) |
203 | 2 | AvailableAlloca->setAlignment(AI->getAlignment()); |
204 | 2 | } |
205 | 549 | |
206 | 549 | AI->eraseFromParent(); |
207 | 549 | MergedAwayAlloca = true; |
208 | 549 | ++NumMergedAllocas; |
209 | 549 | IFI.StaticAllocas[AllocaNo] = nullptr; |
210 | 549 | break; |
211 | 549 | } |
212 | 1.20k | |
213 | 1.20k | // If we already nuked the alloca, we're done with it. |
214 | 1.20k | if (MergedAwayAlloca) |
215 | 273 | continue; |
216 | 934 | |
217 | 934 | // If we were unable to merge away the alloca either because there are no |
218 | 934 | // allocas of the right type available or because we reused them all |
219 | 934 | // already, remember that this alloca came from an inlined function and mark |
220 | 934 | // it used so we don't reuse it for other allocas from this inline |
221 | 934 | // operation. |
222 | 934 | AllocasForType.push_back(AI); |
223 | 934 | UsedAllocas.insert(AI); |
224 | 934 | } |
225 | 251k | } |
226 | | |
227 | | /// If it is possible to inline the specified call site, |
228 | | /// do so and update the CallGraph for this operation. |
229 | | /// |
230 | | /// This function also does some basic book-keeping to update the IR. The |
231 | | /// InlinedArrayAllocas map keeps track of any allocas that are already |
232 | | /// available from other functions inlined into the caller. If we are able to |
233 | | /// inline this call site we attempt to reuse already available allocas or add |
234 | | /// any new allocas to the set if not possible. |
235 | | static bool InlineCallIfPossible( |
236 | | CallSite CS, InlineFunctionInfo &IFI, |
237 | | InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory, |
238 | | bool InsertLifetime, function_ref<AAResults &(Function &)> &AARGetter, |
239 | 254k | ImportedFunctionsInliningStatistics &ImportedFunctionsStats) { |
240 | 254k | Function *Callee = CS.getCalledFunction(); |
241 | 254k | Function *Caller = CS.getCaller(); |
242 | 254k | |
243 | 254k | AAResults &AAR = AARGetter(*Callee); |
244 | 254k | |
245 | 254k | // Try to inline the function. Get the list of static allocas that were |
246 | 254k | // inlined. |
247 | 254k | if (!InlineFunction(CS, IFI, &AAR, InsertLifetime)) |
248 | 3.49k | return false; |
249 | 251k | |
250 | 251k | if (251k InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No251k ) |
251 | 30 | ImportedFunctionsStats.recordInline(*Caller, *Callee); |
252 | 251k | |
253 | 251k | AttributeFuncs::mergeAttributesForInlining(*Caller, *Callee); |
254 | 251k | |
255 | 251k | if (!DisableInlinedAllocaMerging) |
256 | 251k | mergeInlinedArrayAllocas(Caller, IFI, InlinedArrayAllocas, InlineHistory); |
257 | 254k | |
258 | 254k | return true; |
259 | 254k | } |
260 | | |
261 | | /// Return true if inlining of CS can block the caller from being |
262 | | /// inlined which is proved to be more beneficial. \p IC is the |
263 | | /// estimated inline cost associated with callsite \p CS. |
264 | | /// \p TotalSecondaryCost will be set to the estimated cost of inlining the |
265 | | /// caller if \p CS is suppressed for inlining. |
266 | | static bool |
267 | | shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC, |
268 | | int &TotalSecondaryCost, |
269 | 201k | function_ref<InlineCost(CallSite CS)> GetInlineCost) { |
270 | 201k | |
271 | 201k | // For now we only handle local or inline functions. |
272 | 201k | if (!Caller->hasLocalLinkage() && 201k !Caller->hasLinkOnceODRLinkage()150k ) |
273 | 112k | return false; |
274 | 89.0k | // Try to detect the case where the current inlining candidate caller (call |
275 | 89.0k | // it B) is a static or linkonce-ODR function and is an inlining candidate |
276 | 89.0k | // elsewhere, and the current candidate callee (call it C) is large enough |
277 | 89.0k | // that inlining it into B would make B too big to inline later. In these |
278 | 89.0k | // circumstances it may be best not to inline C into B, but to inline B into |
279 | 89.0k | // its callers. |
280 | 89.0k | // |
281 | 89.0k | // This only applies to static and linkonce-ODR functions because those are |
282 | 89.0k | // expected to be available for inlining in the translation units where they |
283 | 89.0k | // are used. Thus we will always have the opportunity to make local inlining |
284 | 89.0k | // decisions. Importantly the linkonce-ODR linkage covers inline functions |
285 | 89.0k | // and templates in C++. |
286 | 89.0k | // |
287 | 89.0k | // FIXME: All of this logic should be sunk into getInlineCost. It relies on |
288 | 89.0k | // the internal implementation of the inline cost metrics rather than |
289 | 89.0k | // treating them as truly abstract units etc. |
290 | 89.0k | TotalSecondaryCost = 0; |
291 | 89.0k | // The candidate cost to be imposed upon the current function. |
292 | 89.0k | int CandidateCost = IC.getCost() - 1; |
293 | 89.0k | // This bool tracks what happens if we do NOT inline C into B. |
294 | 89.0k | bool callerWillBeRemoved = Caller->hasLocalLinkage(); |
295 | 89.0k | // This bool tracks what happens if we DO inline C into B. |
296 | 89.0k | bool inliningPreventsSomeOuterInline = false; |
297 | 238k | for (User *U : Caller->users()) { |
298 | 238k | CallSite CS2(U); |
299 | 238k | |
300 | 238k | // If this isn't a call to Caller (it could be some other sort |
301 | 238k | // of reference) skip it. Such references will prevent the caller |
302 | 238k | // from being removed. |
303 | 238k | if (!CS2 || 238k CS2.getCalledFunction() != Caller232k ) { |
304 | 7.27k | callerWillBeRemoved = false; |
305 | 7.27k | continue; |
306 | 7.27k | } |
307 | 231k | |
308 | 231k | InlineCost IC2 = GetInlineCost(CS2); |
309 | 231k | ++NumCallerCallersAnalyzed; |
310 | 231k | if (!IC2231k ) { |
311 | 117k | callerWillBeRemoved = false; |
312 | 117k | continue; |
313 | 117k | } |
314 | 113k | if (113k IC2.isAlways()113k ) |
315 | 0 | continue; |
316 | 113k | |
317 | 113k | // See if inlining of the original callsite would erase the cost delta of |
318 | 113k | // this callsite. We subtract off the penalty for the call instruction, |
319 | 113k | // which we would be deleting. |
320 | 113k | if (113k IC2.getCostDelta() <= CandidateCost113k ) { |
321 | 9.22k | inliningPreventsSomeOuterInline = true; |
322 | 9.22k | TotalSecondaryCost += IC2.getCost(); |
323 | 9.22k | } |
324 | 238k | } |
325 | 89.0k | // If all outer calls to Caller would get inlined, the cost for the last |
326 | 89.0k | // one is set very low by getInlineCost, in anticipation that Caller will |
327 | 89.0k | // be removed entirely. We did not account for this above unless there |
328 | 89.0k | // is only one caller of Caller. |
329 | 89.0k | if (callerWillBeRemoved && 89.0k !Caller->hasOneUse()27.2k ) |
330 | 6.43k | TotalSecondaryCost -= InlineConstants::LastCallToStaticBonus; |
331 | 89.0k | |
332 | 89.0k | if (inliningPreventsSomeOuterInline && 89.0k TotalSecondaryCost < IC.getCost()3.12k ) |
333 | 2.27k | return true; |
334 | 86.7k | |
335 | 86.7k | return false; |
336 | 86.7k | } |
337 | | |
338 | | /// Return the cost only if the inliner should attempt to inline at the given |
339 | | /// CallSite. If we return the cost, we will emit an optimisation remark later |
340 | | /// using that cost, so we won't do so from this function. |
341 | | static Optional<InlineCost> |
342 | | shouldInline(CallSite CS, function_ref<InlineCost(CallSite CS)> GetInlineCost, |
343 | 1.86M | OptimizationRemarkEmitter &ORE) { |
344 | 1.86M | using namespace ore; |
345 | 1.86M | InlineCost IC = GetInlineCost(CS); |
346 | 1.86M | Instruction *Call = CS.getInstruction(); |
347 | 1.86M | Function *Callee = CS.getCalledFunction(); |
348 | 1.86M | Function *Caller = CS.getCaller(); |
349 | 1.86M | |
350 | 1.86M | if (IC.isAlways()1.86M ) { |
351 | 60.0k | DEBUG(dbgs() << " Inlining: cost=always" |
352 | 60.0k | << ", Call: " << *CS.getInstruction() << "\n"); |
353 | 60.0k | return IC; |
354 | 60.0k | } |
355 | 1.80M | |
356 | 1.80M | if (1.80M IC.isNever()1.80M ) { |
357 | 1.52M | DEBUG(dbgs() << " NOT Inlining: cost=never" |
358 | 1.52M | << ", Call: " << *CS.getInstruction() << "\n"); |
359 | 30 | ORE.emit([&]() { |
360 | 30 | return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) |
361 | 30 | << NV("Callee", Callee) << " not inlined into " |
362 | 30 | << NV("Caller", Caller) |
363 | 30 | << " because it should never be inlined (cost=never)"; |
364 | 30 | }); |
365 | 1.52M | return None; |
366 | 1.52M | } |
367 | 284k | |
368 | 284k | if (284k !IC284k ) { |
369 | 83.3k | DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost() |
370 | 83.3k | << ", thres=" << IC.getThreshold() |
371 | 83.3k | << ", Call: " << *CS.getInstruction() << "\n"); |
372 | 0 | ORE.emit([&]() { |
373 | 0 | return OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call) |
374 | 0 | << NV("Callee", Callee) << " not inlined into " |
375 | 0 | << NV("Caller", Caller) << " because too costly to inline (cost=" |
376 | 0 | << NV("Cost", IC.getCost()) |
377 | 0 | << ", threshold=" << NV("Threshold", IC.getThreshold()) << ")"; |
378 | 0 | }); |
379 | 83.3k | return None; |
380 | 83.3k | } |
381 | 201k | |
382 | 201k | int TotalSecondaryCost = 0; |
383 | 201k | if (shouldBeDeferred(Caller, CS, IC, TotalSecondaryCost, GetInlineCost)201k ) { |
384 | 2.27k | DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction() |
385 | 2.27k | << " Cost = " << IC.getCost() |
386 | 2.27k | << ", outer Cost = " << TotalSecondaryCost << '\n'); |
387 | 2.27k | ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts", |
388 | 2.27k | Call) |
389 | 2.27k | << "Not inlining. Cost of inlining " << NV("Callee", Callee) |
390 | 2.27k | << " increases the cost of inlining " << NV("Caller", Caller) |
391 | 2.27k | << " in other contexts"); |
392 | 2.27k | |
393 | 2.27k | // IC does not bool() to false, so get an InlineCost that will. |
394 | 2.27k | // This will not be inspected to make an error message. |
395 | 2.27k | return None; |
396 | 2.27k | } |
397 | 198k | |
398 | 198k | DEBUG198k (dbgs() << " Inlining: cost=" << IC.getCost() |
399 | 198k | << ", thres=" << IC.getThreshold() |
400 | 198k | << ", Call: " << *CS.getInstruction() << '\n'); |
401 | 198k | return IC; |
402 | 198k | } |
403 | | |
404 | | /// Return true if the specified inline history ID |
405 | | /// indicates an inline history that includes the specified function. |
406 | | static bool InlineHistoryIncludes( |
407 | | Function *F, int InlineHistoryID, |
408 | 82.7k | const SmallVectorImpl<std::pair<Function *, int>> &InlineHistory) { |
409 | 181k | while (InlineHistoryID != -1181k ) { |
410 | 98.8k | assert(unsigned(InlineHistoryID) < InlineHistory.size() && |
411 | 98.8k | "Invalid inline history ID"); |
412 | 98.8k | if (InlineHistory[InlineHistoryID].first == F) |
413 | 244 | return true; |
414 | 98.5k | InlineHistoryID = InlineHistory[InlineHistoryID].second; |
415 | 98.5k | } |
416 | 82.5k | return false; |
417 | 82.7k | } |
418 | | |
419 | 24.2k | bool LegacyInlinerBase::doInitialization(CallGraph &CG) { |
420 | 24.2k | if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No) |
421 | 2 | ImportedFunctionsStats.setModuleInfo(CG.getModule()); |
422 | 24.2k | return false; // No changes to CallGraph. |
423 | 24.2k | } |
424 | | |
425 | 756k | bool LegacyInlinerBase::runOnSCC(CallGraphSCC &SCC) { |
426 | 756k | if (skipSCC(SCC)) |
427 | 43 | return false; |
428 | 756k | return inlineCalls(SCC); |
429 | 756k | } |
430 | | |
431 | | static bool |
432 | | inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, |
433 | | std::function<AssumptionCache &(Function &)> GetAssumptionCache, |
434 | | ProfileSummaryInfo *PSI, TargetLibraryInfo &TLI, |
435 | | bool InsertLifetime, |
436 | | function_ref<InlineCost(CallSite CS)> GetInlineCost, |
437 | | function_ref<AAResults &(Function &)> AARGetter, |
438 | 879k | ImportedFunctionsInliningStatistics &ImportedFunctionsStats) { |
439 | 879k | SmallPtrSet<Function *, 8> SCCFunctions; |
440 | 879k | DEBUG(dbgs() << "Inliner visiting SCC:"); |
441 | 879k | for (CallGraphNode *Node : SCC) { |
442 | 879k | Function *F = Node->getFunction(); |
443 | 879k | if (F) |
444 | 839k | SCCFunctions.insert(F); |
445 | 879k | DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE")); |
446 | 879k | } |
447 | 879k | |
448 | 879k | // Scan through and identify all call sites ahead of time so that we only |
449 | 879k | // inline call sites in the original functions, not call sites that result |
450 | 879k | // from inlining other functions. |
451 | 879k | SmallVector<std::pair<CallSite, int>, 16> CallSites; |
452 | 879k | |
453 | 879k | // When inlining a callee produces new call sites, we want to keep track of |
454 | 879k | // the fact that they were inlined from the callee. This allows us to avoid |
455 | 879k | // infinite inlining in some obscure cases. To represent this, we use an |
456 | 879k | // index into the InlineHistory vector. |
457 | 879k | SmallVector<std::pair<Function *, int>, 8> InlineHistory; |
458 | 879k | |
459 | 879k | for (CallGraphNode *Node : SCC) { |
460 | 879k | Function *F = Node->getFunction(); |
461 | 879k | if (!F || 879k F->isDeclaration()839k ) |
462 | 277k | continue; |
463 | 602k | |
464 | 602k | OptimizationRemarkEmitter ORE(F); |
465 | 602k | for (BasicBlock &BB : *F) |
466 | 3.39M | for (Instruction &I : BB) 3.39M { |
467 | 17.7M | CallSite CS(cast<Value>(&I)); |
468 | 17.7M | // If this isn't a call, or it is a call to an intrinsic, it can |
469 | 17.7M | // never be inlined. |
470 | 17.7M | if (!CS || 17.7M isa<IntrinsicInst>(I)2.75M ) |
471 | 15.2M | continue; |
472 | 2.52M | |
473 | 2.52M | // If this is a direct call to an external function, we can never inline |
474 | 2.52M | // it. If it is an indirect call, inlining may resolve it to be a |
475 | 2.52M | // direct call, so we keep it. |
476 | 2.52M | if (Function *2.52M Callee2.52M = CS.getCalledFunction()) |
477 | 2.41M | if (2.41M Callee->isDeclaration()2.41M ) { |
478 | 660k | using namespace ore; |
479 | 660k | ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I) |
480 | 2.41M | << NV("Callee", Callee) << " will not be inlined into " |
481 | 2.41M | << NV("Caller", CS.getCaller()) |
482 | 2.41M | << " because its definition is unavailable" |
483 | 2.41M | << setIsVerbose()); |
484 | 2.41M | continue; |
485 | 2.41M | } |
486 | 1.85M | |
487 | 1.85M | CallSites.push_back(std::make_pair(CS, -1)); |
488 | 1.85M | } |
489 | 879k | } |
490 | 879k | |
491 | 879k | DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n"); |
492 | 879k | |
493 | 879k | // If there are no calls in this function, exit early. |
494 | 879k | if (CallSites.empty()) |
495 | 535k | return false; |
496 | 343k | |
497 | 343k | // Now that we have all of the call sites, move the ones to functions in the |
498 | 343k | // current SCC to the end of the list. |
499 | 343k | unsigned FirstCallInSCC = CallSites.size(); |
500 | 2.20M | for (unsigned i = 0; i < FirstCallInSCC2.20M ; ++i1.85M ) |
501 | 1.85M | if (Function *1.85M F1.85M = CallSites[i].first.getCalledFunction()) |
502 | 1.74M | if (1.74M SCCFunctions.count(F)1.74M ) |
503 | 7.20k | std::swap(CallSites[i--], CallSites[--FirstCallInSCC]); |
504 | 343k | |
505 | 343k | InlinedArrayAllocasTy InlinedArrayAllocas; |
506 | 343k | InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache, PSI); |
507 | 343k | |
508 | 343k | // Now that we have all of the call sites, loop over them and inline them if |
509 | 343k | // it looks profitable to do so. |
510 | 343k | bool Changed = false; |
511 | 343k | bool LocalChange; |
512 | 411k | do { |
513 | 411k | LocalChange = false; |
514 | 411k | // Iterate over the outer loop because inlining functions can cause indirect |
515 | 411k | // calls to become direct calls. |
516 | 411k | // CallSites may be modified inside so ranged for loop can not be used. |
517 | 2.67M | for (unsigned CSi = 0; CSi != CallSites.size()2.67M ; ++CSi2.26M ) { |
518 | 2.26M | CallSite CS = CallSites[CSi].first; |
519 | 2.26M | |
520 | 2.26M | Function *Caller = CS.getCaller(); |
521 | 2.26M | Function *Callee = CS.getCalledFunction(); |
522 | 2.26M | |
523 | 2.26M | // We can only inline direct calls to non-declarations. |
524 | 2.26M | if (!Callee || 2.26M Callee->isDeclaration()2.11M ) |
525 | 397k | continue; |
526 | 1.86M | |
527 | 1.86M | Instruction *Instr = CS.getInstruction(); |
528 | 1.86M | |
529 | 1.86M | bool IsTriviallyDead = isInstructionTriviallyDead(Instr, &TLI); |
530 | 1.86M | |
531 | 1.86M | int InlineHistoryID; |
532 | 1.86M | if (!IsTriviallyDead1.86M ) { |
533 | 1.86M | // If this call site was obtained by inlining another function, verify |
534 | 1.86M | // that the include path for the function did not include the callee |
535 | 1.86M | // itself. If so, we'd be recursively inlining the same function, |
536 | 1.86M | // which would provide the same callsites, which would cause us to |
537 | 1.86M | // infinitely inline. |
538 | 1.86M | InlineHistoryID = CallSites[CSi].second; |
539 | 1.86M | if (InlineHistoryID != -1 && |
540 | 82.7k | InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory)) |
541 | 242 | continue; |
542 | 1.86M | } |
543 | 1.86M | |
544 | 1.86M | // FIXME for new PM: because of the old PM we currently generate ORE and |
545 | 1.86M | // in turn BFI on demand. With the new PM, the ORE dependency should |
546 | 1.86M | // just become a regular analysis dependency. |
547 | 1.86M | OptimizationRemarkEmitter ORE(Caller); |
548 | 1.86M | |
549 | 1.86M | Optional<InlineCost> OIC = shouldInline(CS, GetInlineCost, ORE); |
550 | 1.86M | // If the policy determines that we should inline this function, |
551 | 1.86M | // delete the call instead. |
552 | 1.86M | if (!OIC) |
553 | 1.60M | continue; |
554 | 258k | |
555 | 258k | // If this call site is dead and it is to a readonly function, we should |
556 | 258k | // just delete the call instead of trying to inline it, regardless of |
557 | 258k | // size. This happens because IPSCCP propagates the result out of the |
558 | 258k | // call and then we're left with the dead call. |
559 | 258k | if (258k IsTriviallyDead258k ) { |
560 | 3.93k | DEBUG(dbgs() << " -> Deleting dead call: " << *Instr << "\n"); |
561 | 3.93k | // Update the call graph by deleting the edge from Callee to Caller. |
562 | 3.93k | CG[Caller]->removeCallEdgeFor(CS); |
563 | 3.93k | Instr->eraseFromParent(); |
564 | 3.93k | ++NumCallsDeleted; |
565 | 258k | } else { |
566 | 254k | // Get DebugLoc to report. CS will be invalid after Inliner. |
567 | 254k | DebugLoc DLoc = CS->getDebugLoc(); |
568 | 254k | BasicBlock *Block = CS.getParent(); |
569 | 254k | |
570 | 254k | // Attempt to inline the function. |
571 | 254k | using namespace ore; |
572 | 254k | if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas, |
573 | 254k | InlineHistoryID, InsertLifetime, AARGetter, |
574 | 254k | ImportedFunctionsStats)) { |
575 | 3.49k | ORE.emit( |
576 | 3.49k | OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block) |
577 | 3.49k | << NV("Callee", Callee) << " will not be inlined into " |
578 | 3.49k | << NV("Caller", Caller)); |
579 | 3.49k | continue; |
580 | 3.49k | } |
581 | 251k | ++NumInlined; |
582 | 251k | |
583 | 251k | if (OIC->isAlways()) |
584 | 59.8k | ORE.emit(OptimizationRemark(59.8k DEBUG_TYPE59.8k , "AlwaysInline", DLoc, Block) |
585 | 59.8k | << NV("Callee", Callee) << " inlined into " |
586 | 59.8k | << NV("Caller", Caller) << " with cost=always"); |
587 | 251k | else |
588 | 191k | ORE.emit([&]() 191k { |
589 | 18 | return OptimizationRemark(DEBUG_TYPE, "Inlined", DLoc, Block) |
590 | 18 | << NV("Callee", Callee) << " inlined into " |
591 | 18 | << NV("Caller", Caller) |
592 | 18 | << " with cost=" << NV("Cost", OIC->getCost()) |
593 | 18 | << " (threshold=" << NV("Threshold", OIC->getThreshold()) |
594 | 18 | << ")"; |
595 | 191k | }); |
596 | 251k | |
597 | 251k | // If inlining this function gave us any new call sites, throw them |
598 | 251k | // onto our worklist to process. They are useful inline candidates. |
599 | 251k | if (!InlineInfo.InlinedCalls.empty()251k ) { |
600 | 74.7k | // Create a new inline history entry for this, so that we remember |
601 | 74.7k | // that these new callsites came about due to inlining Callee. |
602 | 74.7k | int NewHistoryID = InlineHistory.size(); |
603 | 74.7k | InlineHistory.push_back(std::make_pair(Callee, InlineHistoryID)); |
604 | 74.7k | |
605 | 74.7k | for (Value *Ptr : InlineInfo.InlinedCalls) |
606 | 179k | CallSites.push_back(std::make_pair(CallSite(Ptr), NewHistoryID)); |
607 | 74.7k | } |
608 | 254k | } |
609 | 258k | |
610 | 258k | // If we inlined or deleted the last possible call site to the function, |
611 | 258k | // delete the function body now. |
612 | 255k | if (255k Callee && 255k Callee->use_empty()255k && Callee->hasLocalLinkage()83.1k && |
613 | 255k | // TODO: Can remove if in SCC now. |
614 | 39.8k | !SCCFunctions.count(Callee) && |
615 | 255k | |
616 | 255k | // The function may be apparently dead, but if there are indirect |
617 | 255k | // callgraph references to the node, we cannot delete it yet, this |
618 | 255k | // could invalidate the CGSCC iterator. |
619 | 255k | CG[Callee]->getNumReferences() == 039.6k ) { |
620 | 39.5k | DEBUG(dbgs() << " -> Deleting dead function: " << Callee->getName() |
621 | 39.5k | << "\n"); |
622 | 39.5k | CallGraphNode *CalleeNode = CG[Callee]; |
623 | 39.5k | |
624 | 39.5k | // Remove any call graph edges from the callee to its callees. |
625 | 39.5k | CalleeNode->removeAllCalledFunctions(); |
626 | 39.5k | |
627 | 39.5k | // Removing the node for callee from the call graph and delete it. |
628 | 39.5k | delete CG.removeFunctionFromModule(CalleeNode); |
629 | 39.5k | ++NumDeleted; |
630 | 39.5k | } |
631 | 255k | |
632 | 255k | // Remove this call site from the list. If possible, use |
633 | 255k | // swap/pop_back for efficiency, but do not use it if doing so would |
634 | 255k | // move a call site to a function in this SCC before the |
635 | 255k | // 'FirstCallInSCC' barrier. |
636 | 255k | if (SCC.isSingular()255k ) { |
637 | 245k | CallSites[CSi] = CallSites.back(); |
638 | 245k | CallSites.pop_back(); |
639 | 255k | } else { |
640 | 10.0k | CallSites.erase(CallSites.begin() + CSi); |
641 | 10.0k | } |
642 | 2.26M | --CSi; |
643 | 2.26M | |
644 | 2.26M | Changed = true; |
645 | 2.26M | LocalChange = true; |
646 | 2.26M | } |
647 | 411k | } while (LocalChange); |
648 | 879k | |
649 | 879k | return Changed; |
650 | 879k | } |
651 | | |
652 | 879k | bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) { |
653 | 879k | CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); |
654 | 879k | ACT = &getAnalysis<AssumptionCacheTracker>(); |
655 | 879k | PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); |
656 | 879k | auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); |
657 | 259k | auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { |
658 | 259k | return ACT->getAssumptionCache(F); |
659 | 259k | }; |
660 | 879k | return inlineCallsImpl(SCC, CG, GetAssumptionCache, PSI, TLI, InsertLifetime, |
661 | 2.09M | [this](CallSite CS) { return getInlineCost(CS); }, |
662 | 879k | LegacyAARGetter(*this), ImportedFunctionsStats); |
663 | 879k | } |
664 | | |
665 | | /// Remove now-dead linkonce functions at the end of |
666 | | /// processing to avoid breaking the SCC traversal. |
667 | 17.5k | bool LegacyInlinerBase::doFinalization(CallGraph &CG) { |
668 | 17.5k | if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No) |
669 | 2 | ImportedFunctionsStats.dump(InlinerFunctionImportStats == |
670 | 2 | InlinerFunctionImportStatsOpts::Verbose); |
671 | 17.5k | return removeDeadFunctions(CG); |
672 | 17.5k | } |
673 | | |
674 | | /// Remove dead functions that are not included in DNR (Do Not Remove) list. |
675 | | bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG, |
676 | 24.2k | bool AlwaysInlineOnly) { |
677 | 24.2k | SmallVector<CallGraphNode *, 16> FunctionsToRemove; |
678 | 24.2k | SmallVector<Function *, 16> DeadFunctionsInComdats; |
679 | 24.2k | |
680 | 30.2k | auto RemoveCGN = [&](CallGraphNode *CGN) { |
681 | 30.2k | // Remove any call graph edges from the function to its callees. |
682 | 30.2k | CGN->removeAllCalledFunctions(); |
683 | 30.2k | |
684 | 30.2k | // Remove any edges from the external node to the function's call graph |
685 | 30.2k | // node. These edges might have been made irrelegant due to |
686 | 30.2k | // optimization of the program. |
687 | 30.2k | CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN); |
688 | 30.2k | |
689 | 30.2k | // Removing the node for callee from the call graph and delete it. |
690 | 30.2k | FunctionsToRemove.push_back(CGN); |
691 | 30.2k | }; |
692 | 24.2k | |
693 | 24.2k | // Scan for all of the functions, looking for ones that should now be removed |
694 | 24.2k | // from the program. Insert the dead ones in the FunctionsToRemove set. |
695 | 823k | for (const auto &I : CG) { |
696 | 823k | CallGraphNode *CGN = I.second.get(); |
697 | 823k | Function *F = CGN->getFunction(); |
698 | 823k | if (!F || 823k F->isDeclaration()799k ) |
699 | 260k | continue; |
700 | 563k | |
701 | 563k | // Handle the case when this function is called and we only want to care |
702 | 563k | // about always-inline functions. This is a bit of a hack to share code |
703 | 563k | // between here and the InlineAlways pass. |
704 | 563k | if (563k AlwaysInlineOnly && 563k !F->hasFnAttribute(Attribute::AlwaysInline)70.5k ) |
705 | 70.3k | continue; |
706 | 492k | |
707 | 492k | // If the only remaining users of the function are dead constants, remove |
708 | 492k | // them. |
709 | 492k | F->removeDeadConstantUsers(); |
710 | 492k | |
711 | 492k | if (!F->isDefTriviallyDead()) |
712 | 462k | continue; |
713 | 30.2k | |
714 | 30.2k | // It is unsafe to drop a function with discardable linkage from a COMDAT |
715 | 30.2k | // without also dropping the other members of the COMDAT. |
716 | 30.2k | // The inliner doesn't visit non-function entities which are in COMDAT |
717 | 30.2k | // groups so it is unsafe to do so *unless* the linkage is local. |
718 | 30.2k | if (30.2k !F->hasLocalLinkage()30.2k ) { |
719 | 29.8k | if (F->hasComdat()29.8k ) { |
720 | 57 | DeadFunctionsInComdats.push_back(F); |
721 | 57 | continue; |
722 | 57 | } |
723 | 30.1k | } |
724 | 30.1k | |
725 | 30.1k | RemoveCGN(CGN); |
726 | 30.1k | } |
727 | 24.2k | if (!DeadFunctionsInComdats.empty()24.2k ) { |
728 | 20 | // Filter out the functions whose comdats remain alive. |
729 | 20 | filterDeadComdatFunctions(CG.getModule(), DeadFunctionsInComdats); |
730 | 20 | // Remove the rest. |
731 | 20 | for (Function *F : DeadFunctionsInComdats) |
732 | 53 | RemoveCGN(CG[F]); |
733 | 20 | } |
734 | 24.2k | |
735 | 24.2k | if (FunctionsToRemove.empty()) |
736 | 23.0k | return false; |
737 | 1.24k | |
738 | 1.24k | // Now that we know which functions to delete, do so. We didn't want to do |
739 | 1.24k | // this inline, because that would invalidate our CallGraph::iterator |
740 | 1.24k | // objects. :( |
741 | 1.24k | // |
742 | 1.24k | // Note that it doesn't matter that we are iterating over a non-stable order |
743 | 1.24k | // here to do this, it doesn't matter which order the functions are deleted |
744 | 1.24k | // in. |
745 | 1.24k | array_pod_sort(FunctionsToRemove.begin(), FunctionsToRemove.end()); |
746 | 1.24k | FunctionsToRemove.erase( |
747 | 1.24k | std::unique(FunctionsToRemove.begin(), FunctionsToRemove.end()), |
748 | 1.24k | FunctionsToRemove.end()); |
749 | 30.2k | for (CallGraphNode *CGN : FunctionsToRemove) { |
750 | 30.2k | delete CG.removeFunctionFromModule(CGN); |
751 | 30.2k | ++NumDeleted; |
752 | 30.2k | } |
753 | 24.2k | return true; |
754 | 24.2k | } |
755 | | |
756 | | PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, |
757 | | CGSCCAnalysisManager &AM, LazyCallGraph &CG, |
758 | 605 | CGSCCUpdateResult &UR) { |
759 | 605 | const ModuleAnalysisManager &MAM = |
760 | 605 | AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG).getManager(); |
761 | 605 | bool Changed = false; |
762 | 605 | |
763 | 605 | assert(InitialC.size() > 0 && "Cannot handle an empty SCC!"); |
764 | 605 | Module &M = *InitialC.begin()->getFunction().getParent(); |
765 | 605 | ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M); |
766 | 605 | |
767 | 605 | // We use a single common worklist for calls across the entire SCC. We |
768 | 605 | // process these in-order and append new calls introduced during inlining to |
769 | 605 | // the end. |
770 | 605 | // |
771 | 605 | // Note that this particular order of processing is actually critical to |
772 | 605 | // avoid very bad behaviors. Consider *highly connected* call graphs where |
773 | 605 | // each function contains a small amonut of code and a couple of calls to |
774 | 605 | // other functions. Because the LLVM inliner is fundamentally a bottom-up |
775 | 605 | // inliner, it can handle gracefully the fact that these all appear to be |
776 | 605 | // reasonable inlining candidates as it will flatten things until they become |
777 | 605 | // too big to inline, and then move on and flatten another batch. |
778 | 605 | // |
779 | 605 | // However, when processing call edges *within* an SCC we cannot rely on this |
780 | 605 | // bottom-up behavior. As a consequence, with heavily connected *SCCs* of |
781 | 605 | // functions we can end up incrementally inlining N calls into each of |
782 | 605 | // N functions because each incremental inlining decision looks good and we |
783 | 605 | // don't have a topological ordering to prevent explosions. |
784 | 605 | // |
785 | 605 | // To compensate for this, we don't process transitive edges made immediate |
786 | 605 | // by inlining until we've done one pass of inlining across the entire SCC. |
787 | 605 | // Large, highly connected SCCs still lead to some amount of code bloat in |
788 | 605 | // this model, but it is uniformly spread across all the functions in the SCC |
789 | 605 | // and eventually they all become too large to inline, rather than |
790 | 605 | // incrementally maknig a single function grow in a super linear fashion. |
791 | 605 | SmallVector<std::pair<CallSite, int>, 16> Calls; |
792 | 605 | |
793 | 605 | // Populate the initial list of calls in this SCC. |
794 | 644 | for (auto &N : InitialC) { |
795 | 644 | // We want to generally process call sites top-down in order for |
796 | 644 | // simplifications stemming from replacing the call with the returned value |
797 | 644 | // after inlining to be visible to subsequent inlining decisions. |
798 | 644 | // FIXME: Using instructions sequence is a really bad way to do this. |
799 | 644 | // Instead we should do an actual RPO walk of the function body. |
800 | 644 | for (Instruction &I : instructions(N.getFunction())) |
801 | 5.57k | if (auto 5.57k CS5.57k = CallSite(&I)) |
802 | 2.91k | if (Function *2.91k Callee2.91k = CS.getCalledFunction()) |
803 | 2.87k | if (2.87k !Callee->isDeclaration()2.87k ) |
804 | 470 | Calls.push_back({CS, -1}); |
805 | 644 | } |
806 | 605 | if (Calls.empty()) |
807 | 305 | return PreservedAnalyses::all(); |
808 | 300 | |
809 | 300 | // Capture updatable variables for the current SCC and RefSCC. |
810 | 300 | auto *C = &InitialC; |
811 | 300 | auto *RC = &C->getOuterRefSCC(); |
812 | 300 | |
813 | 300 | // When inlining a callee produces new call sites, we want to keep track of |
814 | 300 | // the fact that they were inlined from the callee. This allows us to avoid |
815 | 300 | // infinite inlining in some obscure cases. To represent this, we use an |
816 | 300 | // index into the InlineHistory vector. |
817 | 300 | SmallVector<std::pair<Function *, int>, 16> InlineHistory; |
818 | 300 | |
819 | 300 | // Track a set vector of inlined callees so that we can augment the caller |
820 | 300 | // with all of their edges in the call graph before pruning out the ones that |
821 | 300 | // got simplified away. |
822 | 300 | SmallSetVector<Function *, 4> InlinedCallees; |
823 | 300 | |
824 | 300 | // Track the dead functions to delete once finished with inlining calls. We |
825 | 300 | // defer deleting these to make it easier to handle the call graph updates. |
826 | 300 | SmallVector<Function *, 4> DeadFunctions; |
827 | 300 | |
828 | 300 | // Loop forward over all of the calls. Note that we cannot cache the size as |
829 | 300 | // inlining can introduce new calls that need to be processed. |
830 | 659 | for (int i = 0; i < (int)Calls.size()659 ; ++i359 ) { |
831 | 359 | // We expect the calls to typically be batched with sequences of calls that |
832 | 359 | // have the same caller, so we first set up some shared infrastructure for |
833 | 359 | // this caller. We also do any pruning we can at this layer on the caller |
834 | 359 | // alone. |
835 | 359 | Function &F = *Calls[i].first.getCaller(); |
836 | 359 | LazyCallGraph::Node &N = *CG.lookup(F); |
837 | 359 | if (CG.lookupSCC(N) != C) |
838 | 8 | continue; |
839 | 351 | if (351 F.hasFnAttribute(Attribute::OptimizeNone)351 ) |
840 | 0 | continue; |
841 | 351 | |
842 | 351 | DEBUG351 (dbgs() << "Inlining calls in: " << F.getName() << "\n"); |
843 | 351 | |
844 | 351 | // Get a FunctionAnalysisManager via a proxy for this particular node. We |
845 | 351 | // do this each time we visit a node as the SCC may have changed and as |
846 | 351 | // we're going to mutate this particular function we want to make sure the |
847 | 351 | // proxy is in place to forward any invalidation events. We can use the |
848 | 351 | // manager we get here for looking up results for functions other than this |
849 | 351 | // node however because those functions aren't going to be mutated by this |
850 | 351 | // pass. |
851 | 351 | FunctionAnalysisManager &FAM = |
852 | 351 | AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, CG) |
853 | 351 | .getManager(); |
854 | 351 | |
855 | 351 | // Get the remarks emission analysis for the caller. |
856 | 351 | auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); |
857 | 351 | |
858 | 351 | std::function<AssumptionCache &(Function &)> GetAssumptionCache = |
859 | 755 | [&](Function &F) -> AssumptionCache & { |
860 | 755 | return FAM.getResult<AssumptionAnalysis>(F); |
861 | 755 | }; |
862 | 428 | auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & { |
863 | 428 | return FAM.getResult<BlockFrequencyAnalysis>(F); |
864 | 428 | }; |
865 | 351 | |
866 | 543 | auto GetInlineCost = [&](CallSite CS) { |
867 | 543 | Function &Callee = *CS.getCalledFunction(); |
868 | 543 | auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee); |
869 | 543 | return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, {GetBFI}, |
870 | 543 | PSI, &ORE); |
871 | 543 | }; |
872 | 351 | |
873 | 351 | // Now process as many calls as we have within this caller in the sequnece. |
874 | 351 | // We bail out as soon as the caller has to change so we can update the |
875 | 351 | // call graph and prepare the context of that new caller. |
876 | 351 | bool DidInline = false; |
877 | 877 | for (; i < (int)Calls.size() && 877 Calls[i].first.getCaller() == &F579 ; ++i526 ) { |
878 | 526 | int InlineHistoryID; |
879 | 526 | CallSite CS; |
880 | 526 | std::tie(CS, InlineHistoryID) = Calls[i]; |
881 | 526 | Function &Callee = *CS.getCalledFunction(); |
882 | 526 | |
883 | 526 | if (InlineHistoryID != -1 && |
884 | 68 | InlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) |
885 | 2 | continue; |
886 | 524 | |
887 | 524 | // Check if this inlining may repeat breaking an SCC apart that has |
888 | 524 | // already been split once before. In that case, inlining here may |
889 | 524 | // trigger infinite inlining, much like is prevented within the inliner |
890 | 524 | // itself by the InlineHistory above, but spread across CGSCC iterations |
891 | 524 | // and thus hidden from the full inline history. |
892 | 524 | if (524 CG.lookupSCC(*CG.lookup(Callee)) == C && |
893 | 524 | UR.InlinedInternalEdges.count({&N, C})166 ) { |
894 | 4 | DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node " |
895 | 4 | "previously split out of this SCC by inlining: " |
896 | 4 | << F.getName() << " -> " << Callee.getName() << "\n"); |
897 | 4 | continue; |
898 | 4 | } |
899 | 520 | |
900 | 520 | Optional<InlineCost> OIC = shouldInline(CS, GetInlineCost, ORE); |
901 | 520 | // Check whether we want to inline this callsite. |
902 | 520 | if (!OIC) |
903 | 210 | continue; |
904 | 310 | |
905 | 310 | // Setup the data structure used to plumb customization into the |
906 | 310 | // `InlineFunction` routine. |
907 | 310 | InlineFunctionInfo IFI( |
908 | 310 | /*cg=*/nullptr, &GetAssumptionCache, PSI, |
909 | 310 | &FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())), |
910 | 310 | &FAM.getResult<BlockFrequencyAnalysis>(Callee)); |
911 | 310 | |
912 | 310 | // Get DebugLoc to report. CS will be invalid after Inliner. |
913 | 310 | DebugLoc DLoc = CS->getDebugLoc(); |
914 | 310 | BasicBlock *Block = CS.getParent(); |
915 | 310 | |
916 | 310 | using namespace ore; |
917 | 310 | if (!InlineFunction(CS, IFI)310 ) { |
918 | 1 | ORE.emit( |
919 | 1 | OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block) |
920 | 1 | << NV("Callee", &Callee) << " will not be inlined into " |
921 | 1 | << NV("Caller", &F)); |
922 | 1 | continue; |
923 | 1 | } |
924 | 309 | DidInline = true; |
925 | 309 | InlinedCallees.insert(&Callee); |
926 | 309 | |
927 | 309 | if (OIC->isAlways()) |
928 | 52 | ORE.emit(OptimizationRemark(52 DEBUG_TYPE52 , "AlwaysInline", DLoc, Block) |
929 | 52 | << NV("Callee", &Callee) << " inlined into " |
930 | 52 | << NV("Caller", &F) << " with cost=always"); |
931 | 309 | else |
932 | 257 | ORE.emit( |
933 | 257 | OptimizationRemark(DEBUG_TYPE, "Inlined", DLoc, Block) |
934 | 257 | << NV("Callee", &Callee) << " inlined into " << NV("Caller", &F) |
935 | 257 | << " with cost=" << NV("Cost", OIC->getCost()) |
936 | 257 | << " (threshold=" << NV("Threshold", OIC->getThreshold()) << ")"); |
937 | 309 | |
938 | 309 | // Add any new callsites to defined functions to the worklist. |
939 | 309 | if (!IFI.InlinedCallSites.empty()309 ) { |
940 | 176 | int NewHistoryID = InlineHistory.size(); |
941 | 176 | InlineHistory.push_back({&Callee, InlineHistoryID}); |
942 | 176 | for (CallSite &CS : reverse(IFI.InlinedCallSites)) |
943 | 939 | if (Function *939 NewCallee939 = CS.getCalledFunction()) |
944 | 931 | if (931 !NewCallee->isDeclaration()931 ) |
945 | 68 | Calls.push_back({CS, NewHistoryID}); |
946 | 176 | } |
947 | 309 | |
948 | 309 | // Merge the attributes based on the inlining. |
949 | 309 | AttributeFuncs::mergeAttributesForInlining(F, Callee); |
950 | 309 | |
951 | 309 | // For local functions, check whether this makes the callee trivially |
952 | 309 | // dead. In that case, we can drop the body of the function eagerly |
953 | 309 | // which may reduce the number of callers of other functions to one, |
954 | 309 | // changing inline cost thresholds. |
955 | 309 | if (Callee.hasLocalLinkage()309 ) { |
956 | 133 | // To check this we also need to nuke any dead constant uses (perhaps |
957 | 133 | // made dead by this operation on other functions). |
958 | 133 | Callee.removeDeadConstantUsers(); |
959 | 133 | if (Callee.use_empty() && 133 !CG.isLibFunction(Callee)86 ) { |
960 | 85 | Calls.erase( |
961 | 85 | std::remove_if(Calls.begin() + i + 1, Calls.end(), |
962 | 25 | [&Callee](const std::pair<CallSite, int> &Call) { |
963 | 25 | return Call.first.getCaller() == &Callee; |
964 | 25 | }), |
965 | 85 | Calls.end()); |
966 | 85 | // Clear the body and queue the function itself for deletion when we |
967 | 85 | // finish inlining and call graph updates. |
968 | 85 | // Note that after this point, it is an error to do anything other |
969 | 85 | // than use the callee's address or delete it. |
970 | 85 | Callee.dropAllReferences(); |
971 | 85 | assert(find(DeadFunctions, &Callee) == DeadFunctions.end() && |
972 | 85 | "Cannot put cause a function to become dead twice!"); |
973 | 85 | DeadFunctions.push_back(&Callee); |
974 | 85 | } |
975 | 133 | } |
976 | 526 | } |
977 | 351 | |
978 | 351 | // Back the call index up by one to put us in a good position to go around |
979 | 351 | // the outer loop. |
980 | 351 | --i; |
981 | 351 | |
982 | 351 | if (!DidInline) |
983 | 102 | continue; |
984 | 249 | Changed = true; |
985 | 249 | |
986 | 249 | // Add all the inlined callees' edges as ref edges to the caller. These are |
987 | 249 | // by definition trivial edges as we always have *some* transitive ref edge |
988 | 249 | // chain. While in some cases these edges are direct calls inside the |
989 | 249 | // callee, they have to be modeled in the inliner as reference edges as |
990 | 249 | // there may be a reference edge anywhere along the chain from the current |
991 | 249 | // caller to the callee that causes the whole thing to appear like |
992 | 249 | // a (transitive) reference edge that will require promotion to a call edge |
993 | 249 | // below. |
994 | 287 | for (Function *InlinedCallee : InlinedCallees) { |
995 | 287 | LazyCallGraph::Node &CalleeN = *CG.lookup(*InlinedCallee); |
996 | 287 | for (LazyCallGraph::Edge &E : *CalleeN) |
997 | 73 | RC->insertTrivialRefEdge(N, E.getNode()); |
998 | 287 | } |
999 | 249 | |
1000 | 249 | // At this point, since we have made changes we have at least removed |
1001 | 249 | // a call instruction. However, in the process we do some incremental |
1002 | 249 | // simplification of the surrounding code. This simplification can |
1003 | 249 | // essentially do all of the same things as a function pass and we can |
1004 | 249 | // re-use the exact same logic for updating the call graph to reflect the |
1005 | 249 | // change. |
1006 | 249 | LazyCallGraph::SCC *OldC = C; |
1007 | 249 | C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR); |
1008 | 249 | DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n"); |
1009 | 249 | RC = &C->getOuterRefSCC(); |
1010 | 249 | |
1011 | 249 | // If this causes an SCC to split apart into multiple smaller SCCs, there |
1012 | 249 | // is a subtle risk we need to prepare for. Other transformations may |
1013 | 249 | // expose an "infinite inlining" opportunity later, and because of the SCC |
1014 | 249 | // mutation, we will revisit this function and potentially re-inline. If we |
1015 | 249 | // do, and that re-inlining also has the potentially to mutate the SCC |
1016 | 249 | // structure, the infinite inlining problem can manifest through infinite |
1017 | 249 | // SCC splits and merges. To avoid this, we capture the originating caller |
1018 | 249 | // node and the SCC containing the call edge. This is a slight over |
1019 | 249 | // approximation of the possible inlining decisions that must be avoided, |
1020 | 249 | // but is relatively efficient to store. |
1021 | 249 | // FIXME: This seems like a very heavyweight way of retaining the inline |
1022 | 249 | // history, we should look for a more efficient way of tracking it. |
1023 | 249 | if (C != OldC && 249 llvm::any_of(InlinedCallees, [&](Function *Callee) 16 { |
1024 | 17 | return CG.lookupSCC(*CG.lookup(*Callee)) == OldC; |
1025 | 249 | })) { |
1026 | 12 | DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, " |
1027 | 12 | "retaining this to avoid infinite inlining.\n"); |
1028 | 12 | UR.InlinedInternalEdges.insert({&N, OldC}); |
1029 | 12 | } |
1030 | 359 | InlinedCallees.clear(); |
1031 | 359 | } |
1032 | 300 | |
1033 | 300 | // Now that we've finished inlining all of the calls across this SCC, delete |
1034 | 300 | // all of the trivially dead functions, updating the call graph and the CGSCC |
1035 | 300 | // pass manager in the process. |
1036 | 300 | // |
1037 | 300 | // Note that this walks a pointer set which has non-deterministic order but |
1038 | 300 | // that is OK as all we do is delete things and add pointers to unordered |
1039 | 300 | // sets. |
1040 | 85 | for (Function *DeadF : DeadFunctions) { |
1041 | 85 | // Get the necessary information out of the call graph and nuke the |
1042 | 85 | // function there. Also, cclear out any cached analyses. |
1043 | 85 | auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF)); |
1044 | 85 | FunctionAnalysisManager &FAM = |
1045 | 85 | AM.getResult<FunctionAnalysisManagerCGSCCProxy>(DeadC, CG) |
1046 | 85 | .getManager(); |
1047 | 85 | FAM.clear(*DeadF, DeadF->getName()); |
1048 | 85 | AM.clear(DeadC, DeadC.getName()); |
1049 | 85 | auto &DeadRC = DeadC.getOuterRefSCC(); |
1050 | 85 | CG.removeDeadFunction(*DeadF); |
1051 | 85 | |
1052 | 85 | // Mark the relevant parts of the call graph as invalid so we don't visit |
1053 | 85 | // them. |
1054 | 85 | UR.InvalidatedSCCs.insert(&DeadC); |
1055 | 85 | UR.InvalidatedRefSCCs.insert(&DeadRC); |
1056 | 85 | |
1057 | 85 | // And delete the actual function from the module. |
1058 | 85 | M.getFunctionList().erase(DeadF); |
1059 | 85 | } |
1060 | 300 | |
1061 | 300 | if (!Changed) |
1062 | 69 | return PreservedAnalyses::all(); |
1063 | 231 | |
1064 | 231 | // Even if we change the IR, we update the core CGSCC data structures and so |
1065 | 231 | // can preserve the proxy to the function analysis manager. |
1066 | 231 | PreservedAnalyses PA; |
1067 | 231 | PA.preserve<FunctionAnalysisManagerCGSCCProxy>(); |
1068 | 231 | return PA; |
1069 | 231 | } |