/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/Scalar/LoopSink.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This pass does the inverse transformation of what LICM does. |
10 | | // It traverses all of the instructions in the loop's preheader and sinks |
11 | | // them to the loop body where frequency is lower than the loop's preheader. |
12 | | // This pass is a reverse-transformation of LICM. It differs from the Sink |
13 | | // pass in the following ways: |
14 | | // |
15 | | // * It only handles sinking of instructions from the loop's preheader to the |
16 | | // loop's body |
17 | | // * It uses alias set tracker to get more accurate alias info |
18 | | // * It uses block frequency info to find the optimal sinking locations |
19 | | // |
20 | | // Overall algorithm: |
21 | | // |
22 | | // For I in Preheader: |
23 | | // InsertBBs = BBs that uses I |
24 | | // For BB in sorted(LoopBBs): |
25 | | // DomBBs = BBs in InsertBBs that are dominated by BB |
26 | | // if freq(DomBBs) > freq(BB) |
27 | | // InsertBBs = UseBBs - DomBBs + BB |
28 | | // For BB in InsertBBs: |
29 | | // Insert I at BB's beginning |
30 | | // |
31 | | //===----------------------------------------------------------------------===// |
32 | | |
33 | | #include "llvm/Transforms/Scalar/LoopSink.h" |
34 | | #include "llvm/ADT/Statistic.h" |
35 | | #include "llvm/Analysis/AliasAnalysis.h" |
36 | | #include "llvm/Analysis/AliasSetTracker.h" |
37 | | #include "llvm/Analysis/BasicAliasAnalysis.h" |
38 | | #include "llvm/Analysis/BlockFrequencyInfo.h" |
39 | | #include "llvm/Analysis/Loads.h" |
40 | | #include "llvm/Analysis/LoopInfo.h" |
41 | | #include "llvm/Analysis/LoopPass.h" |
42 | | #include "llvm/Analysis/ScalarEvolution.h" |
43 | | #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" |
44 | | #include "llvm/Transforms/Utils/Local.h" |
45 | | #include "llvm/IR/Dominators.h" |
46 | | #include "llvm/IR/Instructions.h" |
47 | | #include "llvm/IR/LLVMContext.h" |
48 | | #include "llvm/IR/Metadata.h" |
49 | | #include "llvm/Support/CommandLine.h" |
50 | | #include "llvm/Transforms/Scalar.h" |
51 | | #include "llvm/Transforms/Scalar/LoopPassManager.h" |
52 | | #include "llvm/Transforms/Utils/LoopUtils.h" |
53 | | using namespace llvm; |
54 | | |
55 | | #define DEBUG_TYPE "loopsink" |
56 | | |
57 | | STATISTIC(NumLoopSunk, "Number of instructions sunk into loop"); |
58 | | STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop"); |
59 | | |
60 | | static cl::opt<unsigned> SinkFrequencyPercentThreshold( |
61 | | "sink-freq-percent-threshold", cl::Hidden, cl::init(90), |
62 | | cl::desc("Do not sink instructions that require cloning unless they " |
63 | | "execute less than this percent of the time.")); |
64 | | |
65 | | static cl::opt<unsigned> MaxNumberOfUseBBsForSinking( |
66 | | "max-uses-for-sinking", cl::Hidden, cl::init(30), |
67 | | cl::desc("Do not sink instructions that have too many uses.")); |
68 | | |
69 | | /// Return adjusted total frequency of \p BBs. |
70 | | /// |
71 | | /// * If there is only one BB, sinking instruction will not introduce code |
72 | | /// size increase. Thus there is no need to adjust the frequency. |
73 | | /// * If there are more than one BB, sinking would lead to code size increase. |
74 | | /// In this case, we add some "tax" to the total frequency to make it harder |
75 | | /// to sink. E.g. |
76 | | /// Freq(Preheader) = 100 |
77 | | /// Freq(BBs) = sum(50, 49) = 99 |
78 | | /// Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheade to |
79 | | /// BBs as the difference is too small to justify the code size increase. |
80 | | /// To model this, The adjusted Freq(BBs) will be: |
81 | | /// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold% |
82 | | static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs, |
83 | 40 | BlockFrequencyInfo &BFI) { |
84 | 40 | BlockFrequency T = 0; |
85 | 40 | for (BasicBlock *B : BBs) |
86 | 49 | T += BFI.getBlockFreq(B); |
87 | 40 | if (BBs.size() > 1) |
88 | 10 | T /= BranchProbability(SinkFrequencyPercentThreshold, 100); |
89 | 40 | return T; |
90 | 40 | } |
91 | | |
92 | | /// Return a set of basic blocks to insert sinked instructions. |
93 | | /// |
94 | | /// The returned set of basic blocks (BBsToSinkInto) should satisfy: |
95 | | /// |
96 | | /// * Inside the loop \p L |
97 | | /// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto |
98 | | /// that domintates the UseBB |
99 | | /// * Has minimum total frequency that is no greater than preheader frequency |
100 | | /// |
101 | | /// The purpose of the function is to find the optimal sinking points to |
102 | | /// minimize execution cost, which is defined as "sum of frequency of |
103 | | /// BBsToSinkInto". |
104 | | /// As a result, the returned BBsToSinkInto needs to have minimum total |
105 | | /// frequency. |
106 | | /// Additionally, if the total frequency of BBsToSinkInto exceeds preheader |
107 | | /// frequency, the optimal solution is not sinking (return empty set). |
108 | | /// |
109 | | /// \p ColdLoopBBs is used to help find the optimal sinking locations. |
110 | | /// It stores a list of BBs that is: |
111 | | /// |
112 | | /// * Inside the loop \p L |
113 | | /// * Has a frequency no larger than the loop's preheader |
114 | | /// * Sorted by BB frequency |
115 | | /// |
116 | | /// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()). |
117 | | /// To avoid expensive computation, we cap the maximum UseBBs.size() in its |
118 | | /// caller. |
119 | | static SmallPtrSet<BasicBlock *, 2> |
120 | | findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs, |
121 | | const SmallVectorImpl<BasicBlock *> &ColdLoopBBs, |
122 | 17 | DominatorTree &DT, BlockFrequencyInfo &BFI) { |
123 | 17 | SmallPtrSet<BasicBlock *, 2> BBsToSinkInto; |
124 | 17 | if (UseBBs.size() == 0) |
125 | 0 | return BBsToSinkInto; |
126 | 17 | |
127 | 17 | BBsToSinkInto.insert(UseBBs.begin(), UseBBs.end()); |
128 | 17 | SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB; |
129 | 17 | |
130 | 17 | // For every iteration: |
131 | 17 | // * Pick the ColdestBB from ColdLoopBBs |
132 | 17 | // * Find the set BBsDominatedByColdestBB that satisfy: |
133 | 17 | // - BBsDominatedByColdestBB is a subset of BBsToSinkInto |
134 | 17 | // - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB |
135 | 17 | // * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove |
136 | 17 | // BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to |
137 | 17 | // BBsToSinkInto |
138 | 35 | for (BasicBlock *ColdestBB : ColdLoopBBs) { |
139 | 35 | BBsDominatedByColdestBB.clear(); |
140 | 35 | for (BasicBlock *SinkedBB : BBsToSinkInto) |
141 | 58 | if (DT.dominates(ColdestBB, SinkedBB)) |
142 | 28 | BBsDominatedByColdestBB.insert(SinkedBB); |
143 | 35 | if (BBsDominatedByColdestBB.size() == 0) |
144 | 12 | continue; |
145 | 23 | if (adjustedSumFreq(BBsDominatedByColdestBB, BFI) > |
146 | 23 | BFI.getBlockFreq(ColdestBB)) { |
147 | 10 | for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) { |
148 | 10 | BBsToSinkInto.erase(DominatedBB); |
149 | 10 | } |
150 | 5 | BBsToSinkInto.insert(ColdestBB); |
151 | 5 | } |
152 | 23 | } |
153 | 17 | |
154 | 17 | // Can't sink into blocks that have no valid insertion point. |
155 | 22 | for (BasicBlock *BB : BBsToSinkInto) { |
156 | 22 | if (BB->getFirstInsertionPt() == BB->end()) { |
157 | 1 | BBsToSinkInto.clear(); |
158 | 1 | break; |
159 | 1 | } |
160 | 22 | } |
161 | 17 | |
162 | 17 | // If the total frequency of BBsToSinkInto is larger than preheader frequency, |
163 | 17 | // do not sink. |
164 | 17 | if (adjustedSumFreq(BBsToSinkInto, BFI) > |
165 | 17 | BFI.getBlockFreq(L.getLoopPreheader())) |
166 | 3 | BBsToSinkInto.clear(); |
167 | 17 | return BBsToSinkInto; |
168 | 17 | } |
169 | | |
170 | | // Sinks \p I from the loop \p L's preheader to its uses. Returns true if |
171 | | // sinking is successful. |
172 | | // \p LoopBlockNumber is used to sort the insertion blocks to ensure |
173 | | // determinism. |
174 | | static bool sinkInstruction(Loop &L, Instruction &I, |
175 | | const SmallVectorImpl<BasicBlock *> &ColdLoopBBs, |
176 | | const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber, |
177 | | LoopInfo &LI, DominatorTree &DT, |
178 | 17 | BlockFrequencyInfo &BFI) { |
179 | 17 | // Compute the set of blocks in loop L which contain a use of I. |
180 | 17 | SmallPtrSet<BasicBlock *, 2> BBs; |
181 | 30 | for (auto &U : I.uses()) { |
182 | 30 | Instruction *UI = cast<Instruction>(U.getUser()); |
183 | 30 | // We cannot sink I to PHI-uses. |
184 | 30 | if (dyn_cast<PHINode>(UI)) |
185 | 0 | return false; |
186 | 30 | // We cannot sink I if it has uses outside of the loop. |
187 | 30 | if (!L.contains(LI.getLoopFor(UI->getParent()))) |
188 | 0 | return false; |
189 | 30 | BBs.insert(UI->getParent()); |
190 | 30 | } |
191 | 17 | |
192 | 17 | // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max |
193 | 17 | // BBs.size() to avoid expensive computation. |
194 | 17 | // FIXME: Handle code size growth for min_size and opt_size. |
195 | 17 | if (BBs.size() > MaxNumberOfUseBBsForSinking) |
196 | 0 | return false; |
197 | 17 | |
198 | 17 | // Find the set of BBs that we should insert a copy of I. |
199 | 17 | SmallPtrSet<BasicBlock *, 2> BBsToSinkInto = |
200 | 17 | findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI); |
201 | 17 | if (BBsToSinkInto.empty()) |
202 | 4 | return false; |
203 | 13 | |
204 | 13 | // Return if any of the candidate blocks to sink into is non-cold. |
205 | 13 | if (BBsToSinkInto.size() > 1) { |
206 | 3 | for (auto *BB : BBsToSinkInto) |
207 | 5 | if (!LoopBlockNumber.count(BB)) |
208 | 1 | return false; |
209 | 3 | } |
210 | 13 | |
211 | 13 | // Copy the final BBs into a vector and sort them using the total ordering |
212 | 13 | // of the loop block numbers as iterating the set doesn't give a useful |
213 | 13 | // order. No need to stable sort as the block numbers are a total ordering. |
214 | 13 | SmallVector<BasicBlock *, 2> SortedBBsToSinkInto; |
215 | 12 | SortedBBsToSinkInto.insert(SortedBBsToSinkInto.begin(), BBsToSinkInto.begin(), |
216 | 12 | BBsToSinkInto.end()); |
217 | 12 | llvm::sort(SortedBBsToSinkInto, [&](BasicBlock *A, BasicBlock *B) { |
218 | 2 | return LoopBlockNumber.find(A)->second < LoopBlockNumber.find(B)->second; |
219 | 2 | }); |
220 | 12 | |
221 | 12 | BasicBlock *MoveBB = *SortedBBsToSinkInto.begin(); |
222 | 12 | // FIXME: Optimize the efficiency for cloned value replacement. The current |
223 | 12 | // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()). |
224 | 12 | for (BasicBlock *N : makeArrayRef(SortedBBsToSinkInto).drop_front(1)) { |
225 | 2 | assert(LoopBlockNumber.find(N)->second > |
226 | 2 | LoopBlockNumber.find(MoveBB)->second && |
227 | 2 | "BBs not sorted!"); |
228 | 2 | // Clone I and replace its uses. |
229 | 2 | Instruction *IC = I.clone(); |
230 | 2 | IC->setName(I.getName()); |
231 | 2 | IC->insertBefore(&*N->getFirstInsertionPt()); |
232 | 2 | // Replaces uses of I with IC in N |
233 | 6 | for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;) { |
234 | 4 | Use &U = *UI++; |
235 | 4 | auto *I = cast<Instruction>(U.getUser()); |
236 | 4 | if (I->getParent() == N) |
237 | 2 | U.set(IC); |
238 | 4 | } |
239 | 2 | // Replaces uses of I with IC in blocks dominated by N |
240 | 2 | replaceDominatedUsesWith(&I, IC, DT, N); |
241 | 2 | LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName() |
242 | 2 | << '\n'); |
243 | 2 | NumLoopSunkCloned++; |
244 | 2 | } |
245 | 12 | LLVM_DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n'); |
246 | 12 | NumLoopSunk++; |
247 | 12 | I.moveBefore(&*MoveBB->getFirstInsertionPt()); |
248 | 12 | |
249 | 12 | return true; |
250 | 13 | } |
251 | | |
252 | | /// Sinks instructions from loop's preheader to the loop body if the |
253 | | /// sum frequency of inserted copy is smaller than preheader's frequency. |
254 | | static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, |
255 | | DominatorTree &DT, |
256 | | BlockFrequencyInfo &BFI, |
257 | 184k | ScalarEvolution *SE) { |
258 | 184k | BasicBlock *Preheader = L.getLoopPreheader(); |
259 | 184k | if (!Preheader) |
260 | 0 | return false; |
261 | 184k | |
262 | 184k | // Enable LoopSink only when runtime profile is available. |
263 | 184k | // With static profile, the sinking decision may be sub-optimal. |
264 | 184k | if (!Preheader->getParent()->hasProfileData()) |
265 | 184k | return false; |
266 | 31 | |
267 | 31 | const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader); |
268 | 31 | // If there are no basic blocks with lower frequency than the preheader then |
269 | 31 | // we can avoid the detailed analysis as we will never find profitable sinking |
270 | 31 | // opportunities. |
271 | 84 | if (all_of(L.blocks(), [&](const BasicBlock *BB) 31 { |
272 | 84 | return BFI.getBlockFreq(BB) > PreheaderFreq; |
273 | 84 | })) |
274 | 9 | return false; |
275 | 22 | |
276 | 22 | bool Changed = false; |
277 | 22 | AliasSetTracker CurAST(AA); |
278 | 22 | |
279 | 22 | // Compute alias set. |
280 | 22 | for (BasicBlock *BB : L.blocks()) |
281 | 119 | CurAST.add(*BB); |
282 | 22 | CurAST.add(*Preheader); |
283 | 22 | |
284 | 22 | // Sort loop's basic blocks by frequency |
285 | 22 | SmallVector<BasicBlock *, 10> ColdLoopBBs; |
286 | 22 | SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber; |
287 | 22 | int i = 0; |
288 | 22 | for (BasicBlock *B : L.blocks()) |
289 | 119 | if (BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())) { |
290 | 46 | ColdLoopBBs.push_back(B); |
291 | 46 | LoopBlockNumber[B] = ++i; |
292 | 46 | } |
293 | 34 | llvm::stable_sort(ColdLoopBBs, [&](BasicBlock *A, BasicBlock *B) { |
294 | 34 | return BFI.getBlockFreq(A) < BFI.getBlockFreq(B); |
295 | 34 | }); |
296 | 22 | |
297 | 22 | // Traverse preheader's instructions in reverse order becaue if A depends |
298 | 22 | // on B (A appears after B), A needs to be sinked first before B can be |
299 | 22 | // sinked. |
300 | 68 | for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) { |
301 | 46 | Instruction *I = &*II++; |
302 | 46 | // No need to check for instruction's operands are loop invariant. |
303 | 46 | assert(L.hasLoopInvariantOperands(I) && |
304 | 46 | "Insts in a loop's preheader should have loop invariant operands!"); |
305 | 46 | if (!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr, false)) |
306 | 29 | continue; |
307 | 17 | if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI)) |
308 | 12 | Changed = true; |
309 | 17 | } |
310 | 22 | |
311 | 22 | if (Changed && SE12 ) |
312 | 6 | SE->forgetLoopDispositions(&L); |
313 | 22 | return Changed; |
314 | 22 | } |
315 | | |
316 | 869 | PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) { |
317 | 869 | LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); |
318 | 869 | // Nothing to do if there are no loops. |
319 | 869 | if (LI.empty()) |
320 | 820 | return PreservedAnalyses::all(); |
321 | 49 | |
322 | 49 | AAResults &AA = FAM.getResult<AAManager>(F); |
323 | 49 | DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); |
324 | 49 | BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); |
325 | 49 | |
326 | 49 | // We want to do a postorder walk over the loops. Since loops are a tree this |
327 | 49 | // is equivalent to a reversed preorder walk and preorder is easy to compute |
328 | 49 | // without recursion. Since we reverse the preorder, we will visit siblings |
329 | 49 | // in reverse program order. This isn't expected to matter at all but is more |
330 | 49 | // consistent with sinking algorithms which generally work bottom-up. |
331 | 49 | SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder(); |
332 | 49 | |
333 | 49 | bool Changed = false; |
334 | 55 | do { |
335 | 55 | Loop &L = *PreorderLoops.pop_back_val(); |
336 | 55 | |
337 | 55 | // Note that we don't pass SCEV here because it is only used to invalidate |
338 | 55 | // loops in SCEV and we don't preserve (or request) SCEV at all making that |
339 | 55 | // unnecessary. |
340 | 55 | Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI, |
341 | 55 | /*ScalarEvolution*/ nullptr); |
342 | 55 | } while (!PreorderLoops.empty()); |
343 | 49 | |
344 | 49 | if (!Changed) |
345 | 43 | return PreservedAnalyses::all(); |
346 | 6 | |
347 | 6 | PreservedAnalyses PA; |
348 | 6 | PA.preserveSet<CFGAnalyses>(); |
349 | 6 | return PA; |
350 | 6 | } |
351 | | |
352 | | namespace { |
353 | | struct LegacyLoopSinkPass : public LoopPass { |
354 | | static char ID; |
355 | 12.9k | LegacyLoopSinkPass() : LoopPass(ID) { |
356 | 12.9k | initializeLegacyLoopSinkPassPass(*PassRegistry::getPassRegistry()); |
357 | 12.9k | } |
358 | | |
359 | 184k | bool runOnLoop(Loop *L, LPPassManager &LPM) override { |
360 | 184k | if (skipLoop(L)) |
361 | 16 | return false; |
362 | 184k | |
363 | 184k | auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); |
364 | 184k | return sinkLoopInvariantInstructions( |
365 | 184k | *L, getAnalysis<AAResultsWrapperPass>().getAAResults(), |
366 | 184k | getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), |
367 | 184k | getAnalysis<DominatorTreeWrapperPass>().getDomTree(), |
368 | 184k | getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(), |
369 | 184k | SE ? &SE->getSE() : nullptr0 ); |
370 | 184k | } |
371 | | |
372 | 12.9k | void getAnalysisUsage(AnalysisUsage &AU) const override { |
373 | 12.9k | AU.setPreservesCFG(); |
374 | 12.9k | AU.addRequired<BlockFrequencyInfoWrapperPass>(); |
375 | 12.9k | getLoopAnalysisUsage(AU); |
376 | 12.9k | } |
377 | | }; |
378 | | } |
379 | | |
380 | | char LegacyLoopSinkPass::ID = 0; |
381 | 48.6k | INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, |
382 | 48.6k | false) |
383 | 48.6k | INITIALIZE_PASS_DEPENDENCY(LoopPass) |
384 | 48.6k | INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) |
385 | 48.6k | INITIALIZE_PASS_END(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, false) |
386 | | |
387 | 12.9k | Pass *llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); } |