/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Transforms/Scalar/LoopRotation.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // This file implements Loop Rotation Pass. |
11 | | // |
12 | | //===----------------------------------------------------------------------===// |
13 | | |
14 | | #include "llvm/Transforms/Scalar/LoopRotation.h" |
15 | | #include "llvm/ADT/Statistic.h" |
16 | | #include "llvm/Analysis/AliasAnalysis.h" |
17 | | #include "llvm/Analysis/AssumptionCache.h" |
18 | | #include "llvm/Analysis/BasicAliasAnalysis.h" |
19 | | #include "llvm/Analysis/CodeMetrics.h" |
20 | | #include "llvm/Analysis/GlobalsModRef.h" |
21 | | #include "llvm/Analysis/InstructionSimplify.h" |
22 | | #include "llvm/Analysis/LoopPass.h" |
23 | | #include "llvm/Analysis/ScalarEvolution.h" |
24 | | #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" |
25 | | #include "llvm/Analysis/TargetTransformInfo.h" |
26 | | #include "llvm/Analysis/ValueTracking.h" |
27 | | #include "llvm/IR/CFG.h" |
28 | | #include "llvm/IR/Dominators.h" |
29 | | #include "llvm/IR/Function.h" |
30 | | #include "llvm/IR/IntrinsicInst.h" |
31 | | #include "llvm/IR/Module.h" |
32 | | #include "llvm/Support/CommandLine.h" |
33 | | #include "llvm/Support/Debug.h" |
34 | | #include "llvm/Support/raw_ostream.h" |
35 | | #include "llvm/Transforms/Scalar.h" |
36 | | #include "llvm/Transforms/Scalar/LoopPassManager.h" |
37 | | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
38 | | #include "llvm/Transforms/Utils/Local.h" |
39 | | #include "llvm/Transforms/Utils/LoopUtils.h" |
40 | | #include "llvm/Transforms/Utils/SSAUpdater.h" |
41 | | #include "llvm/Transforms/Utils/ValueMapper.h" |
42 | | using namespace llvm; |
43 | | |
44 | | #define DEBUG_TYPE "loop-rotate" |
45 | | |
46 | | static cl::opt<unsigned> DefaultRotationThreshold( |
47 | | "rotation-max-header-size", cl::init(16), cl::Hidden, |
48 | | cl::desc("The default maximum header size for automatic loop rotation")); |
49 | | |
50 | | STATISTIC(NumRotated, "Number of loops rotated"); |
51 | | |
52 | | namespace { |
53 | | /// A simple loop rotation transformation. |
54 | | class LoopRotate { |
55 | | const unsigned MaxHeaderSize; |
56 | | LoopInfo *LI; |
57 | | const TargetTransformInfo *TTI; |
58 | | AssumptionCache *AC; |
59 | | DominatorTree *DT; |
60 | | ScalarEvolution *SE; |
61 | | const SimplifyQuery &SQ; |
62 | | |
63 | | public: |
64 | | LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI, |
65 | | const TargetTransformInfo *TTI, AssumptionCache *AC, |
66 | | DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ) |
67 | | : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), |
68 | 700k | SQ(SQ) {} |
69 | | bool processLoop(Loop *L); |
70 | | |
71 | | private: |
72 | | bool rotateLoop(Loop *L, bool SimplifiedLatch); |
73 | | bool simplifyLoopLatch(Loop *L); |
74 | | }; |
75 | | } // end anonymous namespace |
76 | | |
77 | | /// RewriteUsesOfClonedInstructions - We just cloned the instructions from the |
78 | | /// old header into the preheader. If there were uses of the values produced by |
79 | | /// these instruction that were outside of the loop, we have to insert PHI nodes |
80 | | /// to merge the two values. Do this now. |
81 | | static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, |
82 | | BasicBlock *OrigPreheader, |
83 | | ValueToValueMapTy &ValueMap, |
84 | 51.1k | SmallVectorImpl<PHINode*> *InsertedPHIs) { |
85 | 51.1k | // Remove PHI node entries that are no longer live. |
86 | 51.1k | BasicBlock::iterator I, E = OrigHeader->end(); |
87 | 132k | for (I = OrigHeader->begin(); PHINode *PN132k = dyn_cast<PHINode>(I); ++I81.0k ) |
88 | 81.0k | PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); |
89 | 51.1k | |
90 | 51.1k | // Now fix up users of the instructions in OrigHeader, inserting PHI nodes |
91 | 51.1k | // as necessary. |
92 | 51.1k | SSAUpdater SSA(InsertedPHIs); |
93 | 274k | for (I = OrigHeader->begin(); I != E274k ; ++I223k ) { |
94 | 223k | Value *OrigHeaderVal = &*I; |
95 | 223k | |
96 | 223k | // If there are no uses of the value (e.g. because it returns void), there |
97 | 223k | // is nothing to rewrite. |
98 | 223k | if (OrigHeaderVal->use_empty()) |
99 | 53.0k | continue; |
100 | 170k | |
101 | 170k | Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal); |
102 | 170k | |
103 | 170k | // The value now exits in two versions: the initial value in the preheader |
104 | 170k | // and the loop "next" value in the original header. |
105 | 170k | SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName()); |
106 | 170k | SSA.AddAvailableValue(OrigHeader, OrigHeaderVal); |
107 | 170k | SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal); |
108 | 170k | |
109 | 170k | // Visit each use of the OrigHeader instruction. |
110 | 170k | for (Value::use_iterator UI = OrigHeaderVal->use_begin(), |
111 | 170k | UE = OrigHeaderVal->use_end(); |
112 | 599k | UI != UE599k ;) { |
113 | 428k | // Grab the use before incrementing the iterator. |
114 | 428k | Use &U = *UI; |
115 | 428k | |
116 | 428k | // Increment the iterator before removing the use from the list. |
117 | 428k | ++UI; |
118 | 428k | |
119 | 428k | // SSAUpdater can't handle a non-PHI use in the same block as an |
120 | 428k | // earlier def. We can easily handle those cases manually. |
121 | 428k | Instruction *UserInst = cast<Instruction>(U.getUser()); |
122 | 428k | if (!isa<PHINode>(UserInst)428k ) { |
123 | 315k | BasicBlock *UserBB = UserInst->getParent(); |
124 | 315k | |
125 | 315k | // The original users in the OrigHeader are already using the |
126 | 315k | // original definitions. |
127 | 315k | if (UserBB == OrigHeader) |
128 | 142k | continue; |
129 | 172k | |
130 | 172k | // Users in the OrigPreHeader need to use the value to which the |
131 | 172k | // original definitions are mapped. |
132 | 172k | if (172k UserBB == OrigPreheader172k ) { |
133 | 0 | U = OrigPreHeaderVal; |
134 | 0 | continue; |
135 | 0 | } |
136 | 286k | } |
137 | 286k | |
138 | 286k | // Anything else can be handled by SSAUpdater. |
139 | 286k | SSA.RewriteUse(U); |
140 | 286k | } |
141 | 170k | |
142 | 170k | // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug |
143 | 170k | // intrinsics. |
144 | 170k | LLVMContext &C = OrigHeader->getContext(); |
145 | 170k | if (auto *VAM170k = ValueAsMetadata::getIfExists(OrigHeaderVal)) { |
146 | 9 | if (auto *MAV9 = MetadataAsValue::getIfExists(C, VAM)) { |
147 | 21 | for (auto UI = MAV->use_begin(), E = MAV->use_end(); UI != E21 ;) { |
148 | 12 | // Grab the use before incrementing the iterator. Otherwise, altering |
149 | 12 | // the Use will invalidate the iterator. |
150 | 12 | Use &U = *UI++; |
151 | 12 | DbgInfoIntrinsic *UserInst = dyn_cast<DbgInfoIntrinsic>(U.getUser()); |
152 | 12 | if (!UserInst) |
153 | 0 | continue; |
154 | 12 | |
155 | 12 | // The original users in the OrigHeader are already using the original |
156 | 12 | // definitions. |
157 | 12 | BasicBlock *UserBB = UserInst->getParent(); |
158 | 12 | if (UserBB == OrigHeader) |
159 | 6 | continue; |
160 | 6 | |
161 | 6 | // Users in the OrigPreHeader need to use the value to which the |
162 | 6 | // original definitions are mapped and anything else can be handled by |
163 | 6 | // the SSAUpdater. To avoid adding PHINodes, check if the value is |
164 | 6 | // available in UserBB, if not substitute undef. |
165 | 6 | Value *NewVal; |
166 | 6 | if (UserBB == OrigPreheader) |
167 | 0 | NewVal = OrigPreHeaderVal; |
168 | 6 | else if (6 SSA.HasValueForBlock(UserBB)6 ) |
169 | 3 | NewVal = SSA.GetValueInMiddleOfBlock(UserBB); |
170 | 6 | else |
171 | 3 | NewVal = UndefValue::get(OrigHeaderVal->getType()); |
172 | 12 | U = MetadataAsValue::get(C, ValueAsMetadata::get(NewVal)); |
173 | 12 | } |
174 | 9 | } |
175 | 9 | } |
176 | 223k | } |
177 | 51.1k | } |
178 | | |
179 | | /// Propagate dbg.value intrinsics through the newly inserted Phis. |
180 | | static void insertDebugValues(BasicBlock *OrigHeader, |
181 | 50.3k | SmallVectorImpl<PHINode*> &InsertedPHIs) { |
182 | 50.3k | ValueToValueMapTy DbgValueMap; |
183 | 50.3k | |
184 | 50.3k | // Map existing PHI nodes to their dbg.values. |
185 | 220k | for (auto &I : *OrigHeader) { |
186 | 220k | if (auto DbgII220k = dyn_cast<DbgInfoIntrinsic>(&I)) { |
187 | 6 | if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation())) |
188 | 6 | DbgValueMap.insert({Loc, DbgII}); |
189 | 6 | } |
190 | 220k | } |
191 | 50.3k | |
192 | 50.3k | // Then iterate through the new PHIs and look to see if they use one of the |
193 | 50.3k | // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will |
194 | 50.3k | // propagate the info through the new PHI. |
195 | 50.3k | LLVMContext &C = OrigHeader->getContext(); |
196 | 89.3k | for (auto PHI : InsertedPHIs) { |
197 | 181k | for (auto VI : PHI->operand_values()) { |
198 | 181k | auto V = DbgValueMap.find(VI); |
199 | 181k | if (V != DbgValueMap.end()181k ) { |
200 | 6 | auto *DbgII = cast<DbgInfoIntrinsic>(V->second); |
201 | 6 | Instruction *NewDbgII = DbgII->clone(); |
202 | 6 | auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI)); |
203 | 6 | NewDbgII->setOperand(0, PhiMAV); |
204 | 6 | BasicBlock *Parent = PHI->getParent(); |
205 | 6 | NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime()); |
206 | 6 | } |
207 | 181k | } |
208 | 89.3k | } |
209 | 50.3k | } |
210 | | |
211 | | /// Rotate loop LP. Return true if the loop is rotated. |
212 | | /// |
213 | | /// \param SimplifiedLatch is true if the latch was just folded into the final |
214 | | /// loop exit. In this case we may want to rotate even though the new latch is |
215 | | /// now an exiting branch. This rotation would have happened had the latch not |
216 | | /// been simplified. However, if SimplifiedLatch is false, then we avoid |
217 | | /// rotating loops in which the latch exits to avoid excessive or endless |
218 | | /// rotation. LoopRotate should be repeatable and converge to a canonical |
219 | | /// form. This property is satisfied because simplifying the loop latch can only |
220 | | /// happen once across multiple invocations of the LoopRotate pass. |
221 | 700k | bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { |
222 | 700k | // If the loop has only one block then there is not much to rotate. |
223 | 700k | if (L->getBlocks().size() == 1) |
224 | 348k | return false; |
225 | 351k | |
226 | 351k | BasicBlock *OrigHeader = L->getHeader(); |
227 | 351k | BasicBlock *OrigLatch = L->getLoopLatch(); |
228 | 351k | |
229 | 351k | BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); |
230 | 351k | if (!BI || 351k BI->isUnconditional()346k ) |
231 | 56.5k | return false; |
232 | 295k | |
233 | 295k | // If the loop header is not one of the loop exiting blocks then |
234 | 295k | // either this loop is already rotated or it is not |
235 | 295k | // suitable for loop rotation transformations. |
236 | 295k | if (295k !L->isLoopExiting(OrigHeader)295k ) |
237 | 160k | return false; |
238 | 135k | |
239 | 135k | // If the loop latch already contains a branch that leaves the loop then the |
240 | 135k | // loop is already rotated. |
241 | 135k | if (135k !OrigLatch135k ) |
242 | 0 | return false; |
243 | 135k | |
244 | 135k | // Rotate if either the loop latch does *not* exit the loop, or if the loop |
245 | 135k | // latch was just simplified. |
246 | 135k | if (135k L->isLoopExiting(OrigLatch) && 135k !SimplifiedLatch86.1k ) |
247 | 83.5k | return false; |
248 | 51.6k | |
249 | 51.6k | // Check size of original header and reject loop if it is very big or we can't |
250 | 51.6k | // duplicate blocks inside it. |
251 | 51.6k | { |
252 | 51.6k | SmallPtrSet<const Value *, 32> EphValues; |
253 | 51.6k | CodeMetrics::collectEphemeralValues(L, AC, EphValues); |
254 | 51.6k | |
255 | 51.6k | CodeMetrics Metrics; |
256 | 51.6k | Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); |
257 | 51.6k | if (Metrics.notDuplicatable51.6k ) { |
258 | 2 | DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" |
259 | 2 | << " instructions: "; |
260 | 2 | L->dump()); |
261 | 2 | return false; |
262 | 2 | } |
263 | 51.6k | if (51.6k Metrics.convergent51.6k ) { |
264 | 1 | DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " |
265 | 1 | "instructions: "; |
266 | 1 | L->dump()); |
267 | 1 | return false; |
268 | 1 | } |
269 | 51.6k | if (51.6k Metrics.NumInsts > MaxHeaderSize51.6k ) |
270 | 498 | return false; |
271 | 51.1k | } |
272 | 51.1k | |
273 | 51.1k | // Now, this loop is suitable for rotation. |
274 | 51.1k | BasicBlock *OrigPreheader = L->getLoopPreheader(); |
275 | 51.1k | |
276 | 51.1k | // If the loop could not be converted to canonical form, it must have an |
277 | 51.1k | // indirectbr in it, just give up. |
278 | 51.1k | if (!OrigPreheader) |
279 | 0 | return false; |
280 | 51.1k | |
281 | 51.1k | // Anything ScalarEvolution may know about this loop or the PHI nodes |
282 | 51.1k | // in its header will soon be invalidated. |
283 | 51.1k | if (51.1k SE51.1k ) |
284 | 51.1k | SE->forgetLoop(L); |
285 | 51.1k | |
286 | 51.1k | DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); |
287 | 51.1k | |
288 | 51.1k | // Find new Loop header. NewHeader is a Header's one and only successor |
289 | 51.1k | // that is inside loop. Header's other successor is outside the |
290 | 51.1k | // loop. Otherwise loop is not suitable for rotation. |
291 | 51.1k | BasicBlock *Exit = BI->getSuccessor(0); |
292 | 51.1k | BasicBlock *NewHeader = BI->getSuccessor(1); |
293 | 51.1k | if (L->contains(Exit)) |
294 | 36.8k | std::swap(Exit, NewHeader); |
295 | 51.1k | assert(NewHeader && "Unable to determine new loop header"); |
296 | 51.1k | assert(L->contains(NewHeader) && !L->contains(Exit) && |
297 | 51.1k | "Unable to determine loop header and exit blocks"); |
298 | 51.1k | |
299 | 51.1k | // This code assumes that the new header has exactly one predecessor. |
300 | 51.1k | // Remove any single-entry PHI nodes in it. |
301 | 51.1k | assert(NewHeader->getSinglePredecessor() && |
302 | 51.1k | "New header doesn't have one pred!"); |
303 | 51.1k | FoldSingleEntryPHINodes(NewHeader); |
304 | 51.1k | |
305 | 51.1k | // Begin by walking OrigHeader and populating ValueMap with an entry for |
306 | 51.1k | // each Instruction. |
307 | 51.1k | BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); |
308 | 51.1k | ValueToValueMapTy ValueMap; |
309 | 51.1k | |
310 | 51.1k | // For PHI nodes, the value available in OldPreHeader is just the |
311 | 51.1k | // incoming value from OldPreHeader. |
312 | 132k | for (; PHINode *PN132k = dyn_cast<PHINode>(I); ++I81.0k ) |
313 | 81.0k | ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); |
314 | 51.1k | |
315 | 51.1k | // For the rest of the instructions, either hoist to the OrigPreheader if |
316 | 51.1k | // possible or create a clone in the OldPreHeader if not. |
317 | 51.1k | TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); |
318 | 201k | while (I != E201k ) { |
319 | 150k | Instruction *Inst = &*I++; |
320 | 150k | |
321 | 150k | // If the instruction's operands are invariant and it doesn't read or write |
322 | 150k | // memory, then it is safe to hoist. Doing this doesn't change the order of |
323 | 150k | // execution in the preheader, but does prevent the instruction from |
324 | 150k | // executing in each iteration of the loop. This means it is safe to hoist |
325 | 150k | // something that might trap, but isn't safe to hoist something that reads |
326 | 150k | // memory (without proving that the loop doesn't write). |
327 | 150k | if (L->hasLoopInvariantOperands(Inst) && 150k !Inst->mayReadFromMemory()22.2k && |
328 | 150k | !Inst->mayWriteToMemory()8.22k && !isa<TerminatorInst>(Inst)8.16k && |
329 | 150k | !isa<DbgInfoIntrinsic>(Inst)8.06k && !isa<AllocaInst>(Inst)8.06k ) { |
330 | 8.05k | Inst->moveBefore(LoopEntryBranch); |
331 | 8.05k | continue; |
332 | 8.05k | } |
333 | 142k | |
334 | 142k | // Otherwise, create a duplicate of the instruction. |
335 | 142k | Instruction *C = Inst->clone(); |
336 | 142k | |
337 | 142k | // Eagerly remap the operands of the instruction. |
338 | 142k | RemapInstruction(C, ValueMap, |
339 | 142k | RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); |
340 | 142k | |
341 | 142k | // With the operands remapped, see if the instruction constant folds or is |
342 | 142k | // otherwise simplifyable. This commonly occurs because the entry from PHI |
343 | 142k | // nodes allows icmps and other instructions to fold. |
344 | 142k | Value *V = SimplifyInstruction(C, SQ); |
345 | 142k | if (V && 142k LI->replacementPreservesLCSSAForm(C, V)17.0k ) { |
346 | 16.6k | // If so, then delete the temporary instruction and stick the folded value |
347 | 16.6k | // in the map. |
348 | 16.6k | ValueMap[Inst] = V; |
349 | 16.6k | if (!C->mayHaveSideEffects()16.6k ) { |
350 | 16.6k | C->deleteValue(); |
351 | 16.6k | C = nullptr; |
352 | 16.6k | } |
353 | 142k | } else { |
354 | 125k | ValueMap[Inst] = C; |
355 | 125k | } |
356 | 142k | if (C142k ) { |
357 | 125k | // Otherwise, stick the new instruction into the new block! |
358 | 125k | C->setName(Inst->getName()); |
359 | 125k | C->insertBefore(LoopEntryBranch); |
360 | 125k | |
361 | 125k | if (auto *II = dyn_cast<IntrinsicInst>(C)) |
362 | 135 | if (135 II->getIntrinsicID() == Intrinsic::assume135 ) |
363 | 0 | AC->registerAssumption(II); |
364 | 125k | } |
365 | 150k | } |
366 | 51.1k | |
367 | 51.1k | // Along with all the other instructions, we just cloned OrigHeader's |
368 | 51.1k | // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's |
369 | 51.1k | // successors by duplicating their incoming values for OrigHeader. |
370 | 51.1k | TerminatorInst *TI = OrigHeader->getTerminator(); |
371 | 51.1k | for (BasicBlock *SuccBB : TI->successors()) |
372 | 102k | for (BasicBlock::iterator BI = SuccBB->begin(); |
373 | 131k | PHINode *PN131k = dyn_cast<PHINode>(BI); ++BI29.2k ) |
374 | 29.2k | PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); |
375 | 51.1k | |
376 | 51.1k | // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove |
377 | 51.1k | // OrigPreHeader's old terminator (the original branch into the loop), and |
378 | 51.1k | // remove the corresponding incoming values from the PHI nodes in OrigHeader. |
379 | 51.1k | LoopEntryBranch->eraseFromParent(); |
380 | 51.1k | |
381 | 51.1k | |
382 | 51.1k | SmallVector<PHINode*, 2> InsertedPHIs; |
383 | 51.1k | // If there were any uses of instructions in the duplicated block outside the |
384 | 51.1k | // loop, update them, inserting PHI nodes as required |
385 | 51.1k | RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, |
386 | 51.1k | &InsertedPHIs); |
387 | 51.1k | |
388 | 51.1k | // Attach dbg.value intrinsics to the new phis if that phi uses a value that |
389 | 51.1k | // previously had debug metadata attached. This keeps the debug info |
390 | 51.1k | // up-to-date in the loop body. |
391 | 51.1k | if (!InsertedPHIs.empty()) |
392 | 50.3k | insertDebugValues(OrigHeader, InsertedPHIs); |
393 | 51.1k | |
394 | 51.1k | // NewHeader is now the header of the loop. |
395 | 51.1k | L->moveToHeader(NewHeader); |
396 | 51.1k | assert(L->getHeader() == NewHeader && "Latch block is our new header"); |
397 | 51.1k | |
398 | 51.1k | // Inform DT about changes to the CFG. |
399 | 51.1k | if (DT51.1k ) { |
400 | 51.1k | // The OrigPreheader branches to the NewHeader and Exit now. Then, inform |
401 | 51.1k | // the DT about the removed edge to the OrigHeader (that got removed). |
402 | 51.1k | SmallVector<DominatorTree::UpdateType, 3> Updates; |
403 | 51.1k | Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit}); |
404 | 51.1k | Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader}); |
405 | 51.1k | Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader}); |
406 | 51.1k | DT->applyUpdates(Updates); |
407 | 51.1k | } |
408 | 51.1k | |
409 | 51.1k | // At this point, we've finished our major CFG changes. As part of cloning |
410 | 51.1k | // the loop into the preheader we've simplified instructions and the |
411 | 51.1k | // duplicated conditional branch may now be branching on a constant. If it is |
412 | 51.1k | // branching on a constant and if that constant means that we enter the loop, |
413 | 51.1k | // then we fold away the cond branch to an uncond branch. This simplifies the |
414 | 51.1k | // loop in cases important for nested loops, and it also means we don't have |
415 | 51.1k | // to split as many edges. |
416 | 51.1k | BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); |
417 | 51.1k | assert(PHBI->isConditional() && "Should be clone of BI condbr!"); |
418 | 51.1k | if (!isa<ConstantInt>(PHBI->getCondition()) || |
419 | 13.1k | PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) != |
420 | 51.1k | NewHeader) { |
421 | 38.0k | // The conditional branch can't be folded, handle the general case. |
422 | 38.0k | // Split edges as necessary to preserve LoopSimplify form. |
423 | 38.0k | |
424 | 38.0k | // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and |
425 | 38.0k | // thus is not a preheader anymore. |
426 | 38.0k | // Split the edge to form a real preheader. |
427 | 38.0k | BasicBlock *NewPH = SplitCriticalEdge( |
428 | 38.0k | OrigPreheader, NewHeader, |
429 | 38.0k | CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); |
430 | 38.0k | NewPH->setName(NewHeader->getName() + ".lr.ph"); |
431 | 38.0k | |
432 | 38.0k | // Preserve canonical loop form, which means that 'Exit' should have only |
433 | 38.0k | // one predecessor. Note that Exit could be an exit block for multiple |
434 | 38.0k | // nested loops, causing both of the edges to now be critical and need to |
435 | 38.0k | // be split. |
436 | 38.0k | SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit)); |
437 | 38.0k | bool SplitLatchEdge = false; |
438 | 81.1k | for (BasicBlock *ExitPred : ExitPreds) { |
439 | 81.1k | // We only need to split loop exit edges. |
440 | 81.1k | Loop *PredLoop = LI->getLoopFor(ExitPred); |
441 | 81.1k | if (!PredLoop || 81.1k PredLoop->contains(Exit)55.4k ) |
442 | 37.6k | continue; |
443 | 43.4k | if (43.4k isa<IndirectBrInst>(ExitPred->getTerminator())43.4k ) |
444 | 1 | continue; |
445 | 43.4k | SplitLatchEdge |= L->getLoopLatch() == ExitPred; |
446 | 43.4k | BasicBlock *ExitSplit = SplitCriticalEdge( |
447 | 43.4k | ExitPred, Exit, |
448 | 43.4k | CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); |
449 | 43.4k | ExitSplit->moveBefore(Exit); |
450 | 43.4k | } |
451 | 38.0k | assert(SplitLatchEdge && |
452 | 38.0k | "Despite splitting all preds, failed to split latch exit?"); |
453 | 51.1k | } else { |
454 | 13.1k | // We can fold the conditional branch in the preheader, this makes things |
455 | 13.1k | // simpler. The first step is to remove the extra edge to the Exit block. |
456 | 13.1k | Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); |
457 | 13.1k | BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); |
458 | 13.1k | NewBI->setDebugLoc(PHBI->getDebugLoc()); |
459 | 13.1k | PHBI->eraseFromParent(); |
460 | 13.1k | |
461 | 13.1k | // With our CFG finalized, update DomTree if it is available. |
462 | 13.1k | if (DT13.1k ) DT->deleteEdge(OrigPreheader, Exit)13.1k ; |
463 | 13.1k | } |
464 | 51.1k | |
465 | 51.1k | assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); |
466 | 51.1k | assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); |
467 | 51.1k | |
468 | 51.1k | // Now that the CFG and DomTree are in a consistent state again, try to merge |
469 | 51.1k | // the OrigHeader block into OrigLatch. This will succeed if they are |
470 | 51.1k | // connected by an unconditional branch. This is just a cleanup so the |
471 | 51.1k | // emitted code isn't too gross in this common case. |
472 | 51.1k | MergeBlockIntoPredecessor(OrigHeader, DT, LI); |
473 | 51.1k | |
474 | 51.1k | DEBUG(dbgs() << "LoopRotation: into "; L->dump()); |
475 | 700k | |
476 | 700k | ++NumRotated; |
477 | 700k | return true; |
478 | 700k | } |
479 | | |
480 | | /// Determine whether the instructions in this range may be safely and cheaply |
481 | | /// speculated. This is not an important enough situation to develop complex |
482 | | /// heuristics. We handle a single arithmetic instruction along with any type |
483 | | /// conversions. |
484 | | static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, |
485 | 31.7k | BasicBlock::iterator End, Loop *L) { |
486 | 31.7k | bool seenIncrement = false; |
487 | 31.7k | bool MultiExitLoop = false; |
488 | 31.7k | |
489 | 31.7k | if (!L->getExitingBlock()) |
490 | 6.83k | MultiExitLoop = true; |
491 | 31.7k | |
492 | 52.4k | for (BasicBlock::iterator I = Begin; I != End52.4k ; ++I20.6k ) { |
493 | 46.9k | |
494 | 46.9k | if (!isSafeToSpeculativelyExecute(&*I)) |
495 | 6.75k | return false; |
496 | 40.2k | |
497 | 40.2k | if (40.2k isa<DbgInfoIntrinsic>(I)40.2k ) |
498 | 3 | continue; |
499 | 40.2k | |
500 | 40.2k | switch (I->getOpcode()) { |
501 | 6.08k | default: |
502 | 6.08k | return false; |
503 | 16.5k | case Instruction::GetElementPtr: |
504 | 16.5k | // GEPs are cheap if all indices are constant. |
505 | 16.5k | if (!cast<GEPOperator>(I)->hasAllConstantIndices()) |
506 | 9.78k | return false; |
507 | 6.73k | // fall-thru to increment case |
508 | 6.73k | LLVM_FALLTHROUGH6.73k ; |
509 | 14.5k | case Instruction::Add: |
510 | 14.5k | case Instruction::Sub: |
511 | 14.5k | case Instruction::And: |
512 | 14.5k | case Instruction::Or: |
513 | 14.5k | case Instruction::Xor: |
514 | 14.5k | case Instruction::Shl: |
515 | 14.5k | case Instruction::LShr: |
516 | 14.5k | case Instruction::AShr: { |
517 | 14.5k | Value *IVOpnd = |
518 | 14.5k | !isa<Constant>(I->getOperand(0)) |
519 | 14.3k | ? I->getOperand(0) |
520 | 220 | : !isa<Constant>(I->getOperand(1)) ? 220 I->getOperand(1)217 : nullptr3 ; |
521 | 14.5k | if (!IVOpnd) |
522 | 3 | return false; |
523 | 14.5k | |
524 | 14.5k | // If increment operand is used outside of the loop, this speculation |
525 | 14.5k | // could cause extra live range interference. |
526 | 14.5k | if (14.5k MultiExitLoop14.5k ) { |
527 | 9.24k | for (User *UseI : IVOpnd->users()) { |
528 | 9.24k | auto *UserInst = cast<Instruction>(UseI); |
529 | 9.24k | if (!L->contains(UserInst)) |
530 | 1.90k | return false; |
531 | 12.6k | } |
532 | 4.32k | } |
533 | 12.6k | |
534 | 12.6k | if (12.6k seenIncrement12.6k ) |
535 | 1.82k | return false; |
536 | 10.7k | seenIncrement = true; |
537 | 10.7k | break; |
538 | 10.7k | } |
539 | 9.83k | case Instruction::Trunc: |
540 | 9.83k | case Instruction::ZExt: |
541 | 9.83k | case Instruction::SExt: |
542 | 9.83k | // ignore type conversions |
543 | 9.83k | break; |
544 | 46.9k | } |
545 | 46.9k | } |
546 | 5.43k | return true; |
547 | 31.7k | } |
548 | | |
549 | | /// Fold the loop tail into the loop exit by speculating the loop tail |
550 | | /// instructions. Typically, this is a single post-increment. In the case of a |
551 | | /// simple 2-block loop, hoisting the increment can be much better than |
552 | | /// duplicating the entire loop header. In the case of loops with early exits, |
553 | | /// rotation will not work anyway, but simplifyLoopLatch will put the loop in |
554 | | /// canonical form so downstream passes can handle it. |
555 | | /// |
556 | | /// I don't believe this invalidates SCEV. |
557 | 700k | bool LoopRotate::simplifyLoopLatch(Loop *L) { |
558 | 700k | BasicBlock *Latch = L->getLoopLatch(); |
559 | 700k | if (!Latch || 700k Latch->hasAddressTaken()700k ) |
560 | 13 | return false; |
561 | 700k | |
562 | 700k | BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); |
563 | 700k | if (!Jmp || 700k !Jmp->isUnconditional()700k ) |
564 | 627k | return false; |
565 | 73.2k | |
566 | 73.2k | BasicBlock *LastExit = Latch->getSinglePredecessor(); |
567 | 73.2k | if (!LastExit || 73.2k !L->isLoopExiting(LastExit)38.1k ) |
568 | 39.1k | return false; |
569 | 34.0k | |
570 | 34.0k | BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); |
571 | 34.0k | if (!BI) |
572 | 2.30k | return false; |
573 | 31.7k | |
574 | 31.7k | if (31.7k !shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L)31.7k ) |
575 | 26.3k | return false; |
576 | 5.43k | |
577 | 5.43k | DEBUG5.43k (dbgs() << "Folding loop latch " << Latch->getName() << " into " |
578 | 5.43k | << LastExit->getName() << "\n"); |
579 | 5.43k | |
580 | 5.43k | // Hoist the instructions from Latch into LastExit. |
581 | 5.43k | LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), |
582 | 5.43k | Latch->begin(), Jmp->getIterator()); |
583 | 5.43k | |
584 | 5.43k | unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 02.60k : 12.82k ; |
585 | 5.43k | BasicBlock *Header = Jmp->getSuccessor(0); |
586 | 5.43k | assert(Header == L->getHeader() && "expected a backward branch"); |
587 | 5.43k | |
588 | 5.43k | // Remove Latch from the CFG so that LastExit becomes the new Latch. |
589 | 5.43k | BI->setSuccessor(FallThruPath, Header); |
590 | 5.43k | Latch->replaceSuccessorsPhiUsesWith(LastExit); |
591 | 5.43k | Jmp->eraseFromParent(); |
592 | 5.43k | |
593 | 5.43k | // Nuke the Latch block. |
594 | 5.43k | assert(Latch->empty() && "unable to evacuate Latch"); |
595 | 5.43k | LI->removeBlock(Latch); |
596 | 5.43k | if (DT) |
597 | 5.43k | DT->eraseNode(Latch); |
598 | 700k | Latch->eraseFromParent(); |
599 | 700k | return true; |
600 | 700k | } |
601 | | |
602 | | /// Rotate \c L, and return true if any modification was made. |
603 | 700k | bool LoopRotate::processLoop(Loop *L) { |
604 | 700k | // Save the loop metadata. |
605 | 700k | MDNode *LoopMD = L->getLoopID(); |
606 | 700k | |
607 | 700k | // Simplify the loop latch before attempting to rotate the header |
608 | 700k | // upward. Rotation may not be needed if the loop tail can be folded into the |
609 | 700k | // loop exit. |
610 | 700k | bool SimplifiedLatch = simplifyLoopLatch(L); |
611 | 700k | |
612 | 700k | bool MadeChange = rotateLoop(L, SimplifiedLatch); |
613 | 700k | assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) && |
614 | 700k | "Loop latch should be exiting after loop-rotate."); |
615 | 700k | |
616 | 700k | // Restore the loop metadata. |
617 | 700k | // NB! We presume LoopRotation DOESN'T ADD its own metadata. |
618 | 700k | if ((MadeChange || 700k SimplifiedLatch649k ) && LoopMD54.0k ) |
619 | 2.91k | L->setLoopID(LoopMD); |
620 | 700k | |
621 | 700k | return MadeChange; |
622 | 700k | } |
623 | | |
624 | | LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication) |
625 | 95 | : EnableHeaderDuplication(EnableHeaderDuplication) {} |
626 | | |
627 | | PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, |
628 | | LoopStandardAnalysisResults &AR, |
629 | 59 | LPMUpdater &) { |
630 | 59 | int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold56 : 03 ; |
631 | 59 | const DataLayout &DL = L.getHeader()->getModule()->getDataLayout(); |
632 | 59 | const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL); |
633 | 59 | LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, |
634 | 59 | SQ); |
635 | 59 | |
636 | 59 | bool Changed = LR.processLoop(&L); |
637 | 59 | if (!Changed) |
638 | 49 | return PreservedAnalyses::all(); |
639 | 10 | |
640 | 10 | return getLoopPassPreservedAnalyses(); |
641 | 10 | } |
642 | | |
643 | | namespace { |
644 | | |
645 | | class LoopRotateLegacyPass : public LoopPass { |
646 | | unsigned MaxHeaderSize; |
647 | | |
648 | | public: |
649 | | static char ID; // Pass ID, replacement for typeid |
650 | 34.5k | LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { |
651 | 34.5k | initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry()); |
652 | 34.5k | if (SpecifiedMaxHeaderSize == -1) |
653 | 32.0k | MaxHeaderSize = DefaultRotationThreshold; |
654 | 34.5k | else |
655 | 2.51k | MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); |
656 | 34.5k | } |
657 | | |
658 | | // LCSSA form makes instruction renaming easier. |
659 | 34.5k | void getAnalysisUsage(AnalysisUsage &AU) const override { |
660 | 34.5k | AU.addRequired<AssumptionCacheTracker>(); |
661 | 34.5k | AU.addRequired<TargetTransformInfoWrapperPass>(); |
662 | 34.5k | getLoopAnalysisUsage(AU); |
663 | 34.5k | } |
664 | | |
665 | 700k | bool runOnLoop(Loop *L, LPPassManager &LPM) override { |
666 | 700k | if (skipLoop(L)) |
667 | 40 | return false; |
668 | 700k | Function &F = *L->getHeader()->getParent(); |
669 | 700k | |
670 | 700k | auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
671 | 700k | const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
672 | 700k | auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); |
673 | 700k | auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); |
674 | 700k | auto *DT = DTWP ? &DTWP->getDomTree()700k : nullptr0 ; |
675 | 700k | auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); |
676 | 700k | auto *SE = SEWP ? &SEWP->getSE()700k : nullptr0 ; |
677 | 700k | const SimplifyQuery SQ = getBestSimplifyQuery(*this, F); |
678 | 700k | LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE, SQ); |
679 | 700k | return LR.processLoop(L); |
680 | 700k | } |
681 | | }; |
682 | | } |
683 | | |
684 | | char LoopRotateLegacyPass::ID = 0; |
685 | 41.6k | INITIALIZE_PASS_BEGIN41.6k (LoopRotateLegacyPass, "loop-rotate", "Rotate Loops",
|
686 | 41.6k | false, false) |
687 | 41.6k | INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) |
688 | 41.6k | INITIALIZE_PASS_DEPENDENCY(LoopPass) |
689 | 41.6k | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) |
690 | 41.6k | INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false, |
691 | | false) |
692 | | |
693 | 34.5k | Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { |
694 | 34.5k | return new LoopRotateLegacyPass(MaxHeaderSize); |
695 | 34.5k | } |