/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- LoopInterchange.cpp - Loop interchange pass------------------------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // This Pass handles loop interchange transform. |
11 | | // This pass interchanges loops to provide a more cache-friendly memory access |
12 | | // patterns. |
13 | | // |
14 | | //===----------------------------------------------------------------------===// |
15 | | |
16 | | #include "llvm/ADT/SmallVector.h" |
17 | | #include "llvm/Analysis/AliasAnalysis.h" |
18 | | #include "llvm/Analysis/AssumptionCache.h" |
19 | | #include "llvm/Analysis/BlockFrequencyInfo.h" |
20 | | #include "llvm/Analysis/CodeMetrics.h" |
21 | | #include "llvm/Analysis/DependenceAnalysis.h" |
22 | | #include "llvm/Analysis/LoopInfo.h" |
23 | | #include "llvm/Analysis/LoopIterator.h" |
24 | | #include "llvm/Analysis/LoopPass.h" |
25 | | #include "llvm/Analysis/OptimizationDiagnosticInfo.h" |
26 | | #include "llvm/Analysis/ScalarEvolution.h" |
27 | | #include "llvm/Analysis/ScalarEvolutionExpander.h" |
28 | | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
29 | | #include "llvm/Analysis/TargetTransformInfo.h" |
30 | | #include "llvm/Analysis/ValueTracking.h" |
31 | | #include "llvm/IR/Dominators.h" |
32 | | #include "llvm/IR/Function.h" |
33 | | #include "llvm/IR/IRBuilder.h" |
34 | | #include "llvm/IR/InstIterator.h" |
35 | | #include "llvm/IR/IntrinsicInst.h" |
36 | | #include "llvm/IR/Module.h" |
37 | | #include "llvm/Pass.h" |
38 | | #include "llvm/Support/Debug.h" |
39 | | #include "llvm/Support/raw_ostream.h" |
40 | | #include "llvm/Transforms/Scalar.h" |
41 | | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
42 | | #include "llvm/Transforms/Utils/LoopUtils.h" |
43 | | |
44 | | using namespace llvm; |
45 | | |
46 | 26 | #define DEBUG_TYPE "loop-interchange" |
47 | | |
48 | | static cl::opt<int> LoopInterchangeCostThreshold( |
49 | | "loop-interchange-threshold", cl::init(0), cl::Hidden, |
50 | | cl::desc("Interchange if you gain more than this number")); |
51 | | |
52 | | namespace { |
53 | | |
54 | | typedef SmallVector<Loop *, 8> LoopVector; |
55 | | |
56 | | // TODO: Check if we can use a sparse matrix here. |
57 | | typedef std::vector<std::vector<char>> CharMatrix; |
58 | | |
59 | | // Maximum number of dependencies that can be handled in the dependency matrix. |
60 | | static const unsigned MaxMemInstrCount = 100; |
61 | | |
62 | | // Maximum loop depth supported. |
63 | | static const unsigned MaxLoopNestDepth = 10; |
64 | | |
65 | | struct LoopInterchange; |
66 | | |
67 | | #ifdef DUMP_DEP_MATRICIES |
68 | | void printDepMatrix(CharMatrix &DepMatrix) { |
69 | | for (auto &Row : DepMatrix) { |
70 | | for (auto D : Row) |
71 | | DEBUG(dbgs() << D << " "); |
72 | | DEBUG(dbgs() << "\n"); |
73 | | } |
74 | | } |
75 | | #endif |
76 | | |
77 | | static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, |
78 | 25 | Loop *L, DependenceInfo *DI) { |
79 | 25 | typedef SmallVector<Value *, 16> ValueVector; |
80 | 25 | ValueVector MemInstr; |
81 | 25 | |
82 | 25 | // For each block. |
83 | 97 | for (BasicBlock *BB : L->blocks()) { |
84 | 97 | // Scan the BB and collect legal loads and stores. |
85 | 472 | for (Instruction &I : *BB) { |
86 | 472 | if (!isa<Instruction>(I)) |
87 | 0 | return false; |
88 | 472 | if (auto *472 Ld472 = dyn_cast<LoadInst>(&I)) { |
89 | 37 | if (!Ld->isSimple()) |
90 | 0 | return false; |
91 | 37 | MemInstr.push_back(&I); |
92 | 472 | } else if (auto *435 St435 = dyn_cast<StoreInst>(&I)) { |
93 | 34 | if (!St->isSimple()) |
94 | 0 | return false; |
95 | 34 | MemInstr.push_back(&I); |
96 | 34 | } |
97 | 472 | } |
98 | 97 | } |
99 | 25 | |
100 | 25 | DEBUG25 (dbgs() << "Found " << MemInstr.size() |
101 | 25 | << " Loads and Stores to analyze\n"); |
102 | 25 | |
103 | 25 | ValueVector::iterator I, IE, J, JE; |
104 | 25 | |
105 | 96 | for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE96 ; ++I71 ) { |
106 | 225 | for (J = I, JE = MemInstr.end(); J != JE225 ; ++J154 ) { |
107 | 154 | std::vector<char> Dep; |
108 | 154 | Instruction *Src = cast<Instruction>(*I); |
109 | 154 | Instruction *Dst = cast<Instruction>(*J); |
110 | 154 | if (Src == Dst) |
111 | 71 | continue; |
112 | 83 | // Ignore Input dependencies. |
113 | 83 | if (83 isa<LoadInst>(Src) && 83 isa<LoadInst>(Dst)68 ) |
114 | 22 | continue; |
115 | 61 | // Track Output, Flow, and Anti dependencies. |
116 | 61 | if (auto 61 D61 = DI->depends(Src, Dst, true)) { |
117 | 29 | assert(D->isOrdered() && "Expected an output, flow or anti dep."); |
118 | 29 | DEBUG(StringRef DepType = |
119 | 29 | D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output"; |
120 | 29 | dbgs() << "Found " << DepType |
121 | 29 | << " dependency between Src and Dst\n" |
122 | 29 | << " Src:" << *Src << "\n Dst:" << *Dst << '\n'); |
123 | 29 | unsigned Levels = D->getLevels(); |
124 | 29 | char Direction; |
125 | 86 | for (unsigned II = 1; II <= Levels86 ; ++II57 ) { |
126 | 57 | const SCEV *Distance = D->getDistance(II); |
127 | 57 | const SCEVConstant *SCEVConst = |
128 | 57 | dyn_cast_or_null<SCEVConstant>(Distance); |
129 | 57 | if (SCEVConst57 ) { |
130 | 44 | const ConstantInt *CI = SCEVConst->getValue(); |
131 | 44 | if (CI->isNegative()) |
132 | 6 | Direction = '<'; |
133 | 38 | else if (38 CI->isZero()38 ) |
134 | 37 | Direction = '='; |
135 | 38 | else |
136 | 1 | Direction = '>'; |
137 | 44 | Dep.push_back(Direction); |
138 | 57 | } else if (13 D->isScalar(II)13 ) { |
139 | 13 | Direction = 'S'; |
140 | 13 | Dep.push_back(Direction); |
141 | 13 | } else { |
142 | 0 | unsigned Dir = D->getDirection(II); |
143 | 0 | if (Dir == Dependence::DVEntry::LT || |
144 | 0 | Dir == Dependence::DVEntry::LE) |
145 | 0 | Direction = '<'; |
146 | 0 | else if (0 Dir == Dependence::DVEntry::GT || |
147 | 0 | Dir == Dependence::DVEntry::GE) |
148 | 0 | Direction = '>'; |
149 | 0 | else if (0 Dir == Dependence::DVEntry::EQ0 ) |
150 | 0 | Direction = '='; |
151 | 0 | else |
152 | 0 | Direction = '*'; |
153 | 13 | Dep.push_back(Direction); |
154 | 13 | } |
155 | 57 | } |
156 | 39 | while (Dep.size() != Level39 ) { |
157 | 10 | Dep.push_back('I'); |
158 | 10 | } |
159 | 29 | |
160 | 29 | DepMatrix.push_back(Dep); |
161 | 29 | if (DepMatrix.size() > MaxMemInstrCount29 ) { |
162 | 0 | DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount |
163 | 0 | << " dependencies inside loop\n"); |
164 | 0 | return false; |
165 | 0 | } |
166 | 29 | } |
167 | 154 | } |
168 | 71 | } |
169 | 25 | |
170 | 25 | // We don't have a DepMatrix to check legality return false. |
171 | 25 | if (25 DepMatrix.size() == 025 ) |
172 | 0 | return false; |
173 | 25 | return true; |
174 | 25 | } |
175 | | |
176 | | // A loop is moved from index 'from' to an index 'to'. Update the Dependence |
177 | | // matrix by exchanging the two columns. |
178 | | static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, |
179 | 13 | unsigned ToIndx) { |
180 | 13 | unsigned numRows = DepMatrix.size(); |
181 | 29 | for (unsigned i = 0; i < numRows29 ; ++i16 ) { |
182 | 16 | char TmpVal = DepMatrix[i][ToIndx]; |
183 | 16 | DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx]; |
184 | 16 | DepMatrix[i][FromIndx] = TmpVal; |
185 | 16 | } |
186 | 13 | } |
187 | | |
188 | | // Checks if outermost non '=','S'or'I' dependence in the dependence matrix is |
189 | | // '>' |
190 | | static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row, |
191 | 33 | unsigned Column) { |
192 | 71 | for (unsigned i = 0; i <= Column71 ; ++i38 ) { |
193 | 42 | if (DepMatrix[Row][i] == '<') |
194 | 3 | return false; |
195 | 39 | if (39 DepMatrix[Row][i] == '>'39 ) |
196 | 1 | return true; |
197 | 42 | } |
198 | 33 | // All dependencies were '=','S' or 'I' |
199 | 29 | return false; |
200 | 33 | } |
201 | | |
202 | | // Checks if no dependence exist in the dependency matrix in Row before Column. |
203 | | static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row, |
204 | 0 | unsigned Column) { |
205 | 0 | for (unsigned i = 0; i < Column0 ; ++i0 ) { |
206 | 0 | if (DepMatrix[Row][i] != '=' && 0 DepMatrix[Row][i] != 'S'0 && |
207 | 0 | DepMatrix[Row][i] != 'I') |
208 | 0 | return false; |
209 | 0 | } |
210 | 0 | return true; |
211 | 0 | } |
212 | | |
213 | | static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, |
214 | | unsigned OuterLoopId, char InnerDep, |
215 | 33 | char OuterDep) { |
216 | 33 | |
217 | 33 | if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId)) |
218 | 1 | return false; |
219 | 32 | |
220 | 32 | if (32 InnerDep == OuterDep32 ) |
221 | 18 | return true; |
222 | 14 | |
223 | 14 | // It is legal to interchange if and only if after interchange no row has a |
224 | 14 | // '>' direction as the leftmost non-'='. |
225 | 14 | |
226 | 14 | if (14 InnerDep == '=' || 14 InnerDep == 'S'10 || InnerDep == 'I'10 ) |
227 | 13 | return true; |
228 | 1 | |
229 | 1 | if (1 InnerDep == '<'1 ) |
230 | 1 | return true; |
231 | 0 |
|
232 | 0 | if (0 InnerDep == '>'0 ) { |
233 | 0 | // If OuterLoopId represents outermost loop then interchanging will make the |
234 | 0 | // 1st dependency as '>' |
235 | 0 | if (OuterLoopId == 0) |
236 | 0 | return false; |
237 | 0 |
|
238 | 0 | // If all dependencies before OuterloopId are '=','S'or 'I'. Then |
239 | 0 | // interchanging will result in this row having an outermost non '=' |
240 | 0 | // dependency of '>' |
241 | 0 | if (0 !containsNoDependence(DepMatrix, Row, OuterLoopId)0 ) |
242 | 0 | return true; |
243 | 0 | } |
244 | 0 |
|
245 | 0 | return false; |
246 | 0 | } |
247 | | |
248 | | // Checks if it is legal to interchange 2 loops. |
249 | | // [Theorem] A permutation of the loops in a perfect nest is legal if and only |
250 | | // if the direction matrix, after the same permutation is applied to its |
251 | | // columns, has no ">" direction as the leftmost non-"=" direction in any row. |
252 | | static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, |
253 | | unsigned InnerLoopId, |
254 | 27 | unsigned OuterLoopId) { |
255 | 27 | |
256 | 27 | unsigned NumRows = DepMatrix.size(); |
257 | 27 | // For each row check if it is valid to interchange. |
258 | 59 | for (unsigned Row = 0; Row < NumRows59 ; ++Row32 ) { |
259 | 33 | char InnerDep = DepMatrix[Row][InnerLoopId]; |
260 | 33 | char OuterDep = DepMatrix[Row][OuterLoopId]; |
261 | 33 | if (InnerDep == '*' || 33 OuterDep == '*'33 ) |
262 | 0 | return false; |
263 | 33 | if (33 !validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep)33 ) |
264 | 1 | return false; |
265 | 33 | } |
266 | 26 | return true; |
267 | 27 | } |
268 | | |
269 | 25 | static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) { |
270 | 25 | |
271 | 25 | DEBUG(dbgs() << "Calling populateWorklist on Func: " |
272 | 25 | << L.getHeader()->getParent()->getName() << " Loop: %" |
273 | 25 | << L.getHeader()->getName() << '\n'); |
274 | 25 | LoopVector LoopList; |
275 | 25 | Loop *CurrentLoop = &L; |
276 | 25 | const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); |
277 | 55 | while (!Vec->empty()55 ) { |
278 | 30 | // The current loop has multiple subloops in it hence it is not tightly |
279 | 30 | // nested. |
280 | 30 | // Discard all loops above it added into Worklist. |
281 | 30 | if (Vec->size() != 130 ) { |
282 | 0 | LoopList.clear(); |
283 | 0 | return; |
284 | 0 | } |
285 | 30 | LoopList.push_back(CurrentLoop); |
286 | 30 | CurrentLoop = Vec->front(); |
287 | 30 | Vec = &CurrentLoop->getSubLoops(); |
288 | 30 | } |
289 | 25 | LoopList.push_back(CurrentLoop); |
290 | 25 | V.push_back(std::move(LoopList)); |
291 | 25 | } |
292 | | |
293 | 12 | static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { |
294 | 12 | PHINode *InnerIndexVar = L->getCanonicalInductionVariable(); |
295 | 12 | if (InnerIndexVar) |
296 | 4 | return InnerIndexVar; |
297 | 8 | if (8 L->getLoopLatch() == nullptr || 8 L->getLoopPredecessor() == nullptr8 ) |
298 | 0 | return nullptr; |
299 | 8 | for (BasicBlock::iterator I = L->getHeader()->begin(); 8 isa<PHINode>(I)8 ; ++I0 ) { |
300 | 8 | PHINode *PhiVar = cast<PHINode>(I); |
301 | 8 | Type *PhiTy = PhiVar->getType(); |
302 | 8 | if (!PhiTy->isIntegerTy() && 8 !PhiTy->isFloatingPointTy()0 && |
303 | 0 | !PhiTy->isPointerTy()) |
304 | 0 | return nullptr; |
305 | 8 | const SCEVAddRecExpr *AddRec = |
306 | 8 | dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar)); |
307 | 8 | if (!AddRec || 8 !AddRec->isAffine()8 ) |
308 | 0 | continue; |
309 | 8 | const SCEV *Step = AddRec->getStepRecurrence(*SE); |
310 | 8 | if (!isa<SCEVConstant>(Step)) |
311 | 0 | continue; |
312 | 8 | // Found the induction variable. |
313 | 8 | // FIXME: Handle loops with more than one induction variable. Note that, |
314 | 8 | // currently, legality makes sure we have only one induction variable. |
315 | 8 | return PhiVar; |
316 | 8 | } |
317 | 0 | return nullptr; |
318 | 12 | } |
319 | | |
320 | | /// LoopInterchangeLegality checks if it is legal to interchange the loop. |
321 | | class LoopInterchangeLegality { |
322 | | public: |
323 | | LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, |
324 | | LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA, |
325 | | OptimizationRemarkEmitter *ORE) |
326 | | : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), |
327 | 27 | PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {} |
328 | | |
329 | | /// Check if the loops can be interchanged. |
330 | | bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, |
331 | | CharMatrix &DepMatrix); |
332 | | /// Check if the loop structure is understood. We do not handle triangular |
333 | | /// loops for now. |
334 | | bool isLoopStructureUnderstood(PHINode *InnerInductionVar); |
335 | | |
336 | | bool currentLimitations(); |
337 | | |
338 | 13 | bool hasInnerLoopReduction() { return InnerLoopHasReduction; } |
339 | | |
340 | | private: |
341 | | bool tightlyNested(Loop *Outer, Loop *Inner); |
342 | | bool containsUnsafeInstructionsInHeader(BasicBlock *BB); |
343 | | bool areAllUsesReductions(Instruction *Ins, Loop *L); |
344 | | bool containsUnsafeInstructionsInLatch(BasicBlock *BB); |
345 | | bool findInductionAndReductions(Loop *L, |
346 | | SmallVector<PHINode *, 8> &Inductions, |
347 | | SmallVector<PHINode *, 8> &Reductions); |
348 | | Loop *OuterLoop; |
349 | | Loop *InnerLoop; |
350 | | |
351 | | ScalarEvolution *SE; |
352 | | LoopInfo *LI; |
353 | | DominatorTree *DT; |
354 | | bool PreserveLCSSA; |
355 | | /// Interface to emit optimization remarks. |
356 | | OptimizationRemarkEmitter *ORE; |
357 | | |
358 | | bool InnerLoopHasReduction; |
359 | | }; |
360 | | |
361 | | /// LoopInterchangeProfitability checks if it is profitable to interchange the |
362 | | /// loop. |
363 | | class LoopInterchangeProfitability { |
364 | | public: |
365 | | LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE, |
366 | | OptimizationRemarkEmitter *ORE) |
367 | 17 | : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} |
368 | | |
369 | | /// Check if the loop interchange is profitable. |
370 | | bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, |
371 | | CharMatrix &DepMatrix); |
372 | | |
373 | | private: |
374 | | int getInstrOrderCost(); |
375 | | |
376 | | Loop *OuterLoop; |
377 | | Loop *InnerLoop; |
378 | | |
379 | | /// Scev analysis. |
380 | | ScalarEvolution *SE; |
381 | | /// Interface to emit optimization remarks. |
382 | | OptimizationRemarkEmitter *ORE; |
383 | | }; |
384 | | |
385 | | /// LoopInterchangeTransform interchanges the loop. |
386 | | class LoopInterchangeTransform { |
387 | | public: |
388 | | LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, |
389 | | LoopInfo *LI, DominatorTree *DT, |
390 | | BasicBlock *LoopNestExit, |
391 | | bool InnerLoopContainsReductions) |
392 | | : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), |
393 | | LoopExit(LoopNestExit), |
394 | 13 | InnerLoopHasReduction(InnerLoopContainsReductions) {} |
395 | | |
396 | | /// Interchange OuterLoop and InnerLoop. |
397 | | bool transform(); |
398 | | void restructureLoops(Loop *InnerLoop, Loop *OuterLoop); |
399 | | void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); |
400 | | |
401 | | private: |
402 | | void splitInnerLoopLatch(Instruction *); |
403 | | void splitInnerLoopHeader(); |
404 | | bool adjustLoopLinks(); |
405 | | void adjustLoopPreheaders(); |
406 | | bool adjustLoopBranches(); |
407 | | void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred, |
408 | | BasicBlock *NewPred); |
409 | | |
410 | | Loop *OuterLoop; |
411 | | Loop *InnerLoop; |
412 | | |
413 | | /// Scev analysis. |
414 | | ScalarEvolution *SE; |
415 | | LoopInfo *LI; |
416 | | DominatorTree *DT; |
417 | | BasicBlock *LoopExit; |
418 | | bool InnerLoopHasReduction; |
419 | | }; |
420 | | |
421 | | // Main LoopInterchange Pass. |
422 | | struct LoopInterchange : public FunctionPass { |
423 | | static char ID; |
424 | | ScalarEvolution *SE; |
425 | | LoopInfo *LI; |
426 | | DependenceInfo *DI; |
427 | | DominatorTree *DT; |
428 | | bool PreserveLCSSA; |
429 | | /// Interface to emit optimization remarks. |
430 | | OptimizationRemarkEmitter *ORE; |
431 | | |
432 | | LoopInterchange() |
433 | 15 | : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) { |
434 | 15 | initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); |
435 | 15 | } |
436 | | |
437 | 15 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
438 | 15 | AU.addRequired<ScalarEvolutionWrapperPass>(); |
439 | 15 | AU.addRequired<AAResultsWrapperPass>(); |
440 | 15 | AU.addRequired<DominatorTreeWrapperPass>(); |
441 | 15 | AU.addRequired<LoopInfoWrapperPass>(); |
442 | 15 | AU.addRequired<DependenceAnalysisWrapperPass>(); |
443 | 15 | AU.addRequiredID(LoopSimplifyID); |
444 | 15 | AU.addRequiredID(LCSSAID); |
445 | 15 | AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
446 | 15 | } |
447 | | |
448 | 25 | bool runOnFunction(Function &F) override { |
449 | 25 | if (skipFunction(F)) |
450 | 0 | return false; |
451 | 25 | |
452 | 25 | SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
453 | 25 | LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
454 | 25 | DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI(); |
455 | 25 | auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); |
456 | 25 | DT = DTWP ? &DTWP->getDomTree()25 : nullptr0 ; |
457 | 25 | ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
458 | 25 | PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); |
459 | 25 | |
460 | 25 | // Build up a worklist of loop pairs to analyze. |
461 | 25 | SmallVector<LoopVector, 8> Worklist; |
462 | 25 | |
463 | 25 | for (Loop *L : *LI) |
464 | 25 | populateWorklist(*L, Worklist); |
465 | 25 | |
466 | 25 | DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n"); |
467 | 25 | bool Changed = true; |
468 | 50 | while (!Worklist.empty()50 ) { |
469 | 25 | LoopVector LoopList = Worklist.pop_back_val(); |
470 | 25 | Changed = processLoopList(LoopList, F); |
471 | 25 | } |
472 | 25 | return Changed; |
473 | 25 | } |
474 | | |
475 | 25 | bool isComputableLoopNest(LoopVector LoopList) { |
476 | 55 | for (Loop *L : LoopList) { |
477 | 55 | const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); |
478 | 55 | if (ExitCountOuter == SE->getCouldNotCompute()55 ) { |
479 | 0 | DEBUG(dbgs() << "Couldn't compute backedge count\n"); |
480 | 0 | return false; |
481 | 0 | } |
482 | 55 | if (55 L->getNumBackEdges() != 155 ) { |
483 | 0 | DEBUG(dbgs() << "NumBackEdges is not equal to 1\n"); |
484 | 0 | return false; |
485 | 0 | } |
486 | 55 | if (55 !L->getExitingBlock()55 ) { |
487 | 0 | DEBUG(dbgs() << "Loop doesn't have unique exit block\n"); |
488 | 0 | return false; |
489 | 0 | } |
490 | 25 | } |
491 | 25 | return true; |
492 | 25 | } |
493 | | |
494 | 24 | unsigned selectLoopForInterchange(const LoopVector &LoopList) { |
495 | 24 | // TODO: Add a better heuristic to select the loop to be interchanged based |
496 | 24 | // on the dependence matrix. Currently we select the innermost loop. |
497 | 24 | return LoopList.size() - 1; |
498 | 24 | } |
499 | | |
500 | 25 | bool processLoopList(LoopVector LoopList, Function &F) { |
501 | 25 | |
502 | 25 | bool Changed = false; |
503 | 25 | unsigned LoopNestDepth = LoopList.size(); |
504 | 25 | if (LoopNestDepth < 225 ) { |
505 | 0 | DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n"); |
506 | 0 | return false; |
507 | 0 | } |
508 | 25 | if (25 LoopNestDepth > MaxLoopNestDepth25 ) { |
509 | 0 | DEBUG(dbgs() << "Cannot handle loops of depth greater than " |
510 | 0 | << MaxLoopNestDepth << "\n"); |
511 | 0 | return false; |
512 | 0 | } |
513 | 25 | if (25 !isComputableLoopNest(LoopList)25 ) { |
514 | 0 | DEBUG(dbgs() << "Not valid loop candidate for interchange\n"); |
515 | 0 | return false; |
516 | 0 | } |
517 | 25 | |
518 | 25 | DEBUG25 (dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n"); |
519 | 25 | |
520 | 25 | CharMatrix DependencyMatrix; |
521 | 25 | Loop *OuterMostLoop = *(LoopList.begin()); |
522 | 25 | if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, |
523 | 25 | OuterMostLoop, DI)) { |
524 | 0 | DEBUG(dbgs() << "Populating dependency matrix failed\n"); |
525 | 0 | return false; |
526 | 0 | } |
527 | | #ifdef DUMP_DEP_MATRICIES |
528 | | DEBUG(dbgs() << "Dependence before interchange\n"); |
529 | | printDepMatrix(DependencyMatrix); |
530 | | #endif |
531 | | |
532 | 25 | BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch(); |
533 | 25 | BranchInst *OuterMostLoopLatchBI = |
534 | 25 | dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator()); |
535 | 25 | if (!OuterMostLoopLatchBI) |
536 | 0 | return false; |
537 | 25 | |
538 | 25 | // Since we currently do not handle LCSSA PHI's any failure in loop |
539 | 25 | // condition will now branch to LoopNestExit. |
540 | 25 | // TODO: This should be removed once we handle LCSSA PHI nodes. |
541 | 25 | |
542 | 25 | // Get the Outermost loop exit. |
543 | 25 | BasicBlock *LoopNestExit; |
544 | 25 | if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader()) |
545 | 7 | LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1); |
546 | 25 | else |
547 | 18 | LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0); |
548 | 25 | |
549 | 25 | if (isa<PHINode>(LoopNestExit->begin())25 ) { |
550 | 1 | DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now " |
551 | 1 | "since on failure all loops branch to loop nest exit.\n"); |
552 | 1 | return false; |
553 | 1 | } |
554 | 24 | |
555 | 24 | unsigned SelecLoopId = selectLoopForInterchange(LoopList); |
556 | 24 | // Move the selected loop outwards to the best possible position. |
557 | 37 | for (unsigned i = SelecLoopId; i > 037 ; i--13 ) { |
558 | 27 | bool Interchanged = |
559 | 27 | processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix); |
560 | 27 | if (!Interchanged) |
561 | 14 | return Changed; |
562 | 13 | // Loops interchanged reflect the same in LoopList |
563 | 13 | std::swap(LoopList[i - 1], LoopList[i]); |
564 | 13 | |
565 | 13 | // Update the DependencyMatrix |
566 | 13 | interChangeDependencies(DependencyMatrix, i, i - 1); |
567 | 13 | DT->recalculate(F); |
568 | | #ifdef DUMP_DEP_MATRICIES |
569 | | DEBUG(dbgs() << "Dependence after interchange\n"); |
570 | | printDepMatrix(DependencyMatrix); |
571 | | #endif |
572 | | Changed |= Interchanged; |
573 | 27 | } |
574 | 10 | return Changed; |
575 | 25 | } |
576 | | |
577 | | bool processLoop(LoopVector LoopList, unsigned InnerLoopId, |
578 | | unsigned OuterLoopId, BasicBlock *LoopNestExit, |
579 | 27 | std::vector<std::vector<char>> &DependencyMatrix) { |
580 | 27 | |
581 | 27 | DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId |
582 | 27 | << " and OuterLoopId = " << OuterLoopId << "\n"); |
583 | 27 | Loop *InnerLoop = LoopList[InnerLoopId]; |
584 | 27 | Loop *OuterLoop = LoopList[OuterLoopId]; |
585 | 27 | |
586 | 27 | LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT, |
587 | 27 | PreserveLCSSA, ORE); |
588 | 27 | if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)27 ) { |
589 | 10 | DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n"); |
590 | 10 | return false; |
591 | 10 | } |
592 | 17 | DEBUG17 (dbgs() << "Loops are legal to interchange\n"); |
593 | 17 | LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); |
594 | 17 | if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)17 ) { |
595 | 4 | DEBUG(dbgs() << "Interchanging loops not profitable\n"); |
596 | 4 | return false; |
597 | 4 | } |
598 | 13 | |
599 | 13 | ORE->emit(OptimizationRemark(13 DEBUG_TYPE13 , "Interchanged", |
600 | 13 | InnerLoop->getStartLoc(), |
601 | 13 | InnerLoop->getHeader()) |
602 | 13 | << "Loop interchanged with enclosing loop."); |
603 | 13 | |
604 | 13 | LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, |
605 | 13 | LoopNestExit, LIL.hasInnerLoopReduction()); |
606 | 13 | LIT.transform(); |
607 | 13 | DEBUG(dbgs() << "Loops interchanged\n"); |
608 | 27 | return true; |
609 | 27 | } |
610 | | }; |
611 | | |
612 | | } // end of namespace |
613 | 3 | bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) { |
614 | 3 | return none_of(Ins->users(), [=](User *U) -> bool { |
615 | 3 | auto *UserIns = dyn_cast<PHINode>(U); |
616 | 3 | RecurrenceDescriptor RD; |
617 | 3 | return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD); |
618 | 3 | }); |
619 | 3 | } |
620 | | |
621 | | bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader( |
622 | 21 | BasicBlock *BB) { |
623 | 74 | for (auto I = BB->begin(), E = BB->end(); I != E74 ; ++I53 ) { |
624 | 57 | // Load corresponding to reduction PHI's are safe while concluding if |
625 | 57 | // tightly nested. |
626 | 57 | if (LoadInst *L57 = dyn_cast<LoadInst>(I)) { |
627 | 3 | if (!areAllUsesReductions(L, InnerLoop)) |
628 | 0 | return true; |
629 | 54 | } else if (54 I->mayHaveSideEffects() || 54 I->mayReadFromMemory()50 ) |
630 | 4 | return true; |
631 | 57 | } |
632 | 17 | return false; |
633 | 21 | } |
634 | | |
635 | | bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch( |
636 | 17 | BasicBlock *BB) { |
637 | 80 | for (auto I = BB->begin(), E = BB->end(); I != E80 ; ++I63 ) { |
638 | 63 | // Stores corresponding to reductions are safe while concluding if tightly |
639 | 63 | // nested. |
640 | 63 | if (StoreInst *L63 = dyn_cast<StoreInst>(I)) { |
641 | 3 | if (!isa<PHINode>(L->getOperand(0))) |
642 | 0 | return true; |
643 | 60 | } else if (60 I->mayHaveSideEffects() || 60 I->mayReadFromMemory()60 ) |
644 | 0 | return true; |
645 | 63 | } |
646 | 17 | return false; |
647 | 17 | } |
648 | | |
649 | 21 | bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { |
650 | 21 | BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); |
651 | 21 | BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); |
652 | 21 | BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); |
653 | 21 | |
654 | 21 | DEBUG(dbgs() << "Checking if loops are tightly nested\n"); |
655 | 21 | |
656 | 21 | // A perfectly nested loop will not have any branch in between the outer and |
657 | 21 | // inner block i.e. outer header will branch to either inner preheader and |
658 | 21 | // outerloop latch. |
659 | 21 | BranchInst *OuterLoopHeaderBI = |
660 | 21 | dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); |
661 | 21 | if (!OuterLoopHeaderBI) |
662 | 0 | return false; |
663 | 21 | |
664 | 21 | for (BasicBlock *Succ : OuterLoopHeaderBI->successors()) |
665 | 23 | if (23 Succ != InnerLoopPreHeader && 23 Succ != OuterLoopLatch2 ) |
666 | 0 | return false; |
667 | 21 | |
668 | 21 | DEBUG21 (dbgs() << "Checking instructions in Loop header and Loop latch\n"); |
669 | 21 | // We do not have any basic block in between now make sure the outer header |
670 | 21 | // and outer loop latch doesn't contain any unsafe instructions. |
671 | 21 | if (containsUnsafeInstructionsInHeader(OuterLoopHeader) || |
672 | 17 | containsUnsafeInstructionsInLatch(OuterLoopLatch)) |
673 | 4 | return false; |
674 | 17 | |
675 | 17 | DEBUG17 (dbgs() << "Loops are perfectly nested\n"); |
676 | 17 | // We have a perfect loop nest. |
677 | 17 | return true; |
678 | 17 | } |
679 | | |
680 | | |
681 | | bool LoopInterchangeLegality::isLoopStructureUnderstood( |
682 | 23 | PHINode *InnerInduction) { |
683 | 23 | |
684 | 23 | unsigned Num = InnerInduction->getNumOperands(); |
685 | 23 | BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); |
686 | 69 | for (unsigned i = 0; i < Num69 ; ++i46 ) { |
687 | 46 | Value *Val = InnerInduction->getOperand(i); |
688 | 46 | if (isa<Constant>(Val)) |
689 | 23 | continue; |
690 | 23 | Instruction *I = dyn_cast<Instruction>(Val); |
691 | 23 | if (!I) |
692 | 0 | return false; |
693 | 23 | // TODO: Handle triangular loops. |
694 | 23 | // e.g. for(int i=0;i<N;i++) |
695 | 23 | // for(int j=i;j<N;j++) |
696 | 23 | unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i); |
697 | 23 | if (InnerInduction->getIncomingBlock(IncomBlockIndx) == |
698 | 23 | InnerLoopPreheader && |
699 | 23 | !OuterLoop->isLoopInvariant(I)0 ) { |
700 | 0 | return false; |
701 | 0 | } |
702 | 46 | } |
703 | 23 | return true; |
704 | 23 | } |
705 | | |
706 | | bool LoopInterchangeLegality::findInductionAndReductions( |
707 | | Loop *L, SmallVector<PHINode *, 8> &Inductions, |
708 | 49 | SmallVector<PHINode *, 8> &Reductions) { |
709 | 49 | if (!L->getLoopLatch() || 49 !L->getLoopPredecessor()49 ) |
710 | 0 | return false; |
711 | 104 | for (BasicBlock::iterator I = L->getHeader()->begin(); 49 isa<PHINode>(I)104 ; ++I55 ) { |
712 | 56 | RecurrenceDescriptor RD; |
713 | 56 | InductionDescriptor ID; |
714 | 56 | PHINode *PHI = cast<PHINode>(I); |
715 | 56 | if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID)) |
716 | 49 | Inductions.push_back(PHI); |
717 | 7 | else if (7 RecurrenceDescriptor::isReductionPHI(PHI, L, RD)7 ) |
718 | 6 | Reductions.push_back(PHI); |
719 | 1 | else { |
720 | 1 | DEBUG( |
721 | 7 | dbgs() << "Failed to recognize PHI as an induction or reduction.\n"); |
722 | 7 | return false; |
723 | 7 | } |
724 | 56 | } |
725 | 48 | return true; |
726 | 49 | } |
727 | | |
728 | 46 | static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { |
729 | 49 | for (auto I = Block->begin(); isa<PHINode>(I)49 ; ++I3 ) { |
730 | 3 | PHINode *PHI = cast<PHINode>(I); |
731 | 3 | // Reduction lcssa phi will have only 1 incoming block that from loop latch. |
732 | 3 | if (PHI->getNumIncomingValues() > 1) |
733 | 0 | return false; |
734 | 3 | Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0)); |
735 | 3 | if (!Ins) |
736 | 0 | return false; |
737 | 3 | // Incoming value for lcssa phi's in outer loop exit can only be inner loop |
738 | 3 | // exits lcssa phi else it would not be tightly nested. |
739 | 3 | if (3 !isa<PHINode>(Ins) && 3 isOuterLoopExitBlock3 ) |
740 | 0 | return false; |
741 | 3 | } |
742 | 46 | return true; |
743 | 46 | } |
744 | | |
745 | | static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock, |
746 | 46 | BasicBlock *LoopHeader) { |
747 | 46 | if (BranchInst *BI46 = dyn_cast<BranchInst>(LatchBlock->getTerminator())) { |
748 | 46 | assert(BI->getNumSuccessors() == 2 && |
749 | 46 | "Branch leaving loop latch must have 2 successors"); |
750 | 59 | for (BasicBlock *Succ : BI->successors()) { |
751 | 59 | if (Succ == LoopHeader) |
752 | 13 | continue; |
753 | 46 | return Succ; |
754 | 46 | } |
755 | 46 | } |
756 | 0 | return nullptr; |
757 | 0 | } |
758 | | |
759 | | // This function indicates the current limitations in the transform as a result |
760 | | // of which we do not proceed. |
761 | 25 | bool LoopInterchangeLegality::currentLimitations() { |
762 | 25 | |
763 | 25 | BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); |
764 | 25 | BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); |
765 | 25 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); |
766 | 25 | BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); |
767 | 25 | BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); |
768 | 25 | |
769 | 25 | PHINode *InnerInductionVar; |
770 | 25 | SmallVector<PHINode *, 8> Inductions; |
771 | 25 | SmallVector<PHINode *, 8> Reductions; |
772 | 25 | if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)25 ) { |
773 | 1 | DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes " |
774 | 1 | << "are supported currently.\n"); |
775 | 1 | ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, |
776 | 1 | "UnsupportedPHIInner", |
777 | 1 | InnerLoop->getStartLoc(), |
778 | 1 | InnerLoop->getHeader()) |
779 | 1 | << "Only inner loops with induction or reduction PHI nodes can be" |
780 | 1 | " interchange currently."); |
781 | 1 | return true; |
782 | 1 | } |
783 | 24 | |
784 | 24 | // TODO: Currently we handle only loops with 1 induction variable. |
785 | 24 | if (24 Inductions.size() != 124 ) { |
786 | 0 | DEBUG(dbgs() << "We currently only support loops with 1 induction variable." |
787 | 0 | << "Failed to interchange due to current limitation\n"); |
788 | 0 | ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, |
789 | 0 | "MultiInductionInner", |
790 | 0 | InnerLoop->getStartLoc(), |
791 | 0 | InnerLoop->getHeader()) |
792 | 0 | << "Only inner loops with 1 induction variable can be " |
793 | 0 | "interchanged currently."); |
794 | 0 | return true; |
795 | 0 | } |
796 | 24 | if (24 Reductions.size() > 024 ) |
797 | 3 | InnerLoopHasReduction = true; |
798 | 24 | |
799 | 24 | InnerInductionVar = Inductions.pop_back_val(); |
800 | 24 | Reductions.clear(); |
801 | 24 | if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)24 ) { |
802 | 0 | DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes " |
803 | 0 | << "are supported currently.\n"); |
804 | 0 | ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, |
805 | 0 | "UnsupportedPHIOuter", |
806 | 0 | OuterLoop->getStartLoc(), |
807 | 0 | OuterLoop->getHeader()) |
808 | 0 | << "Only outer loops with induction or reduction PHI nodes can be" |
809 | 0 | " interchanged currently."); |
810 | 0 | return true; |
811 | 0 | } |
812 | 24 | |
813 | 24 | // Outer loop cannot have reduction because then loops will not be tightly |
814 | 24 | // nested. |
815 | 24 | if (24 !Reductions.empty()24 ) { |
816 | 1 | DEBUG(dbgs() << "Outer loops with reductions are not supported " |
817 | 1 | << "currently.\n"); |
818 | 1 | ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, |
819 | 1 | "ReductionsOuter", |
820 | 1 | OuterLoop->getStartLoc(), |
821 | 1 | OuterLoop->getHeader()) |
822 | 1 | << "Outer loops with reductions cannot be interchangeed " |
823 | 1 | "currently."); |
824 | 1 | return true; |
825 | 1 | } |
826 | 23 | // TODO: Currently we handle only loops with 1 induction variable. |
827 | 23 | if (23 Inductions.size() != 123 ) { |
828 | 0 | DEBUG(dbgs() << "Loops with more than 1 induction variables are not " |
829 | 0 | << "supported currently.\n"); |
830 | 0 | ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, |
831 | 0 | "MultiIndutionOuter", |
832 | 0 | OuterLoop->getStartLoc(), |
833 | 0 | OuterLoop->getHeader()) |
834 | 0 | << "Only outer loops with 1 induction variable can be " |
835 | 0 | "interchanged currently."); |
836 | 0 | return true; |
837 | 0 | } |
838 | 23 | |
839 | 23 | // TODO: Triangular loops are not handled for now. |
840 | 23 | if (23 !isLoopStructureUnderstood(InnerInductionVar)23 ) { |
841 | 0 | DEBUG(dbgs() << "Loop structure not understood by pass\n"); |
842 | 0 | ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, |
843 | 0 | "UnsupportedStructureInner", |
844 | 0 | InnerLoop->getStartLoc(), |
845 | 0 | InnerLoop->getHeader()) |
846 | 0 | << "Inner loop structure not understood currently."); |
847 | 0 | return true; |
848 | 0 | } |
849 | 23 | |
850 | 23 | // TODO: We only handle LCSSA PHI's corresponding to reduction for now. |
851 | 23 | BasicBlock *LoopExitBlock = |
852 | 23 | getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader); |
853 | 23 | if (!LoopExitBlock || 23 !containsSafePHI(LoopExitBlock, true)23 ) { |
854 | 0 | DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n"); |
855 | 0 | ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, |
856 | 0 | "NoLCSSAPHIOuter", |
857 | 0 | OuterLoop->getStartLoc(), |
858 | 0 | OuterLoop->getHeader()) |
859 | 0 | << "Only outer loops with LCSSA PHIs can be interchange " |
860 | 0 | "currently."); |
861 | 0 | return true; |
862 | 0 | } |
863 | 23 | |
864 | 23 | LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader); |
865 | 23 | if (!LoopExitBlock || 23 !containsSafePHI(LoopExitBlock, false)23 ) { |
866 | 0 | DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); |
867 | 0 | ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, |
868 | 0 | "NoLCSSAPHIOuterInner", |
869 | 0 | InnerLoop->getStartLoc(), |
870 | 0 | InnerLoop->getHeader()) |
871 | 0 | << "Only inner loops with LCSSA PHIs can be interchange " |
872 | 0 | "currently."); |
873 | 0 | return true; |
874 | 0 | } |
875 | 23 | |
876 | 23 | // TODO: Current limitation: Since we split the inner loop latch at the point |
877 | 23 | // were induction variable is incremented (induction.next); We cannot have |
878 | 23 | // more than 1 user of induction.next since it would result in broken code |
879 | 23 | // after split. |
880 | 23 | // e.g. |
881 | 23 | // for(i=0;i<N;i++) { |
882 | 23 | // for(j = 0;j<M;j++) { |
883 | 23 | // A[j+1][i+2] = A[j][i]+k; |
884 | 23 | // } |
885 | 23 | // } |
886 | 23 | Instruction *InnerIndexVarInc = nullptr; |
887 | 23 | if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader) |
888 | 0 | InnerIndexVarInc = |
889 | 0 | dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1)); |
890 | 23 | else |
891 | 23 | InnerIndexVarInc = |
892 | 23 | dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0)); |
893 | 23 | |
894 | 23 | if (!InnerIndexVarInc23 ) { |
895 | 0 | DEBUG(dbgs() << "Did not find an instruction to increment the induction " |
896 | 0 | << "variable.\n"); |
897 | 0 | ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, |
898 | 0 | "NoIncrementInInner", |
899 | 0 | InnerLoop->getStartLoc(), |
900 | 0 | InnerLoop->getHeader()) |
901 | 0 | << "The inner loop does not increment the induction variable."); |
902 | 0 | return true; |
903 | 0 | } |
904 | 23 | |
905 | 23 | // Since we split the inner loop latch on this induction variable. Make sure |
906 | 23 | // we do not have any instruction between the induction variable and branch |
907 | 23 | // instruction. |
908 | 23 | |
909 | 23 | bool FoundInduction = false; |
910 | 84 | for (const Instruction &I : reverse(*InnerLoopLatch)) { |
911 | 84 | if (isa<BranchInst>(I) || 84 isa<CmpInst>(I)61 || isa<TruncInst>(I)38 || |
912 | 24 | isa<ZExtInst>(I)) |
913 | 61 | continue; |
914 | 23 | |
915 | 23 | // We found an instruction. If this is not induction variable then it is not |
916 | 23 | // safe to split this loop latch. |
917 | 23 | if (23 !I.isIdenticalTo(InnerIndexVarInc)23 ) { |
918 | 2 | DEBUG(dbgs() << "Found unsupported instructions between induction " |
919 | 2 | << "variable increment and branch.\n"); |
920 | 2 | ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, |
921 | 2 | "UnsupportedInsBetweenInduction", |
922 | 2 | InnerLoop->getStartLoc(), |
923 | 2 | InnerLoop->getHeader()) |
924 | 2 | << "Found unsupported instruction between induction variable " |
925 | 2 | "increment and branch."); |
926 | 2 | return true; |
927 | 2 | } |
928 | 21 | |
929 | 21 | FoundInduction = true; |
930 | 21 | break; |
931 | 21 | } |
932 | 21 | // The loop latch ended and we didn't find the induction variable return as |
933 | 21 | // current limitation. |
934 | 21 | if (21 !FoundInduction21 ) { |
935 | 0 | DEBUG(dbgs() << "Did not find the induction variable.\n"); |
936 | 0 | ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, |
937 | 0 | "NoIndutionVariable", |
938 | 0 | InnerLoop->getStartLoc(), |
939 | 0 | InnerLoop->getHeader()) |
940 | 0 | << "Did not find the induction variable."); |
941 | 0 | return true; |
942 | 0 | } |
943 | 21 | return false; |
944 | 21 | } |
945 | | |
946 | | bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, |
947 | | unsigned OuterLoopId, |
948 | 27 | CharMatrix &DepMatrix) { |
949 | 27 | |
950 | 27 | if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)27 ) { |
951 | 1 | DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId |
952 | 1 | << " and OuterLoopId = " << OuterLoopId |
953 | 1 | << " due to dependence\n"); |
954 | 1 | ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, |
955 | 1 | "Dependence", |
956 | 1 | InnerLoop->getStartLoc(), |
957 | 1 | InnerLoop->getHeader()) |
958 | 1 | << "Cannot interchange loops due to dependences."); |
959 | 1 | return false; |
960 | 1 | } |
961 | 26 | |
962 | 26 | // Check if outer and inner loop contain legal instructions only. |
963 | 26 | for (auto *BB : OuterLoop->blocks()) |
964 | 104 | for (Instruction &I : *BB) |
965 | 470 | if (CallInst *470 CI470 = dyn_cast<CallInst>(&I)) { |
966 | 5 | // readnone functions do not prevent interchanging. |
967 | 5 | if (CI->doesNotReadMemory()) |
968 | 4 | continue; |
969 | 1 | DEBUG1 (dbgs() << "Loops with call instructions cannot be interchanged " |
970 | 1 | << "safely."); |
971 | 1 | return false; |
972 | 1 | } |
973 | 25 | |
974 | 25 | // Create unique Preheaders if we already do not have one. |
975 | 25 | BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); |
976 | 25 | BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); |
977 | 25 | |
978 | 25 | // Create a unique outer preheader - |
979 | 25 | // 1) If OuterLoop preheader is not present. |
980 | 25 | // 2) If OuterLoop Preheader is same as OuterLoop Header |
981 | 25 | // 3) If OuterLoop Preheader is same as Header of the previous loop. |
982 | 25 | // 4) If OuterLoop Preheader is Entry node. |
983 | 25 | if (!OuterLoopPreHeader || 25 OuterLoopPreHeader == OuterLoop->getHeader()25 || |
984 | 25 | isa<PHINode>(OuterLoopPreHeader->begin()) || |
985 | 25 | !OuterLoopPreHeader->getUniquePredecessor()20 ) { |
986 | 12 | OuterLoopPreHeader = |
987 | 12 | InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA); |
988 | 12 | } |
989 | 25 | |
990 | 25 | if (!InnerLoopPreHeader || 25 InnerLoopPreHeader == InnerLoop->getHeader()25 || |
991 | 25 | InnerLoopPreHeader == OuterLoop->getHeader()25 ) { |
992 | 18 | InnerLoopPreHeader = |
993 | 18 | InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA); |
994 | 18 | } |
995 | 25 | |
996 | 25 | // TODO: The loops could not be interchanged due to current limitations in the |
997 | 25 | // transform module. |
998 | 25 | if (currentLimitations()25 ) { |
999 | 4 | DEBUG(dbgs() << "Not legal because of current transform limitation\n"); |
1000 | 4 | return false; |
1001 | 4 | } |
1002 | 21 | |
1003 | 21 | // Check if the loops are tightly nested. |
1004 | 21 | if (21 !tightlyNested(OuterLoop, InnerLoop)21 ) { |
1005 | 4 | DEBUG(dbgs() << "Loops not tightly nested\n"); |
1006 | 4 | ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, |
1007 | 4 | "NotTightlyNested", |
1008 | 4 | InnerLoop->getStartLoc(), |
1009 | 4 | InnerLoop->getHeader()) |
1010 | 4 | << "Cannot interchange loops because they are not tightly " |
1011 | 4 | "nested."); |
1012 | 4 | return false; |
1013 | 4 | } |
1014 | 17 | |
1015 | 17 | return true; |
1016 | 17 | } |
1017 | | |
1018 | 17 | int LoopInterchangeProfitability::getInstrOrderCost() { |
1019 | 17 | unsigned GoodOrder, BadOrder; |
1020 | 17 | BadOrder = GoodOrder = 0; |
1021 | 21 | for (BasicBlock *BB : InnerLoop->blocks()) { |
1022 | 177 | for (Instruction &Ins : *BB) { |
1023 | 177 | if (const GetElementPtrInst *GEP177 = dyn_cast<GetElementPtrInst>(&Ins)) { |
1024 | 27 | unsigned NumOp = GEP->getNumOperands(); |
1025 | 27 | bool FoundInnerInduction = false; |
1026 | 27 | bool FoundOuterInduction = false; |
1027 | 111 | for (unsigned i = 0; i < NumOp111 ; ++i84 ) { |
1028 | 109 | const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i)); |
1029 | 109 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal); |
1030 | 109 | if (!AR) |
1031 | 54 | continue; |
1032 | 55 | |
1033 | 55 | // If we find the inner induction after an outer induction e.g. |
1034 | 55 | // for(int i=0;i<N;i++) |
1035 | 55 | // for(int j=0;j<N;j++) |
1036 | 55 | // A[i][j] = A[i-1][j-1]+k; |
1037 | 55 | // then it is a good order. |
1038 | 55 | if (55 AR->getLoop() == InnerLoop55 ) { |
1039 | 27 | // We found an InnerLoop induction after OuterLoop induction. It is |
1040 | 27 | // a good order. |
1041 | 27 | FoundInnerInduction = true; |
1042 | 27 | if (FoundOuterInduction27 ) { |
1043 | 6 | GoodOrder++; |
1044 | 6 | break; |
1045 | 6 | } |
1046 | 49 | } |
1047 | 49 | // If we find the outer induction after an inner induction e.g. |
1048 | 49 | // for(int i=0;i<N;i++) |
1049 | 49 | // for(int j=0;j<N;j++) |
1050 | 49 | // A[j][i] = A[j-1][i-1]+k; |
1051 | 49 | // then it is a bad order. |
1052 | 49 | if (49 AR->getLoop() == OuterLoop49 ) { |
1053 | 25 | // We found an OuterLoop induction after InnerLoop induction. It is |
1054 | 25 | // a bad order. |
1055 | 25 | FoundOuterInduction = true; |
1056 | 25 | if (FoundInnerInduction25 ) { |
1057 | 19 | BadOrder++; |
1058 | 19 | break; |
1059 | 19 | } |
1060 | 25 | } |
1061 | 109 | } |
1062 | 27 | } |
1063 | 177 | } |
1064 | 21 | } |
1065 | 17 | return GoodOrder - BadOrder; |
1066 | 17 | } |
1067 | | |
1068 | | static bool isProfitableForVectorization(unsigned InnerLoopId, |
1069 | | unsigned OuterLoopId, |
1070 | 4 | CharMatrix &DepMatrix) { |
1071 | 4 | // TODO: Improve this heuristic to catch more cases. |
1072 | 4 | // If the inner loop is loop independent or doesn't carry any dependency it is |
1073 | 4 | // profitable to move this to outer position. |
1074 | 4 | for (auto &Row : DepMatrix) { |
1075 | 4 | if (Row[InnerLoopId] != 'S' && 4 Row[InnerLoopId] != 'I'4 ) |
1076 | 4 | return false; |
1077 | 0 | // TODO: We need to improve this heuristic. |
1078 | 0 | if (0 Row[OuterLoopId] != '='0 ) |
1079 | 0 | return false; |
1080 | 0 | } |
1081 | 0 | // If outer loop has dependence and inner loop is loop independent then it is |
1082 | 0 | // profitable to interchange to enable parallelism. |
1083 | 0 | return true; |
1084 | 0 | } |
1085 | | |
1086 | | bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, |
1087 | | unsigned OuterLoopId, |
1088 | 17 | CharMatrix &DepMatrix) { |
1089 | 17 | |
1090 | 17 | // TODO: Add better profitability checks. |
1091 | 17 | // e.g |
1092 | 17 | // 1) Construct dependency matrix and move the one with no loop carried dep |
1093 | 17 | // inside to enable vectorization. |
1094 | 17 | |
1095 | 17 | // This is rough cost estimation algorithm. It counts the good and bad order |
1096 | 17 | // of induction variables in the instruction and allows reordering if number |
1097 | 17 | // of bad orders is more than good. |
1098 | 17 | int Cost = getInstrOrderCost(); |
1099 | 17 | DEBUG(dbgs() << "Cost = " << Cost << "\n"); |
1100 | 17 | if (Cost < -LoopInterchangeCostThreshold) |
1101 | 13 | return true; |
1102 | 4 | |
1103 | 4 | // It is not profitable as per current cache profitability model. But check if |
1104 | 4 | // we can move this loop outside to improve parallelism. |
1105 | 4 | if (4 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix)4 ) |
1106 | 0 | return true; |
1107 | 4 | |
1108 | 4 | ORE->emit(OptimizationRemarkMissed(4 DEBUG_TYPE4 , |
1109 | 17 | "InterchangeNotProfitable", |
1110 | 17 | InnerLoop->getStartLoc(), |
1111 | 17 | InnerLoop->getHeader()) |
1112 | 17 | << "Interchanging loops is too costly (cost=" |
1113 | 17 | << ore::NV("Cost", Cost) << ", threshold=" |
1114 | 17 | << ore::NV("Threshold", LoopInterchangeCostThreshold) << |
1115 | 17 | ") and it does not improve parallelism."); |
1116 | 17 | return false; |
1117 | 17 | } |
1118 | | |
1119 | | void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, |
1120 | 16 | Loop *InnerLoop) { |
1121 | 16 | for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E; |
1122 | 16 | ++I0 ) { |
1123 | 16 | if (*I == InnerLoop16 ) { |
1124 | 16 | OuterLoop->removeChildLoop(I); |
1125 | 16 | return; |
1126 | 16 | } |
1127 | 16 | } |
1128 | 0 | llvm_unreachable0 ("Couldn't find loop"); |
1129 | 0 | } |
1130 | | |
1131 | | void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop, |
1132 | 13 | Loop *OuterLoop) { |
1133 | 13 | Loop *OuterLoopParent = OuterLoop->getParentLoop(); |
1134 | 13 | if (OuterLoopParent13 ) { |
1135 | 3 | // Remove the loop from its parent loop. |
1136 | 3 | removeChildLoop(OuterLoopParent, OuterLoop); |
1137 | 3 | removeChildLoop(OuterLoop, InnerLoop); |
1138 | 3 | OuterLoopParent->addChildLoop(InnerLoop); |
1139 | 13 | } else { |
1140 | 10 | removeChildLoop(OuterLoop, InnerLoop); |
1141 | 10 | LI->changeTopLevelLoop(OuterLoop, InnerLoop); |
1142 | 10 | } |
1143 | 13 | |
1144 | 14 | while (!InnerLoop->empty()) |
1145 | 1 | OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin())); |
1146 | 13 | |
1147 | 13 | InnerLoop->addChildLoop(OuterLoop); |
1148 | 13 | } |
1149 | | |
1150 | 13 | bool LoopInterchangeTransform::transform() { |
1151 | 13 | bool Transformed = false; |
1152 | 13 | Instruction *InnerIndexVar; |
1153 | 13 | |
1154 | 13 | if (InnerLoop->getSubLoops().size() == 013 ) { |
1155 | 12 | BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); |
1156 | 12 | DEBUG(dbgs() << "Calling Split Inner Loop\n"); |
1157 | 12 | PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); |
1158 | 12 | if (!InductionPHI12 ) { |
1159 | 0 | DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); |
1160 | 0 | return false; |
1161 | 0 | } |
1162 | 12 | |
1163 | 12 | if (12 InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader12 ) |
1164 | 0 | InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1)); |
1165 | 12 | else |
1166 | 12 | InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0)); |
1167 | 12 | |
1168 | 12 | // |
1169 | 12 | // Split at the place were the induction variable is |
1170 | 12 | // incremented/decremented. |
1171 | 12 | // TODO: This splitting logic may not work always. Fix this. |
1172 | 12 | splitInnerLoopLatch(InnerIndexVar); |
1173 | 12 | DEBUG(dbgs() << "splitInnerLoopLatch done\n"); |
1174 | 12 | |
1175 | 12 | // Splits the inner loops phi nodes out into a separate basic block. |
1176 | 12 | splitInnerLoopHeader(); |
1177 | 12 | DEBUG(dbgs() << "splitInnerLoopHeader done\n"); |
1178 | 12 | } |
1179 | 13 | |
1180 | 13 | Transformed |= adjustLoopLinks(); |
1181 | 13 | if (!Transformed13 ) { |
1182 | 0 | DEBUG(dbgs() << "adjustLoopLinks failed\n"); |
1183 | 0 | return false; |
1184 | 0 | } |
1185 | 13 | |
1186 | 13 | restructureLoops(InnerLoop, OuterLoop); |
1187 | 13 | return true; |
1188 | 13 | } |
1189 | | |
1190 | 12 | void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) { |
1191 | 12 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); |
1192 | 12 | BasicBlock *InnerLoopLatchPred = InnerLoopLatch; |
1193 | 12 | InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI); |
1194 | 12 | } |
1195 | | |
1196 | 12 | void LoopInterchangeTransform::splitInnerLoopHeader() { |
1197 | 12 | |
1198 | 12 | // Split the inner loop header out. Here make sure that the reduction PHI's |
1199 | 12 | // stay in the innerloop body. |
1200 | 12 | BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); |
1201 | 12 | BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); |
1202 | 12 | if (InnerLoopHasReduction12 ) { |
1203 | 2 | // FIXME: Check if the induction PHI will always be the first PHI. |
1204 | 2 | BasicBlock *New = InnerLoopHeader->splitBasicBlock( |
1205 | 2 | ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split"); |
1206 | 2 | if (LI) |
1207 | 2 | if (Loop *2 L2 = LI->getLoopFor(InnerLoopHeader)) |
1208 | 2 | L->addBasicBlockToLoop(New, *LI); |
1209 | 2 | |
1210 | 2 | // Adjust Reduction PHI's in the block. |
1211 | 2 | SmallVector<PHINode *, 8> PHIVec; |
1212 | 5 | for (auto I = New->begin(); isa<PHINode>(I)5 ; ++I3 ) { |
1213 | 3 | PHINode *PHI = dyn_cast<PHINode>(I); |
1214 | 3 | Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader); |
1215 | 3 | PHI->replaceAllUsesWith(V); |
1216 | 3 | PHIVec.push_back((PHI)); |
1217 | 3 | } |
1218 | 3 | for (PHINode *P : PHIVec) { |
1219 | 3 | P->eraseFromParent(); |
1220 | 3 | } |
1221 | 12 | } else { |
1222 | 10 | SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); |
1223 | 10 | } |
1224 | 12 | |
1225 | 12 | DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & " |
1226 | 12 | "InnerLoopHeader\n"); |
1227 | 12 | } |
1228 | | |
1229 | | /// \brief Move all instructions except the terminator from FromBB right before |
1230 | | /// InsertBefore |
1231 | 26 | static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { |
1232 | 26 | auto &ToList = InsertBefore->getParent()->getInstList(); |
1233 | 26 | auto &FromList = FromBB->getInstList(); |
1234 | 26 | |
1235 | 26 | ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(), |
1236 | 26 | FromBB->getTerminator()->getIterator()); |
1237 | 26 | } |
1238 | | |
1239 | | void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock, |
1240 | | BasicBlock *OldPred, |
1241 | 26 | BasicBlock *NewPred) { |
1242 | 26 | for (auto I = CurrBlock->begin(); isa<PHINode>(I)26 ; ++I0 ) { |
1243 | 0 | PHINode *PHI = cast<PHINode>(I); |
1244 | 0 | unsigned Num = PHI->getNumIncomingValues(); |
1245 | 0 | for (unsigned i = 0; i < Num0 ; ++i0 ) { |
1246 | 0 | if (PHI->getIncomingBlock(i) == OldPred) |
1247 | 0 | PHI->setIncomingBlock(i, NewPred); |
1248 | 0 | } |
1249 | 0 | } |
1250 | 26 | } |
1251 | | |
1252 | 13 | bool LoopInterchangeTransform::adjustLoopBranches() { |
1253 | 13 | |
1254 | 13 | DEBUG(dbgs() << "adjustLoopBranches called\n"); |
1255 | 13 | // Adjust the loop preheader |
1256 | 13 | BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); |
1257 | 13 | BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); |
1258 | 13 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); |
1259 | 13 | BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); |
1260 | 13 | BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); |
1261 | 13 | BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); |
1262 | 13 | BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor(); |
1263 | 13 | BasicBlock *InnerLoopLatchPredecessor = |
1264 | 13 | InnerLoopLatch->getUniquePredecessor(); |
1265 | 13 | BasicBlock *InnerLoopLatchSuccessor; |
1266 | 13 | BasicBlock *OuterLoopLatchSuccessor; |
1267 | 13 | |
1268 | 13 | BranchInst *OuterLoopLatchBI = |
1269 | 13 | dyn_cast<BranchInst>(OuterLoopLatch->getTerminator()); |
1270 | 13 | BranchInst *InnerLoopLatchBI = |
1271 | 13 | dyn_cast<BranchInst>(InnerLoopLatch->getTerminator()); |
1272 | 13 | BranchInst *OuterLoopHeaderBI = |
1273 | 13 | dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); |
1274 | 13 | BranchInst *InnerLoopHeaderBI = |
1275 | 13 | dyn_cast<BranchInst>(InnerLoopHeader->getTerminator()); |
1276 | 13 | |
1277 | 13 | if (!OuterLoopPredecessor || 13 !InnerLoopLatchPredecessor13 || |
1278 | 13 | !OuterLoopLatchBI13 || !InnerLoopLatchBI13 || !OuterLoopHeaderBI13 || |
1279 | 13 | !InnerLoopHeaderBI) |
1280 | 0 | return false; |
1281 | 13 | |
1282 | 13 | BranchInst *InnerLoopLatchPredecessorBI = |
1283 | 13 | dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator()); |
1284 | 13 | BranchInst *OuterLoopPredecessorBI = |
1285 | 13 | dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator()); |
1286 | 13 | |
1287 | 13 | if (!OuterLoopPredecessorBI || 13 !InnerLoopLatchPredecessorBI13 ) |
1288 | 0 | return false; |
1289 | 13 | BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor(); |
1290 | 13 | if (!InnerLoopHeaderSuccessor) |
1291 | 0 | return false; |
1292 | 13 | |
1293 | 13 | // Adjust Loop Preheader and headers |
1294 | 13 | |
1295 | 13 | unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors(); |
1296 | 32 | for (unsigned i = 0; i < NumSucc32 ; ++i19 ) { |
1297 | 19 | if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader) |
1298 | 13 | OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader); |
1299 | 19 | } |
1300 | 13 | |
1301 | 13 | NumSucc = OuterLoopHeaderBI->getNumSuccessors(); |
1302 | 28 | for (unsigned i = 0; i < NumSucc28 ; ++i15 ) { |
1303 | 15 | if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch) |
1304 | 2 | OuterLoopHeaderBI->setSuccessor(i, LoopExit); |
1305 | 13 | else if (13 OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader13 ) |
1306 | 13 | OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor); |
1307 | 15 | } |
1308 | 13 | |
1309 | 13 | // Adjust reduction PHI's now that the incoming block has changed. |
1310 | 13 | updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader, |
1311 | 13 | OuterLoopHeader); |
1312 | 13 | |
1313 | 13 | BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI); |
1314 | 13 | InnerLoopHeaderBI->eraseFromParent(); |
1315 | 13 | |
1316 | 13 | // -------------Adjust loop latches----------- |
1317 | 13 | if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader) |
1318 | 5 | InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1); |
1319 | 13 | else |
1320 | 8 | InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0); |
1321 | 13 | |
1322 | 13 | NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors(); |
1323 | 27 | for (unsigned i = 0; i < NumSucc27 ; ++i14 ) { |
1324 | 14 | if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch) |
1325 | 13 | InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor); |
1326 | 14 | } |
1327 | 13 | |
1328 | 13 | // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with |
1329 | 13 | // the value and remove this PHI node from inner loop. |
1330 | 13 | SmallVector<PHINode *, 8> LcssaVec; |
1331 | 16 | for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I)16 ; ++I3 ) { |
1332 | 3 | PHINode *LcssaPhi = cast<PHINode>(I); |
1333 | 3 | LcssaVec.push_back(LcssaPhi); |
1334 | 3 | } |
1335 | 3 | for (PHINode *P : LcssaVec) { |
1336 | 3 | Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch); |
1337 | 3 | P->replaceAllUsesWith(Incoming); |
1338 | 3 | P->eraseFromParent(); |
1339 | 3 | } |
1340 | 13 | |
1341 | 13 | if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader) |
1342 | 4 | OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1); |
1343 | 13 | else |
1344 | 9 | OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0); |
1345 | 13 | |
1346 | 13 | if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor) |
1347 | 5 | InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor); |
1348 | 13 | else |
1349 | 8 | InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor); |
1350 | 13 | |
1351 | 13 | updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch); |
1352 | 13 | |
1353 | 13 | if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor13 ) { |
1354 | 9 | OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch); |
1355 | 13 | } else { |
1356 | 4 | OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch); |
1357 | 4 | } |
1358 | 13 | |
1359 | 13 | return true; |
1360 | 13 | } |
1361 | 13 | void LoopInterchangeTransform::adjustLoopPreheaders() { |
1362 | 13 | |
1363 | 13 | // We have interchanged the preheaders so we need to interchange the data in |
1364 | 13 | // the preheader as well. |
1365 | 13 | // This is because the content of inner preheader was previously executed |
1366 | 13 | // inside the outer loop. |
1367 | 13 | BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); |
1368 | 13 | BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); |
1369 | 13 | BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); |
1370 | 13 | BranchInst *InnerTermBI = |
1371 | 13 | cast<BranchInst>(InnerLoopPreHeader->getTerminator()); |
1372 | 13 | |
1373 | 13 | // These instructions should now be executed inside the loop. |
1374 | 13 | // Move instruction into a new block after outer header. |
1375 | 13 | moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator()); |
1376 | 13 | // These instructions were not executed previously in the loop so move them to |
1377 | 13 | // the older inner loop preheader. |
1378 | 13 | moveBBContents(OuterLoopPreHeader, InnerTermBI); |
1379 | 13 | } |
1380 | | |
1381 | 13 | bool LoopInterchangeTransform::adjustLoopLinks() { |
1382 | 13 | |
1383 | 13 | // Adjust all branches in the inner and outer loop. |
1384 | 13 | bool Changed = adjustLoopBranches(); |
1385 | 13 | if (Changed) |
1386 | 13 | adjustLoopPreheaders(); |
1387 | 13 | return Changed; |
1388 | 13 | } |
1389 | | |
1390 | | char LoopInterchange::ID = 0; |
1391 | 24.6k | INITIALIZE_PASS_BEGIN24.6k (LoopInterchange, "loop-interchange",
|
1392 | 24.6k | "Interchanges loops for cache reuse", false, false) |
1393 | 24.6k | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) |
1394 | 24.6k | INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) |
1395 | 24.6k | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) |
1396 | 24.6k | INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) |
1397 | 24.6k | INITIALIZE_PASS_DEPENDENCY(LoopSimplify) |
1398 | 24.6k | INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) |
1399 | 24.6k | INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) |
1400 | 24.6k | INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) |
1401 | 24.6k | |
1402 | 24.6k | INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", |
1403 | | "Interchanges loops for cache reuse", false, false) |
1404 | | |
1405 | 0 | Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); } |