Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- LoopInterchange.cpp - Loop interchange pass------------------------===//
2
//
3
//                     The LLVM Compiler Infrastructure
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
// This Pass handles loop interchange transform.
11
// This pass interchanges loops to provide a more cache-friendly memory access
12
// patterns.
13
//
14
//===----------------------------------------------------------------------===//
15
16
#include "llvm/ADT/SmallVector.h"
17
#include "llvm/Analysis/AliasAnalysis.h"
18
#include "llvm/Analysis/AssumptionCache.h"
19
#include "llvm/Analysis/BlockFrequencyInfo.h"
20
#include "llvm/Analysis/CodeMetrics.h"
21
#include "llvm/Analysis/DependenceAnalysis.h"
22
#include "llvm/Analysis/LoopInfo.h"
23
#include "llvm/Analysis/LoopIterator.h"
24
#include "llvm/Analysis/LoopPass.h"
25
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
26
#include "llvm/Analysis/ScalarEvolution.h"
27
#include "llvm/Analysis/ScalarEvolutionExpander.h"
28
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
29
#include "llvm/Analysis/TargetTransformInfo.h"
30
#include "llvm/Analysis/ValueTracking.h"
31
#include "llvm/IR/Dominators.h"
32
#include "llvm/IR/Function.h"
33
#include "llvm/IR/IRBuilder.h"
34
#include "llvm/IR/InstIterator.h"
35
#include "llvm/IR/IntrinsicInst.h"
36
#include "llvm/IR/Module.h"
37
#include "llvm/Pass.h"
38
#include "llvm/Support/Debug.h"
39
#include "llvm/Support/raw_ostream.h"
40
#include "llvm/Transforms/Scalar.h"
41
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
42
#include "llvm/Transforms/Utils/LoopUtils.h"
43
44
using namespace llvm;
45
46
26
#define DEBUG_TYPE "loop-interchange"
47
48
static cl::opt<int> LoopInterchangeCostThreshold(
49
    "loop-interchange-threshold", cl::init(0), cl::Hidden,
50
    cl::desc("Interchange if you gain more than this number"));
51
52
namespace {
53
54
typedef SmallVector<Loop *, 8> LoopVector;
55
56
// TODO: Check if we can use a sparse matrix here.
57
typedef std::vector<std::vector<char>> CharMatrix;
58
59
// Maximum number of dependencies that can be handled in the dependency matrix.
60
static const unsigned MaxMemInstrCount = 100;
61
62
// Maximum loop depth supported.
63
static const unsigned MaxLoopNestDepth = 10;
64
65
struct LoopInterchange;
66
67
#ifdef DUMP_DEP_MATRICIES
68
void printDepMatrix(CharMatrix &DepMatrix) {
69
  for (auto &Row : DepMatrix) {
70
    for (auto D : Row)
71
      DEBUG(dbgs() << D << " ");
72
    DEBUG(dbgs() << "\n");
73
  }
74
}
75
#endif
76
77
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
78
25
                                     Loop *L, DependenceInfo *DI) {
79
25
  typedef SmallVector<Value *, 16> ValueVector;
80
25
  ValueVector MemInstr;
81
25
82
25
  // For each block.
83
97
  for (BasicBlock *BB : L->blocks()) {
84
97
    // Scan the BB and collect legal loads and stores.
85
472
    for (Instruction &I : *BB) {
86
472
      if (!isa<Instruction>(I))
87
0
        return false;
88
472
      
if (auto *472
Ld472
= dyn_cast<LoadInst>(&I)) {
89
37
        if (!Ld->isSimple())
90
0
          return false;
91
37
        MemInstr.push_back(&I);
92
472
      } else 
if (auto *435
St435
= dyn_cast<StoreInst>(&I)) {
93
34
        if (!St->isSimple())
94
0
          return false;
95
34
        MemInstr.push_back(&I);
96
34
      }
97
472
    }
98
97
  }
99
25
100
25
  
DEBUG25
(dbgs() << "Found " << MemInstr.size()
101
25
               << " Loads and Stores to analyze\n");
102
25
103
25
  ValueVector::iterator I, IE, J, JE;
104
25
105
96
  for (I = MemInstr.begin(), IE = MemInstr.end(); 
I != IE96
;
++I71
) {
106
225
    for (J = I, JE = MemInstr.end(); 
J != JE225
;
++J154
) {
107
154
      std::vector<char> Dep;
108
154
      Instruction *Src = cast<Instruction>(*I);
109
154
      Instruction *Dst = cast<Instruction>(*J);
110
154
      if (Src == Dst)
111
71
        continue;
112
83
      // Ignore Input dependencies.
113
83
      
if (83
isa<LoadInst>(Src) && 83
isa<LoadInst>(Dst)68
)
114
22
        continue;
115
61
      // Track Output, Flow, and Anti dependencies.
116
61
      
if (auto 61
D61
= DI->depends(Src, Dst, true)) {
117
29
        assert(D->isOrdered() && "Expected an output, flow or anti dep.");
118
29
        DEBUG(StringRef DepType =
119
29
                  D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
120
29
              dbgs() << "Found " << DepType
121
29
                     << " dependency between Src and Dst\n"
122
29
                     << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
123
29
        unsigned Levels = D->getLevels();
124
29
        char Direction;
125
86
        for (unsigned II = 1; 
II <= Levels86
;
++II57
) {
126
57
          const SCEV *Distance = D->getDistance(II);
127
57
          const SCEVConstant *SCEVConst =
128
57
              dyn_cast_or_null<SCEVConstant>(Distance);
129
57
          if (
SCEVConst57
) {
130
44
            const ConstantInt *CI = SCEVConst->getValue();
131
44
            if (CI->isNegative())
132
6
              Direction = '<';
133
38
            else 
if (38
CI->isZero()38
)
134
37
              Direction = '=';
135
38
            else
136
1
              Direction = '>';
137
44
            Dep.push_back(Direction);
138
57
          } else 
if (13
D->isScalar(II)13
) {
139
13
            Direction = 'S';
140
13
            Dep.push_back(Direction);
141
13
          } else {
142
0
            unsigned Dir = D->getDirection(II);
143
0
            if (Dir == Dependence::DVEntry::LT ||
144
0
                Dir == Dependence::DVEntry::LE)
145
0
              Direction = '<';
146
0
            else 
if (0
Dir == Dependence::DVEntry::GT ||
147
0
                     Dir == Dependence::DVEntry::GE)
148
0
              Direction = '>';
149
0
            else 
if (0
Dir == Dependence::DVEntry::EQ0
)
150
0
              Direction = '=';
151
0
            else
152
0
              Direction = '*';
153
13
            Dep.push_back(Direction);
154
13
          }
155
57
        }
156
39
        while (
Dep.size() != Level39
) {
157
10
          Dep.push_back('I');
158
10
        }
159
29
160
29
        DepMatrix.push_back(Dep);
161
29
        if (
DepMatrix.size() > MaxMemInstrCount29
) {
162
0
          DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
163
0
                       << " dependencies inside loop\n");
164
0
          return false;
165
0
        }
166
29
      }
167
154
    }
168
71
  }
169
25
170
25
  // We don't have a DepMatrix to check legality return false.
171
25
  
if (25
DepMatrix.size() == 025
)
172
0
    return false;
173
25
  return true;
174
25
}
175
176
// A loop is moved from index 'from' to an index 'to'. Update the Dependence
177
// matrix by exchanging the two columns.
178
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
179
13
                                    unsigned ToIndx) {
180
13
  unsigned numRows = DepMatrix.size();
181
29
  for (unsigned i = 0; 
i < numRows29
;
++i16
) {
182
16
    char TmpVal = DepMatrix[i][ToIndx];
183
16
    DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
184
16
    DepMatrix[i][FromIndx] = TmpVal;
185
16
  }
186
13
}
187
188
// Checks if outermost non '=','S'or'I' dependence in the dependence matrix is
189
// '>'
190
static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
191
33
                                   unsigned Column) {
192
71
  for (unsigned i = 0; 
i <= Column71
;
++i38
) {
193
42
    if (DepMatrix[Row][i] == '<')
194
3
      return false;
195
39
    
if (39
DepMatrix[Row][i] == '>'39
)
196
1
      return true;
197
42
  }
198
33
  // All dependencies were '=','S' or 'I'
199
29
  return false;
200
33
}
201
202
// Checks if no dependence exist in the dependency matrix in Row before Column.
203
static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
204
0
                                 unsigned Column) {
205
0
  for (unsigned i = 0; 
i < Column0
;
++i0
) {
206
0
    if (
DepMatrix[Row][i] != '=' && 0
DepMatrix[Row][i] != 'S'0
&&
207
0
        DepMatrix[Row][i] != 'I')
208
0
      return false;
209
0
  }
210
0
  return true;
211
0
}
212
213
static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
214
                                unsigned OuterLoopId, char InnerDep,
215
33
                                char OuterDep) {
216
33
217
33
  if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
218
1
    return false;
219
32
220
32
  
if (32
InnerDep == OuterDep32
)
221
18
    return true;
222
14
223
14
  // It is legal to interchange if and only if after interchange no row has a
224
14
  // '>' direction as the leftmost non-'='.
225
14
226
14
  
if (14
InnerDep == '=' || 14
InnerDep == 'S'10
||
InnerDep == 'I'10
)
227
13
    return true;
228
1
229
1
  
if (1
InnerDep == '<'1
)
230
1
    return true;
231
0
232
0
  
if (0
InnerDep == '>'0
) {
233
0
    // If OuterLoopId represents outermost loop then interchanging will make the
234
0
    // 1st dependency as '>'
235
0
    if (OuterLoopId == 0)
236
0
      return false;
237
0
238
0
    // If all dependencies before OuterloopId are '=','S'or 'I'. Then
239
0
    // interchanging will result in this row having an outermost non '='
240
0
    // dependency of '>'
241
0
    
if (0
!containsNoDependence(DepMatrix, Row, OuterLoopId)0
)
242
0
      return true;
243
0
  }
244
0
245
0
  return false;
246
0
}
247
248
// Checks if it is legal to interchange 2 loops.
249
// [Theorem] A permutation of the loops in a perfect nest is legal if and only
250
// if the direction matrix, after the same permutation is applied to its
251
// columns, has no ">" direction as the leftmost non-"=" direction in any row.
252
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
253
                                      unsigned InnerLoopId,
254
27
                                      unsigned OuterLoopId) {
255
27
256
27
  unsigned NumRows = DepMatrix.size();
257
27
  // For each row check if it is valid to interchange.
258
59
  for (unsigned Row = 0; 
Row < NumRows59
;
++Row32
) {
259
33
    char InnerDep = DepMatrix[Row][InnerLoopId];
260
33
    char OuterDep = DepMatrix[Row][OuterLoopId];
261
33
    if (
InnerDep == '*' || 33
OuterDep == '*'33
)
262
0
      return false;
263
33
    
if (33
!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep)33
)
264
1
      return false;
265
33
  }
266
26
  return true;
267
27
}
268
269
25
static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
270
25
271
25
  DEBUG(dbgs() << "Calling populateWorklist on Func: "
272
25
               << L.getHeader()->getParent()->getName() << " Loop: %"
273
25
               << L.getHeader()->getName() << '\n');
274
25
  LoopVector LoopList;
275
25
  Loop *CurrentLoop = &L;
276
25
  const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
277
55
  while (
!Vec->empty()55
) {
278
30
    // The current loop has multiple subloops in it hence it is not tightly
279
30
    // nested.
280
30
    // Discard all loops above it added into Worklist.
281
30
    if (
Vec->size() != 130
) {
282
0
      LoopList.clear();
283
0
      return;
284
0
    }
285
30
    LoopList.push_back(CurrentLoop);
286
30
    CurrentLoop = Vec->front();
287
30
    Vec = &CurrentLoop->getSubLoops();
288
30
  }
289
25
  LoopList.push_back(CurrentLoop);
290
25
  V.push_back(std::move(LoopList));
291
25
}
292
293
12
static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
294
12
  PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
295
12
  if (InnerIndexVar)
296
4
    return InnerIndexVar;
297
8
  
if (8
L->getLoopLatch() == nullptr || 8
L->getLoopPredecessor() == nullptr8
)
298
0
    return nullptr;
299
8
  
for (BasicBlock::iterator I = L->getHeader()->begin(); 8
isa<PHINode>(I)8
;
++I0
) {
300
8
    PHINode *PhiVar = cast<PHINode>(I);
301
8
    Type *PhiTy = PhiVar->getType();
302
8
    if (
!PhiTy->isIntegerTy() && 8
!PhiTy->isFloatingPointTy()0
&&
303
0
        !PhiTy->isPointerTy())
304
0
      return nullptr;
305
8
    const SCEVAddRecExpr *AddRec =
306
8
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
307
8
    if (
!AddRec || 8
!AddRec->isAffine()8
)
308
0
      continue;
309
8
    const SCEV *Step = AddRec->getStepRecurrence(*SE);
310
8
    if (!isa<SCEVConstant>(Step))
311
0
      continue;
312
8
    // Found the induction variable.
313
8
    // FIXME: Handle loops with more than one induction variable. Note that,
314
8
    // currently, legality makes sure we have only one induction variable.
315
8
    return PhiVar;
316
8
  }
317
0
  return nullptr;
318
12
}
319
320
/// LoopInterchangeLegality checks if it is legal to interchange the loop.
321
class LoopInterchangeLegality {
322
public:
323
  LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
324
                          LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA,
325
                          OptimizationRemarkEmitter *ORE)
326
      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
327
27
        PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {}
328
329
  /// Check if the loops can be interchanged.
330
  bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
331
                           CharMatrix &DepMatrix);
332
  /// Check if the loop structure is understood. We do not handle triangular
333
  /// loops for now.
334
  bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
335
336
  bool currentLimitations();
337
338
13
  bool hasInnerLoopReduction() { return InnerLoopHasReduction; }
339
340
private:
341
  bool tightlyNested(Loop *Outer, Loop *Inner);
342
  bool containsUnsafeInstructionsInHeader(BasicBlock *BB);
343
  bool areAllUsesReductions(Instruction *Ins, Loop *L);
344
  bool containsUnsafeInstructionsInLatch(BasicBlock *BB);
345
  bool findInductionAndReductions(Loop *L,
346
                                  SmallVector<PHINode *, 8> &Inductions,
347
                                  SmallVector<PHINode *, 8> &Reductions);
348
  Loop *OuterLoop;
349
  Loop *InnerLoop;
350
351
  ScalarEvolution *SE;
352
  LoopInfo *LI;
353
  DominatorTree *DT;
354
  bool PreserveLCSSA;
355
  /// Interface to emit optimization remarks.
356
  OptimizationRemarkEmitter *ORE;
357
358
  bool InnerLoopHasReduction;
359
};
360
361
/// LoopInterchangeProfitability checks if it is profitable to interchange the
362
/// loop.
363
class LoopInterchangeProfitability {
364
public:
365
  LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
366
                               OptimizationRemarkEmitter *ORE)
367
17
      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
368
369
  /// Check if the loop interchange is profitable.
370
  bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
371
                    CharMatrix &DepMatrix);
372
373
private:
374
  int getInstrOrderCost();
375
376
  Loop *OuterLoop;
377
  Loop *InnerLoop;
378
379
  /// Scev analysis.
380
  ScalarEvolution *SE;
381
  /// Interface to emit optimization remarks.
382
  OptimizationRemarkEmitter *ORE;
383
};
384
385
/// LoopInterchangeTransform interchanges the loop.
386
class LoopInterchangeTransform {
387
public:
388
  LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
389
                           LoopInfo *LI, DominatorTree *DT,
390
                           BasicBlock *LoopNestExit,
391
                           bool InnerLoopContainsReductions)
392
      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
393
        LoopExit(LoopNestExit),
394
13
        InnerLoopHasReduction(InnerLoopContainsReductions) {}
395
396
  /// Interchange OuterLoop and InnerLoop.
397
  bool transform();
398
  void restructureLoops(Loop *InnerLoop, Loop *OuterLoop);
399
  void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
400
401
private:
402
  void splitInnerLoopLatch(Instruction *);
403
  void splitInnerLoopHeader();
404
  bool adjustLoopLinks();
405
  void adjustLoopPreheaders();
406
  bool adjustLoopBranches();
407
  void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred,
408
                           BasicBlock *NewPred);
409
410
  Loop *OuterLoop;
411
  Loop *InnerLoop;
412
413
  /// Scev analysis.
414
  ScalarEvolution *SE;
415
  LoopInfo *LI;
416
  DominatorTree *DT;
417
  BasicBlock *LoopExit;
418
  bool InnerLoopHasReduction;
419
};
420
421
// Main LoopInterchange Pass.
422
struct LoopInterchange : public FunctionPass {
423
  static char ID;
424
  ScalarEvolution *SE;
425
  LoopInfo *LI;
426
  DependenceInfo *DI;
427
  DominatorTree *DT;
428
  bool PreserveLCSSA;
429
  /// Interface to emit optimization remarks.
430
  OptimizationRemarkEmitter *ORE;
431
432
  LoopInterchange()
433
15
      : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) {
434
15
    initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
435
15
  }
436
437
15
  void getAnalysisUsage(AnalysisUsage &AU) const override {
438
15
    AU.addRequired<ScalarEvolutionWrapperPass>();
439
15
    AU.addRequired<AAResultsWrapperPass>();
440
15
    AU.addRequired<DominatorTreeWrapperPass>();
441
15
    AU.addRequired<LoopInfoWrapperPass>();
442
15
    AU.addRequired<DependenceAnalysisWrapperPass>();
443
15
    AU.addRequiredID(LoopSimplifyID);
444
15
    AU.addRequiredID(LCSSAID);
445
15
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
446
15
  }
447
448
25
  bool runOnFunction(Function &F) override {
449
25
    if (skipFunction(F))
450
0
      return false;
451
25
452
25
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
453
25
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
454
25
    DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
455
25
    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
456
25
    DT = DTWP ? 
&DTWP->getDomTree()25
:
nullptr0
;
457
25
    ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
458
25
    PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
459
25
460
25
    // Build up a worklist of loop pairs to analyze.
461
25
    SmallVector<LoopVector, 8> Worklist;
462
25
463
25
    for (Loop *L : *LI)
464
25
      populateWorklist(*L, Worklist);
465
25
466
25
    DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
467
25
    bool Changed = true;
468
50
    while (
!Worklist.empty()50
) {
469
25
      LoopVector LoopList = Worklist.pop_back_val();
470
25
      Changed = processLoopList(LoopList, F);
471
25
    }
472
25
    return Changed;
473
25
  }
474
475
25
  bool isComputableLoopNest(LoopVector LoopList) {
476
55
    for (Loop *L : LoopList) {
477
55
      const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
478
55
      if (
ExitCountOuter == SE->getCouldNotCompute()55
) {
479
0
        DEBUG(dbgs() << "Couldn't compute backedge count\n");
480
0
        return false;
481
0
      }
482
55
      
if (55
L->getNumBackEdges() != 155
) {
483
0
        DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
484
0
        return false;
485
0
      }
486
55
      
if (55
!L->getExitingBlock()55
) {
487
0
        DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
488
0
        return false;
489
0
      }
490
25
    }
491
25
    return true;
492
25
  }
493
494
24
  unsigned selectLoopForInterchange(const LoopVector &LoopList) {
495
24
    // TODO: Add a better heuristic to select the loop to be interchanged based
496
24
    // on the dependence matrix. Currently we select the innermost loop.
497
24
    return LoopList.size() - 1;
498
24
  }
499
500
25
  bool processLoopList(LoopVector LoopList, Function &F) {
501
25
502
25
    bool Changed = false;
503
25
    unsigned LoopNestDepth = LoopList.size();
504
25
    if (
LoopNestDepth < 225
) {
505
0
      DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
506
0
      return false;
507
0
    }
508
25
    
if (25
LoopNestDepth > MaxLoopNestDepth25
) {
509
0
      DEBUG(dbgs() << "Cannot handle loops of depth greater than "
510
0
                   << MaxLoopNestDepth << "\n");
511
0
      return false;
512
0
    }
513
25
    
if (25
!isComputableLoopNest(LoopList)25
) {
514
0
      DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
515
0
      return false;
516
0
    }
517
25
518
25
    
DEBUG25
(dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n");
519
25
520
25
    CharMatrix DependencyMatrix;
521
25
    Loop *OuterMostLoop = *(LoopList.begin());
522
25
    if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
523
25
                                  OuterMostLoop, DI)) {
524
0
      DEBUG(dbgs() << "Populating dependency matrix failed\n");
525
0
      return false;
526
0
    }
527
#ifdef DUMP_DEP_MATRICIES
528
    DEBUG(dbgs() << "Dependence before interchange\n");
529
    printDepMatrix(DependencyMatrix);
530
#endif
531
532
25
    BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch();
533
25
    BranchInst *OuterMostLoopLatchBI =
534
25
        dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator());
535
25
    if (!OuterMostLoopLatchBI)
536
0
      return false;
537
25
538
25
    // Since we currently do not handle LCSSA PHI's any failure in loop
539
25
    // condition will now branch to LoopNestExit.
540
25
    // TODO: This should be removed once we handle LCSSA PHI nodes.
541
25
542
25
    // Get the Outermost loop exit.
543
25
    BasicBlock *LoopNestExit;
544
25
    if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader())
545
7
      LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1);
546
25
    else
547
18
      LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0);
548
25
549
25
    if (
isa<PHINode>(LoopNestExit->begin())25
) {
550
1
      DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now "
551
1
                      "since on failure all loops branch to loop nest exit.\n");
552
1
      return false;
553
1
    }
554
24
555
24
    unsigned SelecLoopId = selectLoopForInterchange(LoopList);
556
24
    // Move the selected loop outwards to the best possible position.
557
37
    for (unsigned i = SelecLoopId; 
i > 037
;
i--13
) {
558
27
      bool Interchanged =
559
27
          processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
560
27
      if (!Interchanged)
561
14
        return Changed;
562
13
      // Loops interchanged reflect the same in LoopList
563
13
      std::swap(LoopList[i - 1], LoopList[i]);
564
13
565
13
      // Update the DependencyMatrix
566
13
      interChangeDependencies(DependencyMatrix, i, i - 1);
567
13
      DT->recalculate(F);
568
#ifdef DUMP_DEP_MATRICIES
569
      DEBUG(dbgs() << "Dependence after interchange\n");
570
      printDepMatrix(DependencyMatrix);
571
#endif
572
      Changed |= Interchanged;
573
27
    }
574
10
    return Changed;
575
25
  }
576
577
  bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
578
                   unsigned OuterLoopId, BasicBlock *LoopNestExit,
579
27
                   std::vector<std::vector<char>> &DependencyMatrix) {
580
27
581
27
    DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
582
27
                 << " and OuterLoopId = " << OuterLoopId << "\n");
583
27
    Loop *InnerLoop = LoopList[InnerLoopId];
584
27
    Loop *OuterLoop = LoopList[OuterLoopId];
585
27
586
27
    LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
587
27
                                PreserveLCSSA, ORE);
588
27
    if (
!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)27
) {
589
10
      DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
590
10
      return false;
591
10
    }
592
17
    
DEBUG17
(dbgs() << "Loops are legal to interchange\n");
593
17
    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
594
17
    if (
!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)17
) {
595
4
      DEBUG(dbgs() << "Interchanging loops not profitable\n");
596
4
      return false;
597
4
    }
598
13
599
13
    
ORE->emit(OptimizationRemark(13
DEBUG_TYPE13
, "Interchanged",
600
13
                                 InnerLoop->getStartLoc(),
601
13
                                 InnerLoop->getHeader())
602
13
              << "Loop interchanged with enclosing loop.");
603
13
604
13
    LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
605
13
                                 LoopNestExit, LIL.hasInnerLoopReduction());
606
13
    LIT.transform();
607
13
    DEBUG(dbgs() << "Loops interchanged\n");
608
27
    return true;
609
27
  }
610
};
611
612
} // end of namespace
613
3
bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
614
3
  return none_of(Ins->users(), [=](User *U) -> bool {
615
3
    auto *UserIns = dyn_cast<PHINode>(U);
616
3
    RecurrenceDescriptor RD;
617
3
    return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD);
618
3
  });
619
3
}
620
621
bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
622
21
    BasicBlock *BB) {
623
74
  for (auto I = BB->begin(), E = BB->end(); 
I != E74
;
++I53
) {
624
57
    // Load corresponding to reduction PHI's are safe while concluding if
625
57
    // tightly nested.
626
57
    if (LoadInst *
L57
= dyn_cast<LoadInst>(I)) {
627
3
      if (!areAllUsesReductions(L, InnerLoop))
628
0
        return true;
629
54
    } else 
if (54
I->mayHaveSideEffects() || 54
I->mayReadFromMemory()50
)
630
4
      return true;
631
57
  }
632
17
  return false;
633
21
}
634
635
bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
636
17
    BasicBlock *BB) {
637
80
  for (auto I = BB->begin(), E = BB->end(); 
I != E80
;
++I63
) {
638
63
    // Stores corresponding to reductions are safe while concluding if tightly
639
63
    // nested.
640
63
    if (StoreInst *
L63
= dyn_cast<StoreInst>(I)) {
641
3
      if (!isa<PHINode>(L->getOperand(0)))
642
0
        return true;
643
60
    } else 
if (60
I->mayHaveSideEffects() || 60
I->mayReadFromMemory()60
)
644
0
      return true;
645
63
  }
646
17
  return false;
647
17
}
648
649
21
bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
650
21
  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
651
21
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
652
21
  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
653
21
654
21
  DEBUG(dbgs() << "Checking if loops are tightly nested\n");
655
21
656
21
  // A perfectly nested loop will not have any branch in between the outer and
657
21
  // inner block i.e. outer header will branch to either inner preheader and
658
21
  // outerloop latch.
659
21
  BranchInst *OuterLoopHeaderBI =
660
21
      dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
661
21
  if (!OuterLoopHeaderBI)
662
0
    return false;
663
21
664
21
  for (BasicBlock *Succ : OuterLoopHeaderBI->successors())
665
23
    
if (23
Succ != InnerLoopPreHeader && 23
Succ != OuterLoopLatch2
)
666
0
      return false;
667
21
668
21
  
DEBUG21
(dbgs() << "Checking instructions in Loop header and Loop latch\n");
669
21
  // We do not have any basic block in between now make sure the outer header
670
21
  // and outer loop latch doesn't contain any unsafe instructions.
671
21
  if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
672
17
      containsUnsafeInstructionsInLatch(OuterLoopLatch))
673
4
    return false;
674
17
675
17
  
DEBUG17
(dbgs() << "Loops are perfectly nested\n");
676
17
  // We have a perfect loop nest.
677
17
  return true;
678
17
}
679
680
681
bool LoopInterchangeLegality::isLoopStructureUnderstood(
682
23
    PHINode *InnerInduction) {
683
23
684
23
  unsigned Num = InnerInduction->getNumOperands();
685
23
  BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
686
69
  for (unsigned i = 0; 
i < Num69
;
++i46
) {
687
46
    Value *Val = InnerInduction->getOperand(i);
688
46
    if (isa<Constant>(Val))
689
23
      continue;
690
23
    Instruction *I = dyn_cast<Instruction>(Val);
691
23
    if (!I)
692
0
      return false;
693
23
    // TODO: Handle triangular loops.
694
23
    // e.g. for(int i=0;i<N;i++)
695
23
    //        for(int j=i;j<N;j++)
696
23
    unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
697
23
    if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
698
23
            InnerLoopPreheader &&
699
23
        
!OuterLoop->isLoopInvariant(I)0
) {
700
0
      return false;
701
0
    }
702
46
  }
703
23
  return true;
704
23
}
705
706
bool LoopInterchangeLegality::findInductionAndReductions(
707
    Loop *L, SmallVector<PHINode *, 8> &Inductions,
708
49
    SmallVector<PHINode *, 8> &Reductions) {
709
49
  if (
!L->getLoopLatch() || 49
!L->getLoopPredecessor()49
)
710
0
    return false;
711
104
  
for (BasicBlock::iterator I = L->getHeader()->begin(); 49
isa<PHINode>(I)104
;
++I55
) {
712
56
    RecurrenceDescriptor RD;
713
56
    InductionDescriptor ID;
714
56
    PHINode *PHI = cast<PHINode>(I);
715
56
    if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID))
716
49
      Inductions.push_back(PHI);
717
7
    else 
if (7
RecurrenceDescriptor::isReductionPHI(PHI, L, RD)7
)
718
6
      Reductions.push_back(PHI);
719
1
    else {
720
1
      DEBUG(
721
7
          dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
722
7
      return false;
723
7
    }
724
56
  }
725
48
  return true;
726
49
}
727
728
46
static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
729
49
  for (auto I = Block->begin(); 
isa<PHINode>(I)49
;
++I3
) {
730
3
    PHINode *PHI = cast<PHINode>(I);
731
3
    // Reduction lcssa phi will have only 1 incoming block that from loop latch.
732
3
    if (PHI->getNumIncomingValues() > 1)
733
0
      return false;
734
3
    Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0));
735
3
    if (!Ins)
736
0
      return false;
737
3
    // Incoming value for lcssa phi's in outer loop exit can only be inner loop
738
3
    // exits lcssa phi else it would not be tightly nested.
739
3
    
if (3
!isa<PHINode>(Ins) && 3
isOuterLoopExitBlock3
)
740
0
      return false;
741
3
  }
742
46
  return true;
743
46
}
744
745
static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock,
746
46
                                         BasicBlock *LoopHeader) {
747
46
  if (BranchInst *
BI46
= dyn_cast<BranchInst>(LatchBlock->getTerminator())) {
748
46
    assert(BI->getNumSuccessors() == 2 &&
749
46
           "Branch leaving loop latch must have 2 successors");
750
59
    for (BasicBlock *Succ : BI->successors()) {
751
59
      if (Succ == LoopHeader)
752
13
        continue;
753
46
      return Succ;
754
46
    }
755
46
  }
756
0
  return nullptr;
757
0
}
758
759
// This function indicates the current limitations in the transform as a result
760
// of which we do not proceed.
761
25
bool LoopInterchangeLegality::currentLimitations() {
762
25
763
25
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
764
25
  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
765
25
  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
766
25
  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
767
25
  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
768
25
769
25
  PHINode *InnerInductionVar;
770
25
  SmallVector<PHINode *, 8> Inductions;
771
25
  SmallVector<PHINode *, 8> Reductions;
772
25
  if (
!findInductionAndReductions(InnerLoop, Inductions, Reductions)25
) {
773
1
    DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
774
1
                 << "are supported currently.\n");
775
1
    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
776
1
                                       "UnsupportedPHIInner",
777
1
                                       InnerLoop->getStartLoc(),
778
1
                                       InnerLoop->getHeader())
779
1
              << "Only inner loops with induction or reduction PHI nodes can be"
780
1
                 " interchange currently.");
781
1
    return true;
782
1
  }
783
24
784
24
  // TODO: Currently we handle only loops with 1 induction variable.
785
24
  
if (24
Inductions.size() != 124
) {
786
0
    DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
787
0
                 << "Failed to interchange due to current limitation\n");
788
0
    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
789
0
                                       "MultiInductionInner",
790
0
                                       InnerLoop->getStartLoc(),
791
0
                                       InnerLoop->getHeader())
792
0
              << "Only inner loops with 1 induction variable can be "
793
0
                 "interchanged currently.");
794
0
    return true;
795
0
  }
796
24
  
if (24
Reductions.size() > 024
)
797
3
    InnerLoopHasReduction = true;
798
24
799
24
  InnerInductionVar = Inductions.pop_back_val();
800
24
  Reductions.clear();
801
24
  if (
!findInductionAndReductions(OuterLoop, Inductions, Reductions)24
) {
802
0
    DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
803
0
                 << "are supported currently.\n");
804
0
    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
805
0
                                       "UnsupportedPHIOuter",
806
0
                                       OuterLoop->getStartLoc(),
807
0
                                       OuterLoop->getHeader())
808
0
              << "Only outer loops with induction or reduction PHI nodes can be"
809
0
                 " interchanged currently.");
810
0
    return true;
811
0
  }
812
24
813
24
  // Outer loop cannot have reduction because then loops will not be tightly
814
24
  // nested.
815
24
  
if (24
!Reductions.empty()24
) {
816
1
    DEBUG(dbgs() << "Outer loops with reductions are not supported "
817
1
                 << "currently.\n");
818
1
    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
819
1
                                       "ReductionsOuter",
820
1
                                       OuterLoop->getStartLoc(),
821
1
                                       OuterLoop->getHeader())
822
1
              << "Outer loops with reductions cannot be interchangeed "
823
1
                 "currently.");
824
1
    return true;
825
1
  }
826
23
  // TODO: Currently we handle only loops with 1 induction variable.
827
23
  
if (23
Inductions.size() != 123
) {
828
0
    DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
829
0
                 << "supported currently.\n");
830
0
    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
831
0
                                       "MultiIndutionOuter",
832
0
                                       OuterLoop->getStartLoc(),
833
0
                                       OuterLoop->getHeader())
834
0
              << "Only outer loops with 1 induction variable can be "
835
0
                 "interchanged currently.");
836
0
    return true;
837
0
  }
838
23
839
23
  // TODO: Triangular loops are not handled for now.
840
23
  
if (23
!isLoopStructureUnderstood(InnerInductionVar)23
) {
841
0
    DEBUG(dbgs() << "Loop structure not understood by pass\n");
842
0
    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
843
0
                                       "UnsupportedStructureInner",
844
0
                                       InnerLoop->getStartLoc(),
845
0
                                       InnerLoop->getHeader())
846
0
              << "Inner loop structure not understood currently.");
847
0
    return true;
848
0
  }
849
23
850
23
  // TODO: We only handle LCSSA PHI's corresponding to reduction for now.
851
23
  BasicBlock *LoopExitBlock =
852
23
      getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
853
23
  if (
!LoopExitBlock || 23
!containsSafePHI(LoopExitBlock, true)23
) {
854
0
    DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
855
0
    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
856
0
                                       "NoLCSSAPHIOuter",
857
0
                                       OuterLoop->getStartLoc(),
858
0
                                       OuterLoop->getHeader())
859
0
              << "Only outer loops with LCSSA PHIs can be interchange "
860
0
                 "currently.");
861
0
    return true;
862
0
  }
863
23
864
23
  LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
865
23
  if (
!LoopExitBlock || 23
!containsSafePHI(LoopExitBlock, false)23
) {
866
0
    DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
867
0
    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
868
0
                                       "NoLCSSAPHIOuterInner",
869
0
                                       InnerLoop->getStartLoc(),
870
0
                                       InnerLoop->getHeader())
871
0
              << "Only inner loops with LCSSA PHIs can be interchange "
872
0
                 "currently.");
873
0
    return true;
874
0
  }
875
23
876
23
  // TODO: Current limitation: Since we split the inner loop latch at the point
877
23
  // were induction variable is incremented (induction.next); We cannot have
878
23
  // more than 1 user of induction.next since it would result in broken code
879
23
  // after split.
880
23
  // e.g.
881
23
  // for(i=0;i<N;i++) {
882
23
  //    for(j = 0;j<M;j++) {
883
23
  //      A[j+1][i+2] = A[j][i]+k;
884
23
  //  }
885
23
  // }
886
23
  Instruction *InnerIndexVarInc = nullptr;
887
23
  if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
888
0
    InnerIndexVarInc =
889
0
        dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
890
23
  else
891
23
    InnerIndexVarInc =
892
23
        dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
893
23
894
23
  if (
!InnerIndexVarInc23
) {
895
0
    DEBUG(dbgs() << "Did not find an instruction to increment the induction "
896
0
                 << "variable.\n");
897
0
    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
898
0
                                       "NoIncrementInInner",
899
0
                                       InnerLoop->getStartLoc(),
900
0
                                       InnerLoop->getHeader())
901
0
              << "The inner loop does not increment the induction variable.");
902
0
    return true;
903
0
  }
904
23
905
23
  // Since we split the inner loop latch on this induction variable. Make sure
906
23
  // we do not have any instruction between the induction variable and branch
907
23
  // instruction.
908
23
909
23
  bool FoundInduction = false;
910
84
  for (const Instruction &I : reverse(*InnerLoopLatch)) {
911
84
    if (
isa<BranchInst>(I) || 84
isa<CmpInst>(I)61
||
isa<TruncInst>(I)38
||
912
24
        isa<ZExtInst>(I))
913
61
      continue;
914
23
915
23
    // We found an instruction. If this is not induction variable then it is not
916
23
    // safe to split this loop latch.
917
23
    
if (23
!I.isIdenticalTo(InnerIndexVarInc)23
) {
918
2
      DEBUG(dbgs() << "Found unsupported instructions between induction "
919
2
                   << "variable increment and branch.\n");
920
2
    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
921
2
                                       "UnsupportedInsBetweenInduction",
922
2
                                       InnerLoop->getStartLoc(),
923
2
                                       InnerLoop->getHeader())
924
2
              << "Found unsupported instruction between induction variable "
925
2
                 "increment and branch.");
926
2
      return true;
927
2
    }
928
21
929
21
    FoundInduction = true;
930
21
    break;
931
21
  }
932
21
  // The loop latch ended and we didn't find the induction variable return as
933
21
  // current limitation.
934
21
  
if (21
!FoundInduction21
) {
935
0
    DEBUG(dbgs() << "Did not find the induction variable.\n");
936
0
    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
937
0
                                       "NoIndutionVariable",
938
0
                                       InnerLoop->getStartLoc(),
939
0
                                       InnerLoop->getHeader())
940
0
              << "Did not find the induction variable.");
941
0
    return true;
942
0
  }
943
21
  return false;
944
21
}
945
946
bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
947
                                                  unsigned OuterLoopId,
948
27
                                                  CharMatrix &DepMatrix) {
949
27
950
27
  if (
!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)27
) {
951
1
    DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
952
1
                 << " and OuterLoopId = " << OuterLoopId
953
1
                 << " due to dependence\n");
954
1
    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
955
1
                                       "Dependence",
956
1
                                       InnerLoop->getStartLoc(),
957
1
                                       InnerLoop->getHeader())
958
1
              << "Cannot interchange loops due to dependences.");
959
1
    return false;
960
1
  }
961
26
962
26
  // Check if outer and inner loop contain legal instructions only.
963
26
  for (auto *BB : OuterLoop->blocks())
964
104
    for (Instruction &I : *BB)
965
470
      
if (CallInst *470
CI470
= dyn_cast<CallInst>(&I)) {
966
5
        // readnone functions do not prevent interchanging.
967
5
        if (CI->doesNotReadMemory())
968
4
          continue;
969
1
        
DEBUG1
(dbgs() << "Loops with call instructions cannot be interchanged "
970
1
                     << "safely.");
971
1
        return false;
972
1
      }
973
25
974
25
  // Create unique Preheaders if we already do not have one.
975
25
  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
976
25
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
977
25
978
25
  // Create  a unique outer preheader -
979
25
  // 1) If OuterLoop preheader is not present.
980
25
  // 2) If OuterLoop Preheader is same as OuterLoop Header
981
25
  // 3) If OuterLoop Preheader is same as Header of the previous loop.
982
25
  // 4) If OuterLoop Preheader is Entry node.
983
25
  if (
!OuterLoopPreHeader || 25
OuterLoopPreHeader == OuterLoop->getHeader()25
||
984
25
      isa<PHINode>(OuterLoopPreHeader->begin()) ||
985
25
      
!OuterLoopPreHeader->getUniquePredecessor()20
) {
986
12
    OuterLoopPreHeader =
987
12
        InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA);
988
12
  }
989
25
990
25
  if (
!InnerLoopPreHeader || 25
InnerLoopPreHeader == InnerLoop->getHeader()25
||
991
25
      
InnerLoopPreHeader == OuterLoop->getHeader()25
) {
992
18
    InnerLoopPreHeader =
993
18
        InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA);
994
18
  }
995
25
996
25
  // TODO: The loops could not be interchanged due to current limitations in the
997
25
  // transform module.
998
25
  if (
currentLimitations()25
) {
999
4
    DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1000
4
    return false;
1001
4
  }
1002
21
1003
21
  // Check if the loops are tightly nested.
1004
21
  
if (21
!tightlyNested(OuterLoop, InnerLoop)21
) {
1005
4
    DEBUG(dbgs() << "Loops not tightly nested\n");
1006
4
    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
1007
4
                                       "NotTightlyNested",
1008
4
                                       InnerLoop->getStartLoc(),
1009
4
                                       InnerLoop->getHeader())
1010
4
              << "Cannot interchange loops because they are not tightly "
1011
4
                 "nested.");
1012
4
    return false;
1013
4
  }
1014
17
1015
17
  return true;
1016
17
}
1017
1018
17
int LoopInterchangeProfitability::getInstrOrderCost() {
1019
17
  unsigned GoodOrder, BadOrder;
1020
17
  BadOrder = GoodOrder = 0;
1021
21
  for (BasicBlock *BB : InnerLoop->blocks()) {
1022
177
    for (Instruction &Ins : *BB) {
1023
177
      if (const GetElementPtrInst *
GEP177
= dyn_cast<GetElementPtrInst>(&Ins)) {
1024
27
        unsigned NumOp = GEP->getNumOperands();
1025
27
        bool FoundInnerInduction = false;
1026
27
        bool FoundOuterInduction = false;
1027
111
        for (unsigned i = 0; 
i < NumOp111
;
++i84
) {
1028
109
          const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
1029
109
          const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1030
109
          if (!AR)
1031
54
            continue;
1032
55
1033
55
          // If we find the inner induction after an outer induction e.g.
1034
55
          // for(int i=0;i<N;i++)
1035
55
          //   for(int j=0;j<N;j++)
1036
55
          //     A[i][j] = A[i-1][j-1]+k;
1037
55
          // then it is a good order.
1038
55
          
if (55
AR->getLoop() == InnerLoop55
) {
1039
27
            // We found an InnerLoop induction after OuterLoop induction. It is
1040
27
            // a good order.
1041
27
            FoundInnerInduction = true;
1042
27
            if (
FoundOuterInduction27
) {
1043
6
              GoodOrder++;
1044
6
              break;
1045
6
            }
1046
49
          }
1047
49
          // If we find the outer induction after an inner induction e.g.
1048
49
          // for(int i=0;i<N;i++)
1049
49
          //   for(int j=0;j<N;j++)
1050
49
          //     A[j][i] = A[j-1][i-1]+k;
1051
49
          // then it is a bad order.
1052
49
          
if (49
AR->getLoop() == OuterLoop49
) {
1053
25
            // We found an OuterLoop induction after InnerLoop induction. It is
1054
25
            // a bad order.
1055
25
            FoundOuterInduction = true;
1056
25
            if (
FoundInnerInduction25
) {
1057
19
              BadOrder++;
1058
19
              break;
1059
19
            }
1060
25
          }
1061
109
        }
1062
27
      }
1063
177
    }
1064
21
  }
1065
17
  return GoodOrder - BadOrder;
1066
17
}
1067
1068
static bool isProfitableForVectorization(unsigned InnerLoopId,
1069
                                         unsigned OuterLoopId,
1070
4
                                         CharMatrix &DepMatrix) {
1071
4
  // TODO: Improve this heuristic to catch more cases.
1072
4
  // If the inner loop is loop independent or doesn't carry any dependency it is
1073
4
  // profitable to move this to outer position.
1074
4
  for (auto &Row : DepMatrix) {
1075
4
    if (
Row[InnerLoopId] != 'S' && 4
Row[InnerLoopId] != 'I'4
)
1076
4
      return false;
1077
0
    // TODO: We need to improve this heuristic.
1078
0
    
if (0
Row[OuterLoopId] != '='0
)
1079
0
      return false;
1080
0
  }
1081
0
  // If outer loop has dependence and inner loop is loop independent then it is
1082
0
  // profitable to interchange to enable parallelism.
1083
0
  return true;
1084
0
}
1085
1086
bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
1087
                                                unsigned OuterLoopId,
1088
17
                                                CharMatrix &DepMatrix) {
1089
17
1090
17
  // TODO: Add better profitability checks.
1091
17
  // e.g
1092
17
  // 1) Construct dependency matrix and move the one with no loop carried dep
1093
17
  //    inside to enable vectorization.
1094
17
1095
17
  // This is rough cost estimation algorithm. It counts the good and bad order
1096
17
  // of induction variables in the instruction and allows reordering if number
1097
17
  // of bad orders is more than good.
1098
17
  int Cost = getInstrOrderCost();
1099
17
  DEBUG(dbgs() << "Cost = " << Cost << "\n");
1100
17
  if (Cost < -LoopInterchangeCostThreshold)
1101
13
    return true;
1102
4
1103
4
  // It is not profitable as per current cache profitability model. But check if
1104
4
  // we can move this loop outside to improve parallelism.
1105
4
  
if (4
isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix)4
)
1106
0
    return true;
1107
4
1108
4
  
ORE->emit(OptimizationRemarkMissed(4
DEBUG_TYPE4
,
1109
17
                                     "InterchangeNotProfitable",
1110
17
                                     InnerLoop->getStartLoc(),
1111
17
                                     InnerLoop->getHeader())
1112
17
            << "Interchanging loops is too costly (cost="
1113
17
            << ore::NV("Cost", Cost) << ", threshold="
1114
17
            << ore::NV("Threshold", LoopInterchangeCostThreshold) <<
1115
17
            ") and it does not improve parallelism.");
1116
17
  return false;
1117
17
}
1118
1119
void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1120
16
                                               Loop *InnerLoop) {
1121
16
  for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E;
1122
16
       
++I0
) {
1123
16
    if (
*I == InnerLoop16
) {
1124
16
      OuterLoop->removeChildLoop(I);
1125
16
      return;
1126
16
    }
1127
16
  }
1128
0
  
llvm_unreachable0
("Couldn't find loop");
1129
0
}
1130
1131
void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
1132
13
                                                Loop *OuterLoop) {
1133
13
  Loop *OuterLoopParent = OuterLoop->getParentLoop();
1134
13
  if (
OuterLoopParent13
) {
1135
3
    // Remove the loop from its parent loop.
1136
3
    removeChildLoop(OuterLoopParent, OuterLoop);
1137
3
    removeChildLoop(OuterLoop, InnerLoop);
1138
3
    OuterLoopParent->addChildLoop(InnerLoop);
1139
13
  } else {
1140
10
    removeChildLoop(OuterLoop, InnerLoop);
1141
10
    LI->changeTopLevelLoop(OuterLoop, InnerLoop);
1142
10
  }
1143
13
1144
14
  while (!InnerLoop->empty())
1145
1
    OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin()));
1146
13
1147
13
  InnerLoop->addChildLoop(OuterLoop);
1148
13
}
1149
1150
13
bool LoopInterchangeTransform::transform() {
1151
13
  bool Transformed = false;
1152
13
  Instruction *InnerIndexVar;
1153
13
1154
13
  if (
InnerLoop->getSubLoops().size() == 013
) {
1155
12
    BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1156
12
    DEBUG(dbgs() << "Calling Split Inner Loop\n");
1157
12
    PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
1158
12
    if (
!InductionPHI12
) {
1159
0
      DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1160
0
      return false;
1161
0
    }
1162
12
1163
12
    
if (12
InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader12
)
1164
0
      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
1165
12
    else
1166
12
      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
1167
12
1168
12
    //
1169
12
    // Split at the place were the induction variable is
1170
12
    // incremented/decremented.
1171
12
    // TODO: This splitting logic may not work always. Fix this.
1172
12
    splitInnerLoopLatch(InnerIndexVar);
1173
12
    DEBUG(dbgs() << "splitInnerLoopLatch done\n");
1174
12
1175
12
    // Splits the inner loops phi nodes out into a separate basic block.
1176
12
    splitInnerLoopHeader();
1177
12
    DEBUG(dbgs() << "splitInnerLoopHeader done\n");
1178
12
  }
1179
13
1180
13
  Transformed |= adjustLoopLinks();
1181
13
  if (
!Transformed13
) {
1182
0
    DEBUG(dbgs() << "adjustLoopLinks failed\n");
1183
0
    return false;
1184
0
  }
1185
13
1186
13
  restructureLoops(InnerLoop, OuterLoop);
1187
13
  return true;
1188
13
}
1189
1190
12
void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
1191
12
  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1192
12
  BasicBlock *InnerLoopLatchPred = InnerLoopLatch;
1193
12
  InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
1194
12
}
1195
1196
12
void LoopInterchangeTransform::splitInnerLoopHeader() {
1197
12
1198
12
  // Split the inner loop header out. Here make sure that the reduction PHI's
1199
12
  // stay in the innerloop body.
1200
12
  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1201
12
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1202
12
  if (
InnerLoopHasReduction12
) {
1203
2
    // FIXME: Check if the induction PHI will always be the first PHI.
1204
2
    BasicBlock *New = InnerLoopHeader->splitBasicBlock(
1205
2
        ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split");
1206
2
    if (LI)
1207
2
      
if (Loop *2
L2
= LI->getLoopFor(InnerLoopHeader))
1208
2
        L->addBasicBlockToLoop(New, *LI);
1209
2
1210
2
    // Adjust Reduction PHI's in the block.
1211
2
    SmallVector<PHINode *, 8> PHIVec;
1212
5
    for (auto I = New->begin(); 
isa<PHINode>(I)5
;
++I3
) {
1213
3
      PHINode *PHI = dyn_cast<PHINode>(I);
1214
3
      Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader);
1215
3
      PHI->replaceAllUsesWith(V);
1216
3
      PHIVec.push_back((PHI));
1217
3
    }
1218
3
    for (PHINode *P : PHIVec) {
1219
3
      P->eraseFromParent();
1220
3
    }
1221
12
  } else {
1222
10
    SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
1223
10
  }
1224
12
1225
12
  DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
1226
12
                  "InnerLoopHeader\n");
1227
12
}
1228
1229
/// \brief Move all instructions except the terminator from FromBB right before
1230
/// InsertBefore
1231
26
static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1232
26
  auto &ToList = InsertBefore->getParent()->getInstList();
1233
26
  auto &FromList = FromBB->getInstList();
1234
26
1235
26
  ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
1236
26
                FromBB->getTerminator()->getIterator());
1237
26
}
1238
1239
void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
1240
                                                   BasicBlock *OldPred,
1241
26
                                                   BasicBlock *NewPred) {
1242
26
  for (auto I = CurrBlock->begin(); 
isa<PHINode>(I)26
;
++I0
) {
1243
0
    PHINode *PHI = cast<PHINode>(I);
1244
0
    unsigned Num = PHI->getNumIncomingValues();
1245
0
    for (unsigned i = 0; 
i < Num0
;
++i0
) {
1246
0
      if (PHI->getIncomingBlock(i) == OldPred)
1247
0
        PHI->setIncomingBlock(i, NewPred);
1248
0
    }
1249
0
  }
1250
26
}
1251
1252
13
bool LoopInterchangeTransform::adjustLoopBranches() {
1253
13
1254
13
  DEBUG(dbgs() << "adjustLoopBranches called\n");
1255
13
  // Adjust the loop preheader
1256
13
  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1257
13
  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1258
13
  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1259
13
  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1260
13
  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1261
13
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1262
13
  BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1263
13
  BasicBlock *InnerLoopLatchPredecessor =
1264
13
      InnerLoopLatch->getUniquePredecessor();
1265
13
  BasicBlock *InnerLoopLatchSuccessor;
1266
13
  BasicBlock *OuterLoopLatchSuccessor;
1267
13
1268
13
  BranchInst *OuterLoopLatchBI =
1269
13
      dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1270
13
  BranchInst *InnerLoopLatchBI =
1271
13
      dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1272
13
  BranchInst *OuterLoopHeaderBI =
1273
13
      dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1274
13
  BranchInst *InnerLoopHeaderBI =
1275
13
      dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
1276
13
1277
13
  if (
!OuterLoopPredecessor || 13
!InnerLoopLatchPredecessor13
||
1278
13
      
!OuterLoopLatchBI13
||
!InnerLoopLatchBI13
||
!OuterLoopHeaderBI13
||
1279
13
      !InnerLoopHeaderBI)
1280
0
    return false;
1281
13
1282
13
  BranchInst *InnerLoopLatchPredecessorBI =
1283
13
      dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
1284
13
  BranchInst *OuterLoopPredecessorBI =
1285
13
      dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
1286
13
1287
13
  if (
!OuterLoopPredecessorBI || 13
!InnerLoopLatchPredecessorBI13
)
1288
0
    return false;
1289
13
  BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1290
13
  if (!InnerLoopHeaderSuccessor)
1291
0
    return false;
1292
13
1293
13
  // Adjust Loop Preheader and headers
1294
13
1295
13
  unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
1296
32
  for (unsigned i = 0; 
i < NumSucc32
;
++i19
) {
1297
19
    if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
1298
13
      OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
1299
19
  }
1300
13
1301
13
  NumSucc = OuterLoopHeaderBI->getNumSuccessors();
1302
28
  for (unsigned i = 0; 
i < NumSucc28
;
++i15
) {
1303
15
    if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
1304
2
      OuterLoopHeaderBI->setSuccessor(i, LoopExit);
1305
13
    else 
if (13
OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader13
)
1306
13
      OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
1307
15
  }
1308
13
1309
13
  // Adjust reduction PHI's now that the incoming block has changed.
1310
13
  updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,
1311
13
                      OuterLoopHeader);
1312
13
1313
13
  BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
1314
13
  InnerLoopHeaderBI->eraseFromParent();
1315
13
1316
13
  // -------------Adjust loop latches-----------
1317
13
  if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
1318
5
    InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
1319
13
  else
1320
8
    InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
1321
13
1322
13
  NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
1323
27
  for (unsigned i = 0; 
i < NumSucc27
;
++i14
) {
1324
14
    if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
1325
13
      InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
1326
14
  }
1327
13
1328
13
  // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with
1329
13
  // the value and remove this PHI node from inner loop.
1330
13
  SmallVector<PHINode *, 8> LcssaVec;
1331
16
  for (auto I = InnerLoopLatchSuccessor->begin(); 
isa<PHINode>(I)16
;
++I3
) {
1332
3
    PHINode *LcssaPhi = cast<PHINode>(I);
1333
3
    LcssaVec.push_back(LcssaPhi);
1334
3
  }
1335
3
  for (PHINode *P : LcssaVec) {
1336
3
    Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch);
1337
3
    P->replaceAllUsesWith(Incoming);
1338
3
    P->eraseFromParent();
1339
3
  }
1340
13
1341
13
  if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
1342
4
    OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
1343
13
  else
1344
9
    OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
1345
13
1346
13
  if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
1347
5
    InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
1348
13
  else
1349
8
    InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
1350
13
1351
13
  updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
1352
13
1353
13
  if (
OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor13
) {
1354
9
    OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
1355
13
  } else {
1356
4
    OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
1357
4
  }
1358
13
1359
13
  return true;
1360
13
}
1361
13
void LoopInterchangeTransform::adjustLoopPreheaders() {
1362
13
1363
13
  // We have interchanged the preheaders so we need to interchange the data in
1364
13
  // the preheader as well.
1365
13
  // This is because the content of inner preheader was previously executed
1366
13
  // inside the outer loop.
1367
13
  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1368
13
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1369
13
  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1370
13
  BranchInst *InnerTermBI =
1371
13
      cast<BranchInst>(InnerLoopPreHeader->getTerminator());
1372
13
1373
13
  // These instructions should now be executed inside the loop.
1374
13
  // Move instruction into a new block after outer header.
1375
13
  moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator());
1376
13
  // These instructions were not executed previously in the loop so move them to
1377
13
  // the older inner loop preheader.
1378
13
  moveBBContents(OuterLoopPreHeader, InnerTermBI);
1379
13
}
1380
1381
13
bool LoopInterchangeTransform::adjustLoopLinks() {
1382
13
1383
13
  // Adjust all branches in the inner and outer loop.
1384
13
  bool Changed = adjustLoopBranches();
1385
13
  if (Changed)
1386
13
    adjustLoopPreheaders();
1387
13
  return Changed;
1388
13
}
1389
1390
char LoopInterchange::ID = 0;
1391
24.6k
INITIALIZE_PASS_BEGIN24.6k
(LoopInterchange, "loop-interchange",
1392
24.6k
                      "Interchanges loops for cache reuse", false, false)
1393
24.6k
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
1394
24.6k
INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
1395
24.6k
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
1396
24.6k
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
1397
24.6k
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
1398
24.6k
INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
1399
24.6k
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
1400
24.6k
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
1401
24.6k
1402
24.6k
INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
1403
                    "Interchanges loops for cache reuse", false, false)
1404
1405
0
Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); }