Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This Pass handles loop interchange transform.
10
// This pass interchanges loops to provide a more cache-friendly memory access
11
// patterns.
12
//
13
//===----------------------------------------------------------------------===//
14
15
#include "llvm/ADT/STLExtras.h"
16
#include "llvm/ADT/SmallVector.h"
17
#include "llvm/ADT/Statistic.h"
18
#include "llvm/ADT/StringRef.h"
19
#include "llvm/Analysis/DependenceAnalysis.h"
20
#include "llvm/Analysis/LoopInfo.h"
21
#include "llvm/Analysis/LoopPass.h"
22
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
23
#include "llvm/Analysis/ScalarEvolution.h"
24
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
25
#include "llvm/IR/BasicBlock.h"
26
#include "llvm/IR/Constants.h"
27
#include "llvm/IR/DiagnosticInfo.h"
28
#include "llvm/IR/Dominators.h"
29
#include "llvm/IR/Function.h"
30
#include "llvm/IR/InstrTypes.h"
31
#include "llvm/IR/Instruction.h"
32
#include "llvm/IR/Instructions.h"
33
#include "llvm/IR/Type.h"
34
#include "llvm/IR/User.h"
35
#include "llvm/IR/Value.h"
36
#include "llvm/Pass.h"
37
#include "llvm/Support/Casting.h"
38
#include "llvm/Support/CommandLine.h"
39
#include "llvm/Support/Debug.h"
40
#include "llvm/Support/ErrorHandling.h"
41
#include "llvm/Support/raw_ostream.h"
42
#include "llvm/Transforms/Scalar.h"
43
#include "llvm/Transforms/Utils.h"
44
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
45
#include "llvm/Transforms/Utils/LoopUtils.h"
46
#include <cassert>
47
#include <utility>
48
#include <vector>
49
50
using namespace llvm;
51
52
25
#define DEBUG_TYPE "loop-interchange"
53
54
STATISTIC(LoopsInterchanged, "Number of loops interchanged");
55
56
static cl::opt<int> LoopInterchangeCostThreshold(
57
    "loop-interchange-threshold", cl::init(0), cl::Hidden,
58
    cl::desc("Interchange if you gain more than this number"));
59
60
namespace {
61
62
using LoopVector = SmallVector<Loop *, 8>;
63
64
// TODO: Check if we can use a sparse matrix here.
65
using CharMatrix = std::vector<std::vector<char>>;
66
67
} // end anonymous namespace
68
69
// Maximum number of dependencies that can be handled in the dependency matrix.
70
static const unsigned MaxMemInstrCount = 100;
71
72
// Maximum loop depth supported.
73
static const unsigned MaxLoopNestDepth = 10;
74
75
#ifdef DUMP_DEP_MATRICIES
76
static void printDepMatrix(CharMatrix &DepMatrix) {
77
  for (auto &Row : DepMatrix) {
78
    for (auto D : Row)
79
      LLVM_DEBUG(dbgs() << D << " ");
80
    LLVM_DEBUG(dbgs() << "\n");
81
  }
82
}
83
#endif
84
85
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
86
33
                                     Loop *L, DependenceInfo *DI) {
87
33
  using ValueVector = SmallVector<Value *, 16>;
88
33
89
33
  ValueVector MemInstr;
90
33
91
33
  // For each block.
92
119
  for (BasicBlock *BB : L->blocks()) {
93
119
    // Scan the BB and collect legal loads and stores.
94
527
    for (Instruction &I : *BB) {
95
527
      if (!isa<Instruction>(I))
96
0
        return false;
97
527
      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
98
42
        if (!Ld->isSimple())
99
0
          return false;
100
42
        MemInstr.push_back(&I);
101
485
      } else if (auto *St = dyn_cast<StoreInst>(&I)) {
102
31
        if (!St->isSimple())
103
0
          return false;
104
31
        MemInstr.push_back(&I);
105
31
      }
106
527
    }
107
119
  }
108
33
109
33
  LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
110
33
                    << " Loads and Stores to analyze\n");
111
33
112
33
  ValueVector::iterator I, IE, J, JE;
113
33
114
106
  for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; 
++I73
) {
115
205
    for (J = I, JE = MemInstr.end(); J != JE; 
++J132
) {
116
132
      std::vector<char> Dep;
117
132
      Instruction *Src = cast<Instruction>(*I);
118
132
      Instruction *Dst = cast<Instruction>(*J);
119
132
      if (Src == Dst)
120
73
        continue;
121
59
      // Ignore Input dependencies.
122
59
      if (isa<LoadInst>(Src) && 
isa<LoadInst>(Dst)55
)
123
18
        continue;
124
41
      // Track Output, Flow, and Anti dependencies.
125
41
      if (auto D = DI->depends(Src, Dst, true)) {
126
26
        assert(D->isOrdered() && "Expected an output, flow or anti dep.");
127
26
        LLVM_DEBUG(StringRef DepType =
128
26
                       D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
129
26
                   dbgs() << "Found " << DepType
130
26
                          << " dependency between Src and Dst\n"
131
26
                          << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
132
26
        unsigned Levels = D->getLevels();
133
26
        char Direction;
134
69
        for (unsigned II = 1; II <= Levels; 
++II43
) {
135
43
          const SCEV *Distance = D->getDistance(II);
136
43
          const SCEVConstant *SCEVConst =
137
43
              dyn_cast_or_null<SCEVConstant>(Distance);
138
43
          if (SCEVConst) {
139
0
            const ConstantInt *CI = SCEVConst->getValue();
140
0
            if (CI->isNegative())
141
0
              Direction = '<';
142
0
            else if (CI->isZero())
143
0
              Direction = '=';
144
0
            else
145
0
              Direction = '>';
146
0
            Dep.push_back(Direction);
147
43
          } else if (D->isScalar(II)) {
148
3
            Direction = 'S';
149
3
            Dep.push_back(Direction);
150
40
          } else {
151
40
            unsigned Dir = D->getDirection(II);
152
40
            if (Dir == Dependence::DVEntry::LT ||
153
40
                Dir == Dependence::DVEntry::LE)
154
1
              Direction = '<';
155
39
            else if (Dir == Dependence::DVEntry::GT ||
156
39
                     
Dir == Dependence::DVEntry::GE36
)
157
3
              Direction = '>';
158
36
            else if (Dir == Dependence::DVEntry::EQ)
159
28
              Direction = '=';
160
8
            else
161
8
              Direction = '*';
162
40
            Dep.push_back(Direction);
163
40
          }
164
43
        }
165
37
        while (Dep.size() != Level) {
166
11
          Dep.push_back('I');
167
11
        }
168
26
169
26
        DepMatrix.push_back(Dep);
170
26
        if (DepMatrix.size() > MaxMemInstrCount) {
171
0
          LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
172
0
                            << " dependencies inside loop\n");
173
0
          return false;
174
0
        }
175
26
      }
176
41
    }
177
73
  }
178
33
179
33
  return true;
180
33
}
181
182
// A loop is moved from index 'from' to an index 'to'. Update the Dependence
183
// matrix by exchanging the two columns.
184
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
185
15
                                    unsigned ToIndx) {
186
15
  unsigned numRows = DepMatrix.size();
187
25
  for (unsigned i = 0; i < numRows; 
++i10
) {
188
10
    char TmpVal = DepMatrix[i][ToIndx];
189
10
    DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
190
10
    DepMatrix[i][FromIndx] = TmpVal;
191
10
  }
192
15
}
193
194
// Checks if outermost non '=','S'or'I' dependence in the dependence matrix is
195
// '>'
196
static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
197
23
                                   unsigned Column) {
198
45
  for (unsigned i = 0; i <= Column; 
++i22
) {
199
25
    if (DepMatrix[Row][i] == '<')
200
1
      return false;
201
24
    if (DepMatrix[Row][i] == '>')
202
2
      return true;
203
24
  }
204
23
  // All dependencies were '=','S' or 'I'
205
23
  
return false20
;
206
23
}
207
208
// Checks if no dependence exist in the dependency matrix in Row before Column.
209
static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
210
0
                                 unsigned Column) {
211
0
  for (unsigned i = 0; i < Column; ++i) {
212
0
    if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
213
0
        DepMatrix[Row][i] != 'I')
214
0
      return false;
215
0
  }
216
0
  return true;
217
0
}
218
219
static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
220
                                unsigned OuterLoopId, char InnerDep,
221
23
                                char OuterDep) {
222
23
  if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
223
2
    return false;
224
21
225
21
  if (InnerDep == OuterDep)
226
18
    return true;
227
3
228
3
  // It is legal to interchange if and only if after interchange no row has a
229
3
  // '>' direction as the leftmost non-'='.
230
3
231
3
  if (InnerDep == '=' || 
InnerDep == 'S'2
||
InnerDep == 'I'2
)
232
2
    return true;
233
1
234
1
  if (InnerDep == '<')
235
0
    return true;
236
1
237
1
  if (InnerDep == '>') {
238
1
    // If OuterLoopId represents outermost loop then interchanging will make the
239
1
    // 1st dependency as '>'
240
1
    if (OuterLoopId == 0)
241
1
      return false;
242
0
243
0
    // If all dependencies before OuterloopId are '=','S'or 'I'. Then
244
0
    // interchanging will result in this row having an outermost non '='
245
0
    // dependency of '>'
246
0
    if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
247
0
      return true;
248
0
  }
249
0
250
0
  return false;
251
0
}
252
253
// Checks if it is legal to interchange 2 loops.
254
// [Theorem] A permutation of the loops in a perfect nest is legal if and only
255
// if the direction matrix, after the same permutation is applied to its
256
// columns, has no ">" direction as the leftmost non-"=" direction in any row.
257
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
258
                                      unsigned InnerLoopId,
259
34
                                      unsigned OuterLoopId) {
260
34
  unsigned NumRows = DepMatrix.size();
261
34
  // For each row check if it is valid to interchange.
262
54
  for (unsigned Row = 0; Row < NumRows; 
++Row20
) {
263
27
    char InnerDep = DepMatrix[Row][InnerLoopId];
264
27
    char OuterDep = DepMatrix[Row][OuterLoopId];
265
27
    if (InnerDep == '*' || 
OuterDep == '*'23
)
266
4
      return false;
267
23
    if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
268
3
      return false;
269
23
  }
270
34
  
return true27
;
271
34
}
272
273
33
static LoopVector populateWorklist(Loop &L) {
274
33
  LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
275
33
                    << L.getHeader()->getParent()->getName() << " Loop: %"
276
33
                    << L.getHeader()->getName() << '\n');
277
33
  LoopVector LoopList;
278
33
  Loop *CurrentLoop = &L;
279
33
  const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
280
68
  while (!Vec->empty()) {
281
35
    // The current loop has multiple subloops in it hence it is not tightly
282
35
    // nested.
283
35
    // Discard all loops above it added into Worklist.
284
35
    if (Vec->size() != 1)
285
0
      return {};
286
35
287
35
    LoopList.push_back(CurrentLoop);
288
35
    CurrentLoop = Vec->front();
289
35
    Vec = &CurrentLoop->getSubLoops();
290
35
  }
291
33
  LoopList.push_back(CurrentLoop);
292
33
  return LoopList;
293
33
}
294
295
14
static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
296
14
  PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
297
14
  if (InnerIndexVar)
298
5
    return InnerIndexVar;
299
9
  if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
300
0
    return nullptr;
301
9
  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); 
++I0
) {
302
9
    PHINode *PhiVar = cast<PHINode>(I);
303
9
    Type *PhiTy = PhiVar->getType();
304
9
    if (!PhiTy->isIntegerTy() && 
!PhiTy->isFloatingPointTy()0
&&
305
9
        
!PhiTy->isPointerTy()0
)
306
0
      return nullptr;
307
9
    const SCEVAddRecExpr *AddRec =
308
9
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
309
9
    if (!AddRec || !AddRec->isAffine())
310
0
      continue;
311
9
    const SCEV *Step = AddRec->getStepRecurrence(*SE);
312
9
    if (!isa<SCEVConstant>(Step))
313
0
      continue;
314
9
    // Found the induction variable.
315
9
    // FIXME: Handle loops with more than one induction variable. Note that,
316
9
    // currently, legality makes sure we have only one induction variable.
317
9
    return PhiVar;
318
9
  }
319
9
  
return nullptr0
;
320
9
}
321
322
namespace {
323
324
/// LoopInterchangeLegality checks if it is legal to interchange the loop.
325
class LoopInterchangeLegality {
326
public:
327
  LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
328
                          OptimizationRemarkEmitter *ORE)
329
34
      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
330
331
  /// Check if the loops can be interchanged.
332
  bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
333
                           CharMatrix &DepMatrix);
334
335
  /// Check if the loop structure is understood. We do not handle triangular
336
  /// loops for now.
337
  bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
338
339
  bool currentLimitations();
340
341
15
  const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
342
15
    return OuterInnerReductions;
343
15
  }
344
345
private:
346
  bool tightlyNested(Loop *Outer, Loop *Inner);
347
  bool containsUnsafeInstructions(BasicBlock *BB);
348
349
  /// Discover induction and reduction PHIs in the header of \p L. Induction
350
  /// PHIs are added to \p Inductions, reductions are added to
351
  /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
352
  /// to be passed as \p InnerLoop.
353
  bool findInductionAndReductions(Loop *L,
354
                                  SmallVector<PHINode *, 8> &Inductions,
355
                                  Loop *InnerLoop);
356
357
  Loop *OuterLoop;
358
  Loop *InnerLoop;
359
360
  ScalarEvolution *SE;
361
362
  /// Interface to emit optimization remarks.
363
  OptimizationRemarkEmitter *ORE;
364
365
  /// Set of reduction PHIs taking part of a reduction across the inner and
366
  /// outer loop.
367
  SmallPtrSet<PHINode *, 4> OuterInnerReductions;
368
};
369
370
/// LoopInterchangeProfitability checks if it is profitable to interchange the
371
/// loop.
372
class LoopInterchangeProfitability {
373
public:
374
  LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
375
                               OptimizationRemarkEmitter *ORE)
376
17
      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
377
378
  /// Check if the loop interchange is profitable.
379
  bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
380
                    CharMatrix &DepMatrix);
381
382
private:
383
  int getInstrOrderCost();
384
385
  Loop *OuterLoop;
386
  Loop *InnerLoop;
387
388
  /// Scev analysis.
389
  ScalarEvolution *SE;
390
391
  /// Interface to emit optimization remarks.
392
  OptimizationRemarkEmitter *ORE;
393
};
394
395
/// LoopInterchangeTransform interchanges the loop.
396
class LoopInterchangeTransform {
397
public:
398
  LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
399
                           LoopInfo *LI, DominatorTree *DT,
400
                           BasicBlock *LoopNestExit,
401
                           const LoopInterchangeLegality &LIL)
402
      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
403
15
        LoopExit(LoopNestExit), LIL(LIL) {}
404
405
  /// Interchange OuterLoop and InnerLoop.
406
  bool transform();
407
  void restructureLoops(Loop *NewInner, Loop *NewOuter,
408
                        BasicBlock *OrigInnerPreHeader,
409
                        BasicBlock *OrigOuterPreHeader);
410
  void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
411
412
private:
413
  void splitInnerLoopLatch(Instruction *);
414
  void splitInnerLoopHeader();
415
  bool adjustLoopLinks();
416
  void adjustLoopPreheaders();
417
  bool adjustLoopBranches();
418
419
  Loop *OuterLoop;
420
  Loop *InnerLoop;
421
422
  /// Scev analysis.
423
  ScalarEvolution *SE;
424
425
  LoopInfo *LI;
426
  DominatorTree *DT;
427
  BasicBlock *LoopExit;
428
429
  const LoopInterchangeLegality &LIL;
430
};
431
432
// Main LoopInterchange Pass.
433
struct LoopInterchange : public LoopPass {
434
  static char ID;
435
  ScalarEvolution *SE = nullptr;
436
  LoopInfo *LI = nullptr;
437
  DependenceInfo *DI = nullptr;
438
  DominatorTree *DT = nullptr;
439
440
  /// Interface to emit optimization remarks.
441
  OptimizationRemarkEmitter *ORE;
442
443
12
  LoopInterchange() : LoopPass(ID) {
444
12
    initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
445
12
  }
446
447
12
  void getAnalysisUsage(AnalysisUsage &AU) const override {
448
12
    AU.addRequired<DependenceAnalysisWrapperPass>();
449
12
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
450
12
451
12
    getLoopAnalysisUsage(AU);
452
12
  }
453
454
68
  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
455
68
    if (skipLoop(L) || L->getParentLoop())
456
35
      return false;
457
33
458
33
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
459
33
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
460
33
    DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
461
33
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
462
33
    ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
463
33
464
33
    return processLoopList(populateWorklist(*L));
465
33
  }
466
467
33
  bool isComputableLoopNest(LoopVector LoopList) {
468
68
    for (Loop *L : LoopList) {
469
68
      const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
470
68
      if (ExitCountOuter == SE->getCouldNotCompute()) {
471
0
        LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
472
0
        return false;
473
0
      }
474
68
      if (L->getNumBackEdges() != 1) {
475
0
        LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
476
0
        return false;
477
0
      }
478
68
      if (!L->getExitingBlock()) {
479
0
        LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
480
0
        return false;
481
0
      }
482
68
    }
483
33
    return true;
484
33
  }
485
486
33
  unsigned selectLoopForInterchange(const LoopVector &LoopList) {
487
33
    // TODO: Add a better heuristic to select the loop to be interchanged based
488
33
    // on the dependence matrix. Currently we select the innermost loop.
489
33
    return LoopList.size() - 1;
490
33
  }
491
492
33
  bool processLoopList(LoopVector LoopList) {
493
33
    bool Changed = false;
494
33
    unsigned LoopNestDepth = LoopList.size();
495
33
    if (LoopNestDepth < 2) {
496
0
      LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
497
0
      return false;
498
0
    }
499
33
    if (LoopNestDepth > MaxLoopNestDepth) {
500
0
      LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
501
0
                        << MaxLoopNestDepth << "\n");
502
0
      return false;
503
0
    }
504
33
    if (!isComputableLoopNest(LoopList)) {
505
0
      LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
506
0
      return false;
507
0
    }
508
33
509
33
    LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
510
33
                      << "\n");
511
33
512
33
    CharMatrix DependencyMatrix;
513
33
    Loop *OuterMostLoop = *(LoopList.begin());
514
33
    if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
515
33
                                  OuterMostLoop, DI)) {
516
0
      LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
517
0
      return false;
518
0
    }
519
#ifdef DUMP_DEP_MATRICIES
520
    LLVM_DEBUG(dbgs() << "Dependence before interchange\n");
521
    printDepMatrix(DependencyMatrix);
522
#endif
523
524
33
    // Get the Outermost loop exit.
525
33
    BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
526
33
    if (!LoopNestExit) {
527
0
      LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
528
0
      return false;
529
0
    }
530
33
531
33
    unsigned SelecLoopId = selectLoopForInterchange(LoopList);
532
33
    // Move the selected loop outwards to the best possible position.
533
48
    for (unsigned i = SelecLoopId; i > 0; 
i--15
) {
534
34
      bool Interchanged =
535
34
          processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
536
34
      if (!Interchanged)
537
19
        return Changed;
538
15
      // Loops interchanged reflect the same in LoopList
539
15
      std::swap(LoopList[i - 1], LoopList[i]);
540
15
541
15
      // Update the DependencyMatrix
542
15
      interChangeDependencies(DependencyMatrix, i, i - 1);
543
#ifdef DUMP_DEP_MATRICIES
544
      LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
545
      printDepMatrix(DependencyMatrix);
546
#endif
547
      Changed |= Interchanged;
548
15
    }
549
33
    
return Changed14
;
550
33
  }
551
552
  bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
553
                   unsigned OuterLoopId, BasicBlock *LoopNestExit,
554
34
                   std::vector<std::vector<char>> &DependencyMatrix) {
555
34
    LLVM_DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
556
34
                      << " and OuterLoopId = " << OuterLoopId << "\n");
557
34
    Loop *InnerLoop = LoopList[InnerLoopId];
558
34
    Loop *OuterLoop = LoopList[OuterLoopId];
559
34
560
34
    LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
561
34
    if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
562
17
      LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
563
17
      return false;
564
17
    }
565
17
    LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
566
17
    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
567
17
    if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
568
2
      LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
569
2
      return false;
570
2
    }
571
15
572
15
    ORE->emit([&]() {
573
7
      return OptimizationRemark(DEBUG_TYPE, "Interchanged",
574
7
                                InnerLoop->getStartLoc(),
575
7
                                InnerLoop->getHeader())
576
7
             << "Loop interchanged with enclosing loop.";
577
7
    });
578
15
579
15
    LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit,
580
15
                                 LIL);
581
15
    LIT.transform();
582
15
    LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
583
15
    LoopsInterchanged++;
584
15
    return true;
585
15
  }
586
};
587
588
} // end anonymous namespace
589
590
40
bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
591
113
  return any_of(*BB, [](const Instruction &I) {
592
113
    return I.mayHaveSideEffects() || I.mayReadFromMemory();
593
113
  });
594
40
}
595
596
20
bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
597
20
  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
598
20
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
599
20
  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
600
20
601
20
  LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
602
20
603
20
  // A perfectly nested loop will not have any branch in between the outer and
604
20
  // inner block i.e. outer header will branch to either inner preheader and
605
20
  // outerloop latch.
606
20
  BranchInst *OuterLoopHeaderBI =
607
20
      dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
608
20
  if (!OuterLoopHeaderBI)
609
0
    return false;
610
20
611
20
  for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
612
24
    if (Succ != InnerLoopPreHeader && 
Succ != InnerLoop->getHeader()19
&&
613
24
        
Succ != OuterLoopLatch4
)
614
0
      return false;
615
20
616
20
  LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
617
20
  // We do not have any basic block in between now make sure the outer header
618
20
  // and outer loop latch doesn't contain any unsafe instructions.
619
20
  if (containsUnsafeInstructions(OuterLoopHeader) ||
620
20
      containsUnsafeInstructions(OuterLoopLatch))
621
0
    return false;
622
20
623
20
  LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
624
20
  // We have a perfect loop nest.
625
20
  return true;
626
20
}
627
628
bool LoopInterchangeLegality::isLoopStructureUnderstood(
629
20
    PHINode *InnerInduction) {
630
20
  unsigned Num = InnerInduction->getNumOperands();
631
20
  BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
632
60
  for (unsigned i = 0; i < Num; 
++i40
) {
633
40
    Value *Val = InnerInduction->getOperand(i);
634
40
    if (isa<Constant>(Val))
635
20
      continue;
636
20
    Instruction *I = dyn_cast<Instruction>(Val);
637
20
    if (!I)
638
0
      return false;
639
20
    // TODO: Handle triangular loops.
640
20
    // e.g. for(int i=0;i<N;i++)
641
20
    //        for(int j=i;j<N;j++)
642
20
    unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
643
20
    if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
644
20
            InnerLoopPreheader &&
645
20
        
!OuterLoop->isLoopInvariant(I)0
) {
646
0
      return false;
647
0
    }
648
20
  }
649
20
  return true;
650
20
}
651
652
// If SV is a LCSSA PHI node with a single incoming value, return the incoming
653
// value.
654
9
static Value *followLCSSA(Value *SV) {
655
9
  PHINode *PHI = dyn_cast<PHINode>(SV);
656
9
  if (!PHI)
657
6
    return SV;
658
3
659
3
  if (PHI->getNumIncomingValues() != 1)
660
0
    return SV;
661
3
  return followLCSSA(PHI->getIncomingValue(0));
662
3
}
663
664
// Check V's users to see if it is involved in a reduction in L.
665
6
static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
666
11
  for (Value *User : V->users()) {
667
11
    if (PHINode *PHI = dyn_cast<PHINode>(User)) {
668
11
      if (PHI->getNumIncomingValues() == 1)
669
5
        continue;
670
6
      RecurrenceDescriptor RD;
671
6
      if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
672
3
        return PHI;
673
3
      return nullptr;
674
3
    }
675
11
  }
676
6
677
6
  
return nullptr0
;
678
6
}
679
680
bool LoopInterchangeLegality::findInductionAndReductions(
681
47
    Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
682
47
  if (!L->getLoopLatch() || !L->getLoopPredecessor())
683
0
    return false;
684
55
  
for (PHINode &PHI : L->getHeader()->phis())47
{
685
55
    RecurrenceDescriptor RD;
686
55
    InductionDescriptor ID;
687
55
    if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
688
47
      Inductions.push_back(&PHI);
689
8
    else {
690
8
      // PHIs in inner loops need to be part of a reduction in the outer loop,
691
8
      // discovered when checking the PHIs of the outer loop earlier.
692
8
      if (!InnerLoop) {
693
2
        if (OuterInnerReductions.find(&PHI) == OuterInnerReductions.end()) {
694
1
          LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
695
1
                               "across the outer loop.\n");
696
1
          return false;
697
1
        }
698
6
      } else {
699
6
        assert(PHI.getNumIncomingValues() == 2 &&
700
6
               "Phis in loop header should have exactly 2 incoming values");
701
6
        // Check if we have a PHI node in the outer loop that has a reduction
702
6
        // result from the inner loop as an incoming value.
703
6
        Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
704
6
        PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
705
6
        if (!InnerRedPhi ||
706
6
            !llvm::any_of(InnerRedPhi->incoming_values(),
707
5
                          [&PHI](Value *V) { return V == &PHI; })) {
708
5
          LLVM_DEBUG(
709
5
              dbgs()
710
5
              << "Failed to recognize PHI as an induction or reduction.\n");
711
5
          return false;
712
5
        }
713
1
        OuterInnerReductions.insert(&PHI);
714
1
        OuterInnerReductions.insert(InnerRedPhi);
715
1
      }
716
8
    }
717
55
  }
718
47
  
return true41
;
719
47
}
720
721
20
static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
722
20
  for (PHINode &PHI : Block->phis()) {
723
8
    // Reduction lcssa phi will have only 1 incoming block that from loop latch.
724
8
    if (PHI.getNumIncomingValues() > 1)
725
0
      return false;
726
8
    Instruction *Ins = dyn_cast<Instruction>(PHI.getIncomingValue(0));
727
8
    if (!Ins)
728
0
      return false;
729
8
    // Incoming value for lcssa phi's in outer loop exit can only be inner loop
730
8
    // exits lcssa phi else it would not be tightly nested.
731
8
    if (!isa<PHINode>(Ins) && 
isOuterLoopExitBlock4
)
732
0
      return false;
733
8
  }
734
20
  return true;
735
20
}
736
737
// This function indicates the current limitations in the transform as a result
738
// of which we do not proceed.
739
27
bool LoopInterchangeLegality::currentLimitations() {
740
27
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
741
27
  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
742
27
743
27
  // transform currently expects the loop latches to also be the exiting
744
27
  // blocks.
745
27
  if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
746
27
      
OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch()26
||
747
27
      
!isa<BranchInst>(InnerLoopLatch->getTerminator())26
||
748
27
      
!isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())26
) {
749
1
    LLVM_DEBUG(
750
1
        dbgs() << "Loops where the latch is not the exiting block are not"
751
1
               << " supported currently.\n");
752
1
    ORE->emit([&]() {
753
1
      return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
754
1
                                      OuterLoop->getStartLoc(),
755
1
                                      OuterLoop->getHeader())
756
1
             << "Loops where the latch is not the exiting block cannot be"
757
1
                " interchange currently.";
758
1
    });
759
1
    return true;
760
1
  }
761
26
762
26
  PHINode *InnerInductionVar;
763
26
  SmallVector<PHINode *, 8> Inductions;
764
26
  if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
765
5
    LLVM_DEBUG(
766
5
        dbgs() << "Only outer loops with induction or reduction PHI nodes "
767
5
               << "are supported currently.\n");
768
5
    ORE->emit([&]() {
769
5
      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
770
5
                                      OuterLoop->getStartLoc(),
771
5
                                      OuterLoop->getHeader())
772
5
             << "Only outer loops with induction or reduction PHI nodes can be"
773
5
                " interchanged currently.";
774
5
    });
775
5
    return true;
776
5
  }
777
21
778
21
  // TODO: Currently we handle only loops with 1 induction variable.
779
21
  if (Inductions.size() != 1) {
780
0
    LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
781
0
                      << "supported currently.\n");
782
0
    ORE->emit([&]() {
783
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
784
0
                                      OuterLoop->getStartLoc(),
785
0
                                      OuterLoop->getHeader())
786
0
             << "Only outer loops with 1 induction variable can be "
787
0
                "interchanged currently.";
788
0
    });
789
0
    return true;
790
0
  }
791
21
792
21
  Inductions.clear();
793
21
  if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) {
794
1
    LLVM_DEBUG(
795
1
        dbgs() << "Only inner loops with induction or reduction PHI nodes "
796
1
               << "are supported currently.\n");
797
1
    ORE->emit([&]() {
798
1
      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
799
1
                                      InnerLoop->getStartLoc(),
800
1
                                      InnerLoop->getHeader())
801
1
             << "Only inner loops with induction or reduction PHI nodes can be"
802
1
                " interchange currently.";
803
1
    });
804
1
    return true;
805
1
  }
806
20
807
20
  // TODO: Currently we handle only loops with 1 induction variable.
808
20
  if (Inductions.size() != 1) {
809
0
    LLVM_DEBUG(
810
0
        dbgs() << "We currently only support loops with 1 induction variable."
811
0
               << "Failed to interchange due to current limitation\n");
812
0
    ORE->emit([&]() {
813
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
814
0
                                      InnerLoop->getStartLoc(),
815
0
                                      InnerLoop->getHeader())
816
0
             << "Only inner loops with 1 induction variable can be "
817
0
                "interchanged currently.";
818
0
    });
819
0
    return true;
820
0
  }
821
20
  InnerInductionVar = Inductions.pop_back_val();
822
20
823
20
  // TODO: Triangular loops are not handled for now.
824
20
  if (!isLoopStructureUnderstood(InnerInductionVar)) {
825
0
    LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
826
0
    ORE->emit([&]() {
827
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
828
0
                                      InnerLoop->getStartLoc(),
829
0
                                      InnerLoop->getHeader())
830
0
             << "Inner loop structure not understood currently.";
831
0
    });
832
0
    return true;
833
0
  }
834
20
835
20
  // TODO: We only handle LCSSA PHI's corresponding to reduction for now.
836
20
  BasicBlock *InnerExit = InnerLoop->getExitBlock();
837
20
  if (!containsSafePHI(InnerExit, false)) {
838
0
    LLVM_DEBUG(
839
0
        dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
840
0
    ORE->emit([&]() {
841
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner",
842
0
                                      InnerLoop->getStartLoc(),
843
0
                                      InnerLoop->getHeader())
844
0
             << "Only inner loops with LCSSA PHIs can be interchange "
845
0
                "currently.";
846
0
    });
847
0
    return true;
848
0
  }
849
20
850
20
  // TODO: Current limitation: Since we split the inner loop latch at the point
851
20
  // were induction variable is incremented (induction.next); We cannot have
852
20
  // more than 1 user of induction.next since it would result in broken code
853
20
  // after split.
854
20
  // e.g.
855
20
  // for(i=0;i<N;i++) {
856
20
  //    for(j = 0;j<M;j++) {
857
20
  //      A[j+1][i+2] = A[j][i]+k;
858
20
  //  }
859
20
  // }
860
20
  Instruction *InnerIndexVarInc = nullptr;
861
20
  if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
862
11
    InnerIndexVarInc =
863
11
        dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
864
9
  else
865
9
    InnerIndexVarInc =
866
9
        dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
867
20
868
20
  if (!InnerIndexVarInc) {
869
0
    LLVM_DEBUG(
870
0
        dbgs() << "Did not find an instruction to increment the induction "
871
0
               << "variable.\n");
872
0
    ORE->emit([&]() {
873
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
874
0
                                      InnerLoop->getStartLoc(),
875
0
                                      InnerLoop->getHeader())
876
0
             << "The inner loop does not increment the induction variable.";
877
0
    });
878
0
    return true;
879
0
  }
880
20
881
20
  // Since we split the inner loop latch on this induction variable. Make sure
882
20
  // we do not have any instruction between the induction variable and branch
883
20
  // instruction.
884
20
885
20
  bool FoundInduction = false;
886
20
  for (const Instruction &I :
887
60
       llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) {
888
60
    if (isa<BranchInst>(I) || 
isa<CmpInst>(I)40
||
isa<TruncInst>(I)21
||
889
60
        
isa<ZExtInst>(I)21
)
890
40
      continue;
891
20
892
20
    // We found an instruction. If this is not induction variable then it is not
893
20
    // safe to split this loop latch.
894
20
    if (!I.isIdenticalTo(InnerIndexVarInc)) {
895
0
      LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "
896
0
                        << "variable increment and branch.\n");
897
0
      ORE->emit([&]() {
898
0
        return OptimizationRemarkMissed(
899
0
                   DEBUG_TYPE, "UnsupportedInsBetweenInduction",
900
0
                   InnerLoop->getStartLoc(), InnerLoop->getHeader())
901
0
               << "Found unsupported instruction between induction variable "
902
0
                  "increment and branch.";
903
0
      });
904
0
      return true;
905
0
    }
906
20
907
20
    FoundInduction = true;
908
20
    break;
909
20
  }
910
20
  // The loop latch ended and we didn't find the induction variable return as
911
20
  // current limitation.
912
20
  if (!FoundInduction) {
913
0
    LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n");
914
0
    ORE->emit([&]() {
915
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
916
0
                                      InnerLoop->getStartLoc(),
917
0
                                      InnerLoop->getHeader())
918
0
             << "Did not find the induction variable.";
919
0
    });
920
0
    return true;
921
0
  }
922
20
  return false;
923
20
}
924
925
// We currently support LCSSA PHI nodes in the outer loop exit, if their
926
// incoming values do not come from the outer loop latch or if the
927
// outer loop latch has a single predecessor. In that case, the value will
928
// be available if both the inner and outer loop conditions are true, which
929
// will still be true after interchanging. If we have multiple predecessor,
930
// that may not be the case, e.g. because the outer loop latch may be executed
931
// if the inner loop is not executed.
932
20
static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
933
20
  BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
934
20
  for (PHINode &PHI : LoopNestExit->phis()) {
935
14
    //  FIXME: We currently are not able to detect floating point reductions
936
14
    //         and have to use floating point PHIs as a proxy to prevent
937
14
    //         interchanging in the presence of floating point reductions.
938
14
    if (PHI.getType()->isFloatingPointTy())
939
0
      return false;
940
25
    
for (unsigned i = 0; 14
i < PHI.getNumIncomingValues();
i++11
) {
941
14
     Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
942
14
     if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
943
2
       continue;
944
12
945
12
     // The incoming value is defined in the outer loop latch. Currently we
946
12
     // only support that in case the outer loop latch has a single predecessor.
947
12
     // This guarantees that the outer loop latch is executed if and only if
948
12
     // the inner loop is executed (because tightlyNested() guarantees that the
949
12
     // outer loop header only branches to the inner loop or the outer loop
950
12
     // latch).
951
12
     // FIXME: We could weaken this logic and allow multiple predecessors,
952
12
     //        if the values are produced outside the loop latch. We would need
953
12
     //        additional logic to update the PHI nodes in the exit block as
954
12
     //        well.
955
12
     if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
956
3
       return false;
957
12
    }
958
14
  }
959
20
  
return true17
;
960
20
}
961
962
bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
963
                                                  unsigned OuterLoopId,
964
34
                                                  CharMatrix &DepMatrix) {
965
34
  if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
966
7
    LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
967
7
                      << " and OuterLoopId = " << OuterLoopId
968
7
                      << " due to dependence\n");
969
7
    ORE->emit([&]() {
970
6
      return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
971
6
                                      InnerLoop->getStartLoc(),
972
6
                                      InnerLoop->getHeader())
973
6
             << "Cannot interchange loops due to dependences.";
974
6
    });
975
7
    return false;
976
7
  }
977
27
  // Check if outer and inner loop contain legal instructions only.
978
27
  for (auto *BB : OuterLoop->blocks())
979
99
    for (Instruction &I : BB->instructionsWithoutDebug())
980
417
      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
981
0
        // readnone functions do not prevent interchanging.
982
0
        if (CI->doesNotReadMemory())
983
0
          continue;
984
0
        LLVM_DEBUG(
985
0
            dbgs() << "Loops with call instructions cannot be interchanged "
986
0
                   << "safely.");
987
0
        ORE->emit([&]() {
988
0
          return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
989
0
                                          CI->getDebugLoc(),
990
0
                                          CI->getParent())
991
0
                 << "Cannot interchange loops due to call instruction.";
992
0
        });
993
0
994
0
        return false;
995
0
      }
996
27
997
27
  // TODO: The loops could not be interchanged due to current limitations in the
998
27
  // transform module.
999
27
  if (currentLimitations()) {
1000
7
    LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1001
7
    return false;
1002
7
  }
1003
20
1004
20
  // Check if the loops are tightly nested.
1005
20
  if (!tightlyNested(OuterLoop, InnerLoop)) {
1006
0
    LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1007
0
    ORE->emit([&]() {
1008
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1009
0
                                      InnerLoop->getStartLoc(),
1010
0
                                      InnerLoop->getHeader())
1011
0
             << "Cannot interchange loops because they are not tightly "
1012
0
                "nested.";
1013
0
    });
1014
0
    return false;
1015
0
  }
1016
20
1017
20
  if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1018
3
    LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1019
3
    ORE->emit([&]() {
1020
3
      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1021
3
                                      OuterLoop->getStartLoc(),
1022
3
                                      OuterLoop->getHeader())
1023
3
             << "Found unsupported PHI node in loop exit.";
1024
3
    });
1025
3
    return false;
1026
3
  }
1027
17
1028
17
  return true;
1029
17
}
1030
1031
17
int LoopInterchangeProfitability::getInstrOrderCost() {
1032
17
  unsigned GoodOrder, BadOrder;
1033
17
  BadOrder = GoodOrder = 0;
1034
24
  for (BasicBlock *BB : InnerLoop->blocks()) {
1035
153
    for (Instruction &Ins : *BB) {
1036
153
      if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1037
21
        unsigned NumOp = GEP->getNumOperands();
1038
21
        bool FoundInnerInduction = false;
1039
21
        bool FoundOuterInduction = false;
1040
82
        for (unsigned i = 0; i < NumOp; 
++i61
) {
1041
80
          const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
1042
80
          const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1043
80
          if (!AR)
1044
40
            continue;
1045
40
1046
40
          // If we find the inner induction after an outer induction e.g.
1047
40
          // for(int i=0;i<N;i++)
1048
40
          //   for(int j=0;j<N;j++)
1049
40
          //     A[i][j] = A[i-1][j-1]+k;
1050
40
          // then it is a good order.
1051
40
          if (AR->getLoop() == InnerLoop) {
1052
21
            // We found an InnerLoop induction after OuterLoop induction. It is
1053
21
            // a good order.
1054
21
            FoundInnerInduction = true;
1055
21
            if (FoundOuterInduction) {
1056
3
              GoodOrder++;
1057
3
              break;
1058
3
            }
1059
37
          }
1060
37
          // If we find the outer induction after an inner induction e.g.
1061
37
          // for(int i=0;i<N;i++)
1062
37
          //   for(int j=0;j<N;j++)
1063
37
          //     A[j][i] = A[j-1][i-1]+k;
1064
37
          // then it is a bad order.
1065
37
          if (AR->getLoop() == OuterLoop) {
1066
19
            // We found an OuterLoop induction after InnerLoop induction. It is
1067
19
            // a bad order.
1068
19
            FoundOuterInduction = true;
1069
19
            if (FoundInnerInduction) {
1070
16
              BadOrder++;
1071
16
              break;
1072
16
            }
1073
19
          }
1074
37
        }
1075
21
      }
1076
153
    }
1077
24
  }
1078
17
  return GoodOrder - BadOrder;
1079
17
}
1080
1081
static bool isProfitableForVectorization(unsigned InnerLoopId,
1082
                                         unsigned OuterLoopId,
1083
2
                                         CharMatrix &DepMatrix) {
1084
2
  // TODO: Improve this heuristic to catch more cases.
1085
2
  // If the inner loop is loop independent or doesn't carry any dependency it is
1086
2
  // profitable to move this to outer position.
1087
2
  for (auto &Row : DepMatrix) {
1088
2
    if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
1089
2
      return false;
1090
0
    // TODO: We need to improve this heuristic.
1091
0
    if (Row[OuterLoopId] != '=')
1092
0
      return false;
1093
0
  }
1094
2
  // If outer loop has dependence and inner loop is loop independent then it is
1095
2
  // profitable to interchange to enable parallelism.
1096
2
  // If there are no dependences, interchanging will not improve anything.
1097
2
  
return !DepMatrix.empty()0
;
1098
2
}
1099
1100
bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
1101
                                                unsigned OuterLoopId,
1102
17
                                                CharMatrix &DepMatrix) {
1103
17
  // TODO: Add better profitability checks.
1104
17
  // e.g
1105
17
  // 1) Construct dependency matrix and move the one with no loop carried dep
1106
17
  //    inside to enable vectorization.
1107
17
1108
17
  // This is rough cost estimation algorithm. It counts the good and bad order
1109
17
  // of induction variables in the instruction and allows reordering if number
1110
17
  // of bad orders is more than good.
1111
17
  int Cost = getInstrOrderCost();
1112
17
  LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1113
17
  if (Cost < -LoopInterchangeCostThreshold)
1114
15
    return true;
1115
2
1116
2
  // It is not profitable as per current cache profitability model. But check if
1117
2
  // we can move this loop outside to improve parallelism.
1118
2
  if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
1119
0
    return true;
1120
2
1121
2
  ORE->emit([&]() {
1122
2
    return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1123
2
                                    InnerLoop->getStartLoc(),
1124
2
                                    InnerLoop->getHeader())
1125
2
           << "Interchanging loops is too costly (cost="
1126
2
           << ore::NV("Cost", Cost) << ", threshold="
1127
2
           << ore::NV("Threshold", LoopInterchangeCostThreshold)
1128
2
           << ") and it does not improve parallelism.";
1129
2
  });
1130
2
  return false;
1131
2
}
1132
1133
void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1134
16
                                               Loop *InnerLoop) {
1135
16
  for (Loop *L : *OuterLoop)
1136
16
    if (L == InnerLoop) {
1137
16
      OuterLoop->removeChildLoop(L);
1138
16
      return;
1139
16
    }
1140
16
  
llvm_unreachable0
("Couldn't find loop");
1141
16
}
1142
1143
/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1144
/// new inner and outer loop after interchanging: NewInner is the original
1145
/// outer loop and NewOuter is the original inner loop.
1146
///
1147
/// Before interchanging, we have the following structure
1148
/// Outer preheader
1149
//  Outer header
1150
//    Inner preheader
1151
//    Inner header
1152
//      Inner body
1153
//      Inner latch
1154
//   outer bbs
1155
//   Outer latch
1156
//
1157
// After interchanging:
1158
// Inner preheader
1159
// Inner header
1160
//   Outer preheader
1161
//   Outer header
1162
//     Inner body
1163
//     outer bbs
1164
//     Outer latch
1165
//   Inner latch
1166
void LoopInterchangeTransform::restructureLoops(
1167
    Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1168
15
    BasicBlock *OrigOuterPreHeader) {
1169
15
  Loop *OuterLoopParent = OuterLoop->getParentLoop();
1170
15
  // The original inner loop preheader moves from the new inner loop to
1171
15
  // the parent loop, if there is one.
1172
15
  NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1173
15
  LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1174
15
1175
15
  // Switch the loop levels.
1176
15
  if (OuterLoopParent) {
1177
1
    // Remove the loop from its parent loop.
1178
1
    removeChildLoop(OuterLoopParent, NewInner);
1179
1
    removeChildLoop(NewInner, NewOuter);
1180
1
    OuterLoopParent->addChildLoop(NewOuter);
1181
14
  } else {
1182
14
    removeChildLoop(NewInner, NewOuter);
1183
14
    LI->changeTopLevelLoop(NewInner, NewOuter);
1184
14
  }
1185
16
  while (!NewOuter->empty())
1186
1
    NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1187
15
  NewOuter->addChildLoop(NewInner);
1188
15
1189
15
  // BBs from the original inner loop.
1190
15
  SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1191
15
1192
15
  // Add BBs from the original outer loop to the original inner loop (excluding
1193
15
  // BBs already in inner loop)
1194
15
  for (BasicBlock *BB : NewInner->blocks())
1195
82
    if (LI->getLoopFor(BB) == NewInner)
1196
32
      NewOuter->addBlockEntry(BB);
1197
15
1198
15
  // Now remove inner loop header and latch from the new inner loop and move
1199
15
  // other BBs (the loop body) to the new inner loop.
1200
15
  BasicBlock *OuterHeader = NewOuter->getHeader();
1201
15
  BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1202
50
  for (BasicBlock *BB : OrigInnerBBs) {
1203
50
    // Nothing will change for BBs in child loops.
1204
50
    if (LI->getLoopFor(BB) != NewOuter)
1205
3
      continue;
1206
47
    // Remove the new outer loop header and latch from the new inner loop.
1207
47
    if (BB == OuterHeader || 
BB == OuterLatch32
)
1208
30
      NewInner->removeBlockFromLoop(BB);
1209
17
    else
1210
17
      LI->changeLoopFor(BB, NewInner);
1211
47
  }
1212
15
1213
15
  // The preheader of the original outer loop becomes part of the new
1214
15
  // outer loop.
1215
15
  NewOuter->addBlockEntry(OrigOuterPreHeader);
1216
15
  LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1217
15
1218
15
  // Tell SE that we move the loops around.
1219
15
  SE->forgetLoop(NewOuter);
1220
15
  SE->forgetLoop(NewInner);
1221
15
}
1222
1223
15
bool LoopInterchangeTransform::transform() {
1224
15
  bool Transformed = false;
1225
15
  Instruction *InnerIndexVar;
1226
15
1227
15
  if (InnerLoop->getSubLoops().empty()) {
1228
14
    BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1229
14
    LLVM_DEBUG(dbgs() << "Calling Split Inner Loop\n");
1230
14
    PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
1231
14
    if (!InductionPHI) {
1232
0
      LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1233
0
      return false;
1234
0
    }
1235
14
1236
14
    if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1237
9
      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
1238
5
    else
1239
5
      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
1240
14
1241
14
    // Ensure that InductionPHI is the first Phi node.
1242
14
    if (&InductionPHI->getParent()->front() != InductionPHI)
1243
0
      InductionPHI->moveBefore(&InductionPHI->getParent()->front());
1244
14
1245
14
    // Split at the place were the induction variable is
1246
14
    // incremented/decremented.
1247
14
    // TODO: This splitting logic may not work always. Fix this.
1248
14
    splitInnerLoopLatch(InnerIndexVar);
1249
14
    LLVM_DEBUG(dbgs() << "splitInnerLoopLatch done\n");
1250
14
1251
14
    // Splits the inner loops phi nodes out into a separate basic block.
1252
14
    BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1253
14
    SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
1254
14
    LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
1255
14
  }
1256
15
1257
15
  Transformed |= adjustLoopLinks();
1258
15
  if (!Transformed) {
1259
0
    LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
1260
0
    return false;
1261
0
  }
1262
15
1263
15
  return true;
1264
15
}
1265
1266
14
void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
1267
14
  SplitBlock(InnerLoop->getLoopLatch(), Inc, DT, LI);
1268
14
}
1269
1270
/// \brief Move all instructions except the terminator from FromBB right before
1271
/// InsertBefore
1272
30
static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1273
30
  auto &ToList = InsertBefore->getParent()->getInstList();
1274
30
  auto &FromList = FromBB->getInstList();
1275
30
1276
30
  ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
1277
30
                FromBB->getTerminator()->getIterator());
1278
30
}
1279
1280
/// Update BI to jump to NewBB instead of OldBB. Records updates to
1281
/// the dominator tree in DTUpdates, if DT should be preserved.
1282
static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
1283
                            BasicBlock *NewBB,
1284
105
                            std::vector<DominatorTree::UpdateType> &DTUpdates) {
1285
105
  assert(llvm::count_if(successors(BI),
1286
105
                        [OldBB](BasicBlock *BB) { return BB == OldBB; }) < 2 &&
1287
105
         "BI must jump to OldBB at most once.");
1288
131
  for (unsigned i = 0, e = BI->getNumSuccessors(); i < e; 
++i26
) {
1289
117
    if (BI->getSuccessor(i) == OldBB) {
1290
91
      BI->setSuccessor(i, NewBB);
1291
91
1292
91
      DTUpdates.push_back(
1293
91
          {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
1294
91
      DTUpdates.push_back(
1295
91
          {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
1296
91
      break;
1297
91
    }
1298
117
  }
1299
105
}
1300
1301
// Move Lcssa PHIs to the right place.
1302
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
1303
                          BasicBlock *InnerLatch, BasicBlock *OuterHeader,
1304
15
                          BasicBlock *OuterLatch, BasicBlock *OuterExit) {
1305
15
1306
15
  // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
1307
15
  // defined either in the header or latch. Those blocks will become header and
1308
15
  // latch of the new outer loop, and the only possible users can PHI nodes
1309
15
  // in the exit block of the loop nest or the outer loop header (reduction
1310
15
  // PHIs, in that case, the incoming value must be defined in the inner loop
1311
15
  // header). We can just substitute the user with the incoming value and remove
1312
15
  // the PHI.
1313
15
  for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
1314
7
    assert(P.getNumIncomingValues() == 1 &&
1315
7
           "Only loops with a single exit are supported!");
1316
7
1317
7
    // Incoming values are guaranteed be instructions currently.
1318
7
    auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
1319
7
    // Skip phis with incoming values from the inner loop body, excluding the
1320
7
    // header and latch.
1321
7
    if (IncI->getParent() != InnerLatch && 
IncI->getParent() != InnerHeader5
)
1322
1
      continue;
1323
6
1324
6
    assert(all_of(P.users(),
1325
6
                  [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
1326
6
                    return (cast<PHINode>(U)->getParent() == OuterHeader &&
1327
6
                            IncI->getParent() == InnerHeader) ||
1328
6
                           cast<PHINode>(U)->getParent() == OuterExit;
1329
6
                  }) &&
1330
6
           "Can only replace phis iff the uses are in the loop nest exit or "
1331
6
           "the incoming value is defined in the inner header (it will "
1332
6
           "dominate all loop blocks after interchanging)");
1333
6
    P.replaceAllUsesWith(IncI);
1334
6
    P.eraseFromParent();
1335
6
  }
1336
15
1337
15
  SmallVector<PHINode *, 8> LcssaInnerExit;
1338
15
  for (PHINode &P : InnerExit->phis())
1339
1
    LcssaInnerExit.push_back(&P);
1340
15
1341
15
  SmallVector<PHINode *, 8> LcssaInnerLatch;
1342
15
  for (PHINode &P : InnerLatch->phis())
1343
0
    LcssaInnerLatch.push_back(&P);
1344
15
1345
15
  // Lcssa PHIs for values used outside the inner loop are in InnerExit.
1346
15
  // If a PHI node has users outside of InnerExit, it has a use outside the
1347
15
  // interchanged loop and we have to preserve it. We move these to
1348
15
  // InnerLatch, which will become the new exit block for the innermost
1349
15
  // loop after interchanging.
1350
15
  for (PHINode *P : LcssaInnerExit)
1351
1
    P->moveBefore(InnerLatch->getFirstNonPHI());
1352
15
1353
15
  // If the inner loop latch contains LCSSA PHIs, those come from a child loop
1354
15
  // and we have to move them to the new inner latch.
1355
15
  for (PHINode *P : LcssaInnerLatch)
1356
0
    P->moveBefore(InnerExit->getFirstNonPHI());
1357
15
1358
15
  // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
1359
15
  // incoming values from the outer latch or header, we have to add a new PHI
1360
15
  // in the inner loop latch, which became the exit block of the outer loop,
1361
15
  // after interchanging.
1362
15
  if (OuterExit) {
1363
14
    for (PHINode &P : OuterExit->phis()) {
1364
11
      if (P.getNumIncomingValues() != 1)
1365
0
        continue;
1366
11
      // Skip Phis with incoming values not defined in the outer loop's header
1367
11
      // and latch. Also skip incoming phis defined in the latch. Those should
1368
11
      // already have been updated.
1369
11
      auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
1370
11
      if (!I || ((I->getParent() != OuterLatch || 
isa<PHINode>(I)2
) &&
1371
11
                 
I->getParent() != OuterHeader9
))
1372
8
        continue;
1373
3
1374
3
      PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
1375
3
      NewPhi->setIncomingValue(0, P.getIncomingValue(0));
1376
3
      NewPhi->setIncomingBlock(0, OuterLatch);
1377
3
      NewPhi->insertBefore(InnerLatch->getFirstNonPHI());
1378
3
      P.setIncomingValue(0, NewPhi);
1379
3
    }
1380
14
  }
1381
15
1382
15
  // Now adjust the incoming blocks for the LCSSA PHIs.
1383
15
  // For PHIs moved from Inner's exit block, we need to replace Inner's latch
1384
15
  // with the new latch.
1385
15
  InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
1386
15
}
1387
1388
15
bool LoopInterchangeTransform::adjustLoopBranches() {
1389
15
  LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
1390
15
  std::vector<DominatorTree::UpdateType> DTUpdates;
1391
15
1392
15
  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1393
15
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1394
15
1395
15
  assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
1396
15
         InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
1397
15
         InnerLoopPreHeader && "Guaranteed by loop-simplify form");
1398
15
  // Ensure that both preheaders do not contain PHI nodes and have single
1399
15
  // predecessors. This allows us to move them easily. We use
1400
15
  // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
1401
15
  // preheaders do not satisfy those conditions.
1402
15
  if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
1403
15
      
!OuterLoopPreHeader->getUniquePredecessor()14
)
1404
14
    OuterLoopPreHeader =
1405
14
        InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
1406
15
  if (InnerLoopPreHeader == OuterLoop->getHeader())
1407
13
    InnerLoopPreHeader =
1408
13
        InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
1409
15
1410
15
  // Adjust the loop preheader
1411
15
  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1412
15
  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1413
15
  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1414
15
  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1415
15
  BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1416
15
  BasicBlock *InnerLoopLatchPredecessor =
1417
15
      InnerLoopLatch->getUniquePredecessor();
1418
15
  BasicBlock *InnerLoopLatchSuccessor;
1419
15
  BasicBlock *OuterLoopLatchSuccessor;
1420
15
1421
15
  BranchInst *OuterLoopLatchBI =
1422
15
      dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1423
15
  BranchInst *InnerLoopLatchBI =
1424
15
      dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1425
15
  BranchInst *OuterLoopHeaderBI =
1426
15
      dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1427
15
  BranchInst *InnerLoopHeaderBI =
1428
15
      dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
1429
15
1430
15
  if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1431
15
      !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1432
15
      !InnerLoopHeaderBI)
1433
0
    return false;
1434
15
1435
15
  BranchInst *InnerLoopLatchPredecessorBI =
1436
15
      dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
1437
15
  BranchInst *OuterLoopPredecessorBI =
1438
15
      dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
1439
15
1440
15
  if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1441
0
    return false;
1442
15
  BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1443
15
  if (!InnerLoopHeaderSuccessor)
1444
0
    return false;
1445
15
1446
15
  // Adjust Loop Preheader and headers
1447
15
  updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
1448
15
                  InnerLoopPreHeader, DTUpdates);
1449
15
  updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates);
1450
15
  updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
1451
15
                  InnerLoopHeaderSuccessor, DTUpdates);
1452
15
1453
15
  // Adjust reduction PHI's now that the incoming block has changed.
1454
15
  InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
1455
15
                                               OuterLoopHeader);
1456
15
1457
15
  updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
1458
15
                  OuterLoopPreHeader, DTUpdates);
1459
15
1460
15
  // -------------Adjust loop latches-----------
1461
15
  if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
1462
3
    InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
1463
12
  else
1464
12
    InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
1465
15
1466
15
  updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
1467
15
                  InnerLoopLatchSuccessor, DTUpdates);
1468
15
1469
15
1470
15
  if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
1471
8
    OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
1472
7
  else
1473
7
    OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
1474
15
1475
15
  updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
1476
15
                  OuterLoopLatchSuccessor, DTUpdates);
1477
15
  updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
1478
15
                  DTUpdates);
1479
15
1480
15
  DT->applyUpdates(DTUpdates);
1481
15
  restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
1482
15
                   OuterLoopPreHeader);
1483
15
1484
15
  moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
1485
15
                OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock());
1486
15
  // For PHIs in the exit block of the outer loop, outer's latch has been
1487
15
  // replaced by Inners'.
1488
15
  OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1489
15
1490
15
  // Now update the reduction PHIs in the inner and outer loop headers.
1491
15
  SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
1492
15
  for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1))
1493
1
    InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
1494
15
  for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1))
1495
1
    OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
1496
15
1497
15
  auto &OuterInnerReductions = LIL.getOuterInnerReductions();
1498
15
  (void)OuterInnerReductions;
1499
15
1500
15
  // Now move the remaining reduction PHIs from outer to inner loop header and
1501
15
  // vice versa. The PHI nodes must be part of a reduction across the inner and
1502
15
  // outer loop and all the remains to do is and updating the incoming blocks.
1503
15
  for (PHINode *PHI : OuterLoopPHIs) {
1504
1
    PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
1505
1
    assert(OuterInnerReductions.find(PHI) != OuterInnerReductions.end() &&
1506
1
           "Expected a reduction PHI node");
1507
1
  }
1508
15
  for (PHINode *PHI : InnerLoopPHIs) {
1509
1
    PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
1510
1
    assert(OuterInnerReductions.find(PHI) != OuterInnerReductions.end() &&
1511
1
           "Expected a reduction PHI node");
1512
1
  }
1513
15
1514
15
  // Update the incoming blocks for moved PHI nodes.
1515
15
  OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
1516
15
  OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
1517
15
  InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
1518
15
  InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1519
15
1520
15
  return true;
1521
15
}
1522
1523
15
void LoopInterchangeTransform::adjustLoopPreheaders() {
1524
15
  // We have interchanged the preheaders so we need to interchange the data in
1525
15
  // the preheader as well.
1526
15
  // This is because the content of inner preheader was previously executed
1527
15
  // inside the outer loop.
1528
15
  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1529
15
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1530
15
  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1531
15
  BranchInst *InnerTermBI =
1532
15
      cast<BranchInst>(InnerLoopPreHeader->getTerminator());
1533
15
1534
15
  // These instructions should now be executed inside the loop.
1535
15
  // Move instruction into a new block after outer header.
1536
15
  moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator());
1537
15
  // These instructions were not executed previously in the loop so move them to
1538
15
  // the older inner loop preheader.
1539
15
  moveBBContents(OuterLoopPreHeader, InnerTermBI);
1540
15
}
1541
1542
15
bool LoopInterchangeTransform::adjustLoopLinks() {
1543
15
  // Adjust all branches in the inner and outer loop.
1544
15
  bool Changed = adjustLoopBranches();
1545
15
  if (Changed)
1546
15
    adjustLoopPreheaders();
1547
15
  return Changed;
1548
15
}
1549
1550
char LoopInterchange::ID = 0;
1551
1552
36.0k
INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
1553
36.0k
                      "Interchanges loops for cache reuse", false, false)
1554
36.0k
INITIALIZE_PASS_DEPENDENCY(LoopPass)
1555
36.0k
INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
1556
36.0k
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
1557
36.0k
1558
36.0k
INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
1559
                    "Interchanges loops for cache reuse", false, false)
1560
1561
0
Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); }