Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file implements loop unroll and jam as a routine, much like
10
// LoopUnroll.cpp implements loop unroll.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "llvm/ADT/SmallPtrSet.h"
15
#include "llvm/ADT/Statistic.h"
16
#include "llvm/Analysis/AssumptionCache.h"
17
#include "llvm/Analysis/DependenceAnalysis.h"
18
#include "llvm/Analysis/InstructionSimplify.h"
19
#include "llvm/Analysis/LoopAnalysisManager.h"
20
#include "llvm/Analysis/LoopIterator.h"
21
#include "llvm/Analysis/LoopPass.h"
22
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
23
#include "llvm/Analysis/ScalarEvolution.h"
24
#include "llvm/Analysis/ScalarEvolutionExpander.h"
25
#include "llvm/Analysis/Utils/Local.h"
26
#include "llvm/IR/BasicBlock.h"
27
#include "llvm/IR/DataLayout.h"
28
#include "llvm/IR/DebugInfoMetadata.h"
29
#include "llvm/IR/Dominators.h"
30
#include "llvm/IR/IntrinsicInst.h"
31
#include "llvm/IR/LLVMContext.h"
32
#include "llvm/Support/Debug.h"
33
#include "llvm/Support/raw_ostream.h"
34
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
35
#include "llvm/Transforms/Utils/Cloning.h"
36
#include "llvm/Transforms/Utils/LoopSimplify.h"
37
#include "llvm/Transforms/Utils/LoopUtils.h"
38
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
39
#include "llvm/Transforms/Utils/UnrollLoop.h"
40
using namespace llvm;
41
42
3
#define DEBUG_TYPE "loop-unroll-and-jam"
43
44
STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
45
STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
46
47
typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
48
49
// Partition blocks in an outer/inner loop pair into blocks before and after
50
// the loop
51
static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
52
                                     BasicBlockSet &ForeBlocks,
53
                                     BasicBlockSet &SubLoopBlocks,
54
                                     BasicBlockSet &AftBlocks,
55
83
                                     DominatorTree *DT) {
56
83
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
57
83
  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
58
83
59
274
  for (BasicBlock *BB : L->blocks()) {
60
274
    if (!SubLoop->contains(BB)) {
61
170
      if (DT->dominates(SubLoopLatch, BB))
62
85
        AftBlocks.insert(BB);
63
85
      else
64
85
        ForeBlocks.insert(BB);
65
170
    }
66
274
  }
67
83
68
83
  // Check that all blocks in ForeBlocks together dominate the subloop
69
83
  // TODO: This might ideally be done better with a dominator/postdominators.
70
83
  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
71
84
  for (BasicBlock *BB : ForeBlocks) {
72
84
    if (BB == SubLoopPreHeader)
73
82
      continue;
74
2
    Instruction *TI = BB->getTerminator();
75
4
    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; 
++i2
)
76
3
      if (!ForeBlocks.count(TI->getSuccessor(i)))
77
1
        return false;
78
2
  }
79
83
80
83
  
return true82
;
81
83
}
82
83
// Looks at the phi nodes in Header for values coming from Latch. For these
84
// instructions and all their operands calls Visit on them, keeping going for
85
// all the operands in AftBlocks. Returns false if Visit returns false,
86
// otherwise returns true. This is used to process the instructions in the
87
// Aft blocks that need to be moved before the subloop. It is used in two
88
// places. One to check that the required set of instructions can be moved
89
// before the loop. Then to collect the instructions to actually move in
90
// moveHeaderPhiOperandsToForeBlocks.
91
template <typename T>
92
static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
93
78
                                     BasicBlockSet &AftBlocks, T Visit) {
94
78
  SmallVector<Instruction *, 8> Worklist;
95
124
  for (auto &Phi : Header->phis()) {
96
124
    Value *V = Phi.getIncomingValueForBlock(Latch);
97
124
    if (Instruction *I = dyn_cast<Instruction>(V))
98
121
      Worklist.push_back(I);
99
124
  }
100
78
101
309
  while (!Worklist.empty()) {
102
233
    Instruction *I = Worklist.back();
103
233
    Worklist.pop_back();
104
233
    if (!Visit(I))
105
2
      return false;
106
231
107
231
    if (AftBlocks.count(I->getParent()))
108
109
      for (auto &U : I->operands())
109
218
        if (Instruction *II = dyn_cast<Instruction>(U))
110
114
          Worklist.push_back(II);
111
231
  }
112
78
113
78
  
return true76
;
114
78
}
LoopUnrollAndJam.cpp:bool processHeaderPhiOperands<moveHeaderPhiOperandsToForeBlocks(llvm::BasicBlock*, llvm::BasicBlock*, llvm::Instruction*, llvm::SmallPtrSet<llvm::BasicBlock*, 4u>&)::$_7>(llvm::BasicBlock*, llvm::BasicBlock*, llvm::SmallPtrSet<llvm::BasicBlock*, 4u>&, moveHeaderPhiOperandsToForeBlocks(llvm::BasicBlock*, llvm::BasicBlock*, llvm::Instruction*, llvm::SmallPtrSet<llvm::BasicBlock*, 4u>&)::$_7)
Line
Count
Source
93
30
                                     BasicBlockSet &AftBlocks, T Visit) {
94
30
  SmallVector<Instruction *, 8> Worklist;
95
65
  for (auto &Phi : Header->phis()) {
96
65
    Value *V = Phi.getIncomingValueForBlock(Latch);
97
65
    if (Instruction *I = dyn_cast<Instruction>(V))
98
64
      Worklist.push_back(I);
99
65
  }
100
30
101
155
  while (!Worklist.empty()) {
102
125
    Instruction *I = Worklist.back();
103
125
    Worklist.pop_back();
104
125
    if (!Visit(I))
105
0
      return false;
106
125
107
125
    if (AftBlocks.count(I->getParent()))
108
59
      for (auto &U : I->operands())
109
118
        if (Instruction *II = dyn_cast<Instruction>(U))
110
61
          Worklist.push_back(II);
111
125
  }
112
30
113
30
  return true;
114
30
}
LoopUnrollAndJam.cpp:bool processHeaderPhiOperands<llvm::isSafeToUnrollAndJam(llvm::Loop*, llvm::ScalarEvolution&, llvm::DominatorTree&, llvm::DependenceInfo&)::$_6>(llvm::BasicBlock*, llvm::BasicBlock*, llvm::SmallPtrSet<llvm::BasicBlock*, 4u>&, llvm::isSafeToUnrollAndJam(llvm::Loop*, llvm::ScalarEvolution&, llvm::DominatorTree&, llvm::DependenceInfo&)::$_6)
Line
Count
Source
93
48
                                     BasicBlockSet &AftBlocks, T Visit) {
94
48
  SmallVector<Instruction *, 8> Worklist;
95
59
  for (auto &Phi : Header->phis()) {
96
59
    Value *V = Phi.getIncomingValueForBlock(Latch);
97
59
    if (Instruction *I = dyn_cast<Instruction>(V))
98
57
      Worklist.push_back(I);
99
59
  }
100
48
101
154
  while (!Worklist.empty()) {
102
108
    Instruction *I = Worklist.back();
103
108
    Worklist.pop_back();
104
108
    if (!Visit(I))
105
2
      return false;
106
106
107
106
    if (AftBlocks.count(I->getParent()))
108
50
      for (auto &U : I->operands())
109
100
        if (Instruction *II = dyn_cast<Instruction>(U))
110
53
          Worklist.push_back(II);
111
106
  }
112
48
113
48
  
return true46
;
114
48
}
115
116
// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
117
static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
118
                                              BasicBlock *Latch,
119
                                              Instruction *InsertLoc,
120
30
                                              BasicBlockSet &AftBlocks) {
121
30
  // We need to ensure we move the instructions in the correct order,
122
30
  // starting with the earliest required instruction and moving forward.
123
30
  std::vector<Instruction *> Visited;
124
30
  processHeaderPhiOperands(Header, Latch, AftBlocks,
125
125
                           [&Visited, &AftBlocks](Instruction *I) {
126
125
                             if (AftBlocks.count(I->getParent()))
127
59
                               Visited.push_back(I);
128
125
                             return true;
129
125
                           });
130
30
131
30
  // Move all instructions in program order to before the InsertLoc
132
30
  BasicBlock *InsertLocBB = InsertLoc->getParent();
133
59
  for (Instruction *I : reverse(Visited)) {
134
59
    if (I->getParent() != InsertLocBB)
135
59
      I->moveBefore(InsertLoc);
136
59
  }
137
30
}
138
139
/*
140
  This method performs Unroll and Jam. For a simple loop like:
141
  for (i = ..)
142
    Fore(i)
143
    for (j = ..)
144
      SubLoop(i, j)
145
    Aft(i)
146
147
  Instead of doing normal inner or outer unrolling, we do:
148
  for (i = .., i+=2)
149
    Fore(i)
150
    Fore(i+1)
151
    for (j = ..)
152
      SubLoop(i, j)
153
      SubLoop(i+1, j)
154
    Aft(i)
155
    Aft(i+1)
156
157
  So the outer loop is essetially unrolled and then the inner loops are fused
158
  ("jammed") together into a single loop. This can increase speed when there
159
  are loads in SubLoop that are invariant to i, as they become shared between
160
  the now jammed inner loops.
161
162
  We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
163
  Fore blocks are those before the inner loop, Aft are those after. Normal
164
  Unroll code is used to copy each of these sets of blocks and the results are
165
  combined together into the final form above.
166
167
  isSafeToUnrollAndJam should be used prior to calling this to make sure the
168
  unrolling will be valid. Checking profitablility is also advisable.
169
170
  If EpilogueLoop is non-null, it receives the epilogue loop (if it was
171
  necessary to create one and not fully unrolled).
172
*/
173
LoopUnrollResult llvm::UnrollAndJamLoop(
174
    Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
175
    bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
176
30
    AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
177
30
178
30
  // When we enter here we should have already checked that it is safe
179
30
  BasicBlock *Header = L->getHeader();
180
30
  assert(L->getSubLoops().size() == 1);
181
30
  Loop *SubLoop = *L->begin();
182
30
183
30
  // Don't enter the unroll code if there is nothing to do.
184
30
  if (TripCount == 0 && 
Count < 225
) {
185
0
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
186
0
    return LoopUnrollResult::Unmodified;
187
0
  }
188
30
189
30
  assert(Count > 0);
190
30
  assert(TripMultiple > 0);
191
30
  assert(TripCount == 0 || TripCount % TripMultiple == 0);
192
30
193
30
  // Are we eliminating the loop control altogether?
194
30
  bool CompletelyUnroll = (Count == TripCount);
195
30
196
30
  // We use the runtime remainder in cases where we don't know trip multiple
197
30
  if (TripMultiple == 1 || 
TripMultiple % Count != 04
) {
198
28
    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
199
28
                                    /*UseEpilogRemainder*/ true,
200
28
                                    UnrollRemainder, /*ForgetAllSCEV*/ false,
201
28
                                    LI, SE, DT, AC, true, EpilogueLoop)) {
202
0
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
203
0
                           "generated when assuming runtime trip count\n");
204
0
      return LoopUnrollResult::Unmodified;
205
0
    }
206
30
  }
207
30
208
30
  // Notify ScalarEvolution that the loop will be substantially changed,
209
30
  // if not outright eliminated.
210
30
  if (SE) {
211
30
    SE->forgetLoop(L);
212
30
    SE->forgetLoop(SubLoop);
213
30
  }
214
30
215
30
  using namespace ore;
216
30
  // Report the unrolling decision.
217
30
  if (CompletelyUnroll) {
218
3
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
219
3
                      << Header->getName() << " with trip count " << TripCount
220
3
                      << "!\n");
221
3
    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
222
3
                                 L->getHeader())
223
3
              << "completely unroll and jammed loop with "
224
3
              << NV("UnrollCount", TripCount) << " iterations");
225
27
  } else {
226
27
    auto DiagBuilder = [&]() {
227
0
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
228
0
                              L->getHeader());
229
0
      return Diag << "unroll and jammed loop by a factor of "
230
0
                  << NV("UnrollCount", Count);
231
0
    };
232
27
233
27
    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
234
27
                      << " by " << Count);
235
27
    if (TripMultiple != 1) {
236
2
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
237
2
      ORE->emit([&]() {
238
0
        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
239
0
                             << " trips per branch";
240
0
      });
241
25
    } else {
242
25
      LLVM_DEBUG(dbgs() << " with run-time trip count");
243
25
      ORE->emit([&]() 
{ return DiagBuilder() << " with run-time trip count"; }0
);
244
25
    }
245
27
    LLVM_DEBUG(dbgs() << "!\n");
246
27
  }
247
30
248
30
  BasicBlock *Preheader = L->getLoopPreheader();
249
30
  BasicBlock *LatchBlock = L->getLoopLatch();
250
30
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
251
30
  assert(Preheader && LatchBlock && Header);
252
30
  assert(BI && !BI->isUnconditional());
253
30
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
254
30
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
255
30
  bool SubLoopContinueOnTrue = SubLoop->contains(
256
30
      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
257
30
258
30
  // Partition blocks in an outer/inner loop pair into blocks before and after
259
30
  // the loop
260
30
  BasicBlockSet SubLoopBlocks;
261
30
  BasicBlockSet ForeBlocks;
262
30
  BasicBlockSet AftBlocks;
263
30
  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
264
30
                           DT);
265
30
266
30
  // We keep track of the entering/first and exiting/last block of each of
267
30
  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
268
30
  // blocks easier.
269
30
  std::vector<BasicBlock *> ForeBlocksFirst;
270
30
  std::vector<BasicBlock *> ForeBlocksLast;
271
30
  std::vector<BasicBlock *> SubLoopBlocksFirst;
272
30
  std::vector<BasicBlock *> SubLoopBlocksLast;
273
30
  std::vector<BasicBlock *> AftBlocksFirst;
274
30
  std::vector<BasicBlock *> AftBlocksLast;
275
30
  ForeBlocksFirst.push_back(Header);
276
30
  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
277
30
  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
278
30
  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
279
30
  AftBlocksFirst.push_back(SubLoop->getExitBlock());
280
30
  AftBlocksLast.push_back(L->getExitingBlock());
281
30
  // Maps Blocks[0] -> Blocks[It]
282
30
  ValueToValueMapTy LastValueMap;
283
30
284
30
  // Move any instructions from fore phi operands from AftBlocks into Fore.
285
30
  moveHeaderPhiOperandsToForeBlocks(
286
30
      Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
287
30
      AftBlocks);
288
30
289
30
  // The current on-the-fly SSA update requires blocks to be processed in
290
30
  // reverse postorder so that LastValueMap contains the correct value at each
291
30
  // exit.
292
30
  LoopBlocksDFS DFS(L);
293
30
  DFS.perform(LI);
294
30
  // Stash the DFS iterators before adding blocks to the loop.
295
30
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
296
30
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
297
30
298
30
  if (Header->getParent()->isDebugInfoForProfiling())
299
0
    for (BasicBlock *BB : L->getBlocks())
300
0
      for (Instruction &I : *BB)
301
0
        if (!isa<DbgInfoIntrinsic>(&I))
302
0
          if (const DILocation *DIL = I.getDebugLoc()) {
303
0
            auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
304
0
            if (NewDIL)
305
0
              I.setDebugLoc(NewDIL.getValue());
306
0
            else
307
0
              LLVM_DEBUG(dbgs()
308
0
                         << "Failed to create new discriminator: "
309
0
                         << DIL->getFilename() << " Line: " << DIL->getLine());
310
0
          }
311
30
312
30
  // Copy all blocks
313
136
  for (unsigned It = 1; It != Count; 
++It106
) {
314
106
    std::vector<BasicBlock *> NewBlocks;
315
106
    // Maps Blocks[It] -> Blocks[It-1]
316
106
    DenseMap<Value *, Value *> PrevItValueMap;
317
106
318
443
    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; 
++BB337
) {
319
337
      ValueToValueMapTy VMap;
320
337
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
321
337
      Header->getParent()->getBasicBlockList().push_back(New);
322
337
323
337
      if (ForeBlocks.count(*BB)) {
324
106
        L->addBasicBlockToLoop(New, *LI);
325
106
326
106
        if (*BB == ForeBlocksFirst[0])
327
106
          ForeBlocksFirst.push_back(New);
328
106
        if (*BB == ForeBlocksLast[0])
329
106
          ForeBlocksLast.push_back(New);
330
231
      } else if (SubLoopBlocks.count(*BB)) {
331
125
        SubLoop->addBasicBlockToLoop(New, *LI);
332
125
333
125
        if (*BB == SubLoopBlocksFirst[0])
334
106
          SubLoopBlocksFirst.push_back(New);
335
125
        if (*BB == SubLoopBlocksLast[0])
336
106
          SubLoopBlocksLast.push_back(New);
337
125
      } else 
if (106
AftBlocks.count(*BB)106
) {
338
106
        L->addBasicBlockToLoop(New, *LI);
339
106
340
106
        if (*BB == AftBlocksFirst[0])
341
106
          AftBlocksFirst.push_back(New);
342
106
        if (*BB == AftBlocksLast[0])
343
106
          AftBlocksLast.push_back(New);
344
106
      } else {
345
0
        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
346
0
      }
347
337
348
337
      // Update our running maps of newest clones
349
337
      PrevItValueMap[New] = (It == 1 ? 
*BB96
:
LastValueMap[*BB]241
);
350
337
      LastValueMap[*BB] = New;
351
337
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
352
2.53k
           VI != VE; 
++VI2.19k
) {
353
2.19k
        PrevItValueMap[VI->second] =
354
2.19k
            const_cast<Value *>(It == 1 ? 
VI->first626
:
LastValueMap[VI->first]1.56k
);
355
2.19k
        LastValueMap[VI->first] = VI->second;
356
2.19k
      }
357
337
358
337
      NewBlocks.push_back(New);
359
337
360
337
      // Update DomTree:
361
337
      if (*BB == ForeBlocksFirst[0])
362
106
        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
363
231
      else if (*BB == SubLoopBlocksFirst[0])
364
106
        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
365
125
      else if (*BB == AftBlocksFirst[0])
366
106
        DT->addNewBlock(New, AftBlocksLast[It - 1]);
367
19
      else {
368
19
        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
369
19
        // structure.
370
19
        auto BBDomNode = DT->getNode(*BB);
371
19
        auto BBIDom = BBDomNode->getIDom();
372
19
        BasicBlock *OriginalBBIDom = BBIDom->getBlock();
373
19
        assert(OriginalBBIDom);
374
19
        assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
375
19
        DT->addNewBlock(
376
19
            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
377
19
      }
378
337
    }
379
106
380
106
    // Remap all instructions in the most recent iteration
381
337
    
for (BasicBlock *NewBlock : NewBlocks)106
{
382
2.19k
      for (Instruction &I : *NewBlock) {
383
2.19k
        ::remapInstruction(&I, LastValueMap);
384
2.19k
        if (auto *II = dyn_cast<IntrinsicInst>(&I))
385
0
          if (II->getIntrinsicID() == Intrinsic::assume)
386
0
            AC->registerAssumption(II);
387
2.19k
      }
388
337
    }
389
106
390
106
    // Alter the ForeBlocks phi's, pointing them at the latest version of the
391
106
    // value from the previous iteration's phis
392
223
    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
393
223
      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
394
223
      assert(OldValue && "should have incoming edge from Aft[It]");
395
223
      Value *NewValue = OldValue;
396
223
      if (Value *PrevValue = PrevItValueMap[OldValue])
397
220
        NewValue = PrevValue;
398
223
399
223
      assert(Phi.getNumOperands() == 2);
400
223
      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
401
223
      Phi.setIncomingValue(0, NewValue);
402
223
      Phi.removeIncomingValue(1);
403
223
    }
404
106
  }
405
30
406
30
  // Now that all the basic blocks for the unrolled iterations are in place,
407
30
  // finish up connecting the blocks and phi nodes. At this point LastValueMap
408
30
  // is the last unrolled iterations values.
409
30
410
30
  // Update Phis in BB from OldBB to point to NewBB
411
30
  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
412
408
                            BasicBlock *NewBB) {
413
666
    for (PHINode &Phi : BB->phis()) {
414
666
      int I = Phi.getBasicBlockIndex(OldBB);
415
666
      Phi.setIncomingBlock(I, NewBB);
416
666
    }
417
408
  };
418
30
  // Update Phis in BB from OldBB to point to NewBB and use the latest value
419
30
  // from LastValueMap
420
30
  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
421
30
                                     BasicBlock *NewBB,
422
57
                                     ValueToValueMapTy &LastValueMap) {
423
102
    for (PHINode &Phi : BB->phis()) {
424
132
      for (unsigned b = 0; b < Phi.getNumIncomingValues(); 
++b30
) {
425
132
        if (Phi.getIncomingBlock(b) == OldBB) {
426
102
          Value *OldValue = Phi.getIncomingValue(b);
427
102
          if (Value *LastValue = LastValueMap[OldValue])
428
97
            Phi.setIncomingValue(b, LastValue);
429
102
          Phi.setIncomingBlock(b, NewBB);
430
102
          break;
431
102
        }
432
132
      }
433
102
    }
434
57
  };
435
30
  // Move all the phis from Src into Dest
436
212
  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
437
212
    Instruction *insertPoint = Dest->getFirstNonPHI();
438
520
    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
439
308
      Phi->moveBefore(insertPoint);
440
212
  };
441
30
442
30
  // Update the PHI values outside the loop to point to the last block
443
30
  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
444
30
                           LastValueMap);
445
30
446
30
  // Update ForeBlocks successors and phi nodes
447
30
  BranchInst *ForeTerm =
448
30
      cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
449
30
  BasicBlock *Dest = SubLoopBlocksFirst[0];
450
30
  ForeTerm->setSuccessor(0, Dest);
451
30
452
30
  if (CompletelyUnroll) {
453
7
    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
454
4
      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
455
4
      Phi->getParent()->getInstList().erase(Phi);
456
4
    }
457
27
  } else {
458
27
    // Update the PHI values to point to the last aft block
459
27
    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
460
27
                             AftBlocksLast.back(), LastValueMap);
461
27
  }
462
30
463
136
  for (unsigned It = 1; It != Count; 
It++106
) {
464
106
    // Remap ForeBlock successors from previous iteration to this
465
106
    BranchInst *ForeTerm =
466
106
        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
467
106
    BasicBlock *Dest = ForeBlocksFirst[It];
468
106
    ForeTerm->setSuccessor(0, Dest);
469
106
  }
470
30
471
30
  // Subloop successors and phis
472
30
  BranchInst *SubTerm =
473
30
      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
474
30
  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
475
30
  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
476
30
  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
477
30
                  ForeBlocksLast.back());
478
30
  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
479
30
                  SubLoopBlocksLast.back());
480
30
481
136
  for (unsigned It = 1; It != Count; 
It++106
) {
482
106
    // Replace the conditional branch of the previous iteration subloop with an
483
106
    // unconditional one to this one
484
106
    BranchInst *SubTerm =
485
106
        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
486
106
    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
487
106
    SubTerm->eraseFromParent();
488
106
489
106
    updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
490
106
                    ForeBlocksLast.back());
491
106
    updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
492
106
                    SubLoopBlocksLast.back());
493
106
    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
494
106
  }
495
30
496
30
  // Aft blocks successors and phis
497
30
  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
498
30
  if (CompletelyUnroll) {
499
3
    BranchInst::Create(LoopExit, Term);
500
3
    Term->eraseFromParent();
501
27
  } else {
502
27
    Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
503
27
  }
504
30
  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
505
30
                  SubLoopBlocksLast.back());
506
30
507
136
  for (unsigned It = 1; It != Count; 
It++106
) {
508
106
    // Replace the conditional branch of the previous iteration subloop with an
509
106
    // unconditional one to this one
510
106
    BranchInst *AftTerm =
511
106
        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
512
106
    BranchInst::Create(AftBlocksFirst[It], AftTerm);
513
106
    AftTerm->eraseFromParent();
514
106
515
106
    updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
516
106
                    SubLoopBlocksLast.back());
517
106
    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
518
106
  }
519
30
520
30
  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
521
30
  // new ones required.
522
30
  if (Count != 1) {
523
29
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
524
29
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
525
29
                           SubLoopBlocksFirst[0]);
526
29
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
527
29
                           SubLoopBlocksLast[0], AftBlocksFirst[0]);
528
29
529
29
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
530
29
                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
531
29
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
532
29
                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
533
29
    DT->applyUpdates(DTUpdates);
534
29
  }
535
30
536
30
  // Merge adjacent basic blocks, if possible.
537
30
  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
538
30
  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
539
30
  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
540
30
  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
541
30
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
542
474
  while (!MergeBlocks.empty()) {
543
444
    BasicBlock *BB = *MergeBlocks.begin();
544
444
    BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
545
444
    if (Term && Term->isUnconditional() && 
L->contains(Term->getSuccessor(0))363
) {
546
360
      BasicBlock *Dest = Term->getSuccessor(0);
547
360
      BasicBlock *Fold = Dest->getUniquePredecessor();
548
360
      if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
549
319
        // Don't remove BB and add Fold as they are the same BB
550
319
        assert(Fold == BB);
551
319
        (void)Fold;
552
319
        MergeBlocks.erase(Dest);
553
319
      } else
554
41
        MergeBlocks.erase(BB);
555
360
    } else
556
84
      MergeBlocks.erase(BB);
557
444
  }
558
30
559
30
  // At this point, the code is well formed.  We now do a quick sweep over the
560
30
  // inserted code, doing constant propagation and dead code elimination as we
561
30
  // go.
562
30
  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
563
30
  simplifyLoopAfterUnroll(L, !CompletelyUnroll && 
Count > 127
, LI, SE, DT, AC);
564
30
565
30
  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
566
30
  ++NumUnrolledAndJammed;
567
30
568
#ifndef NDEBUG
569
  // We shouldn't have done anything to break loop simplify form or LCSSA.
570
  Loop *OuterL = L->getParentLoop();
571
  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
572
  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
573
  if (!CompletelyUnroll)
574
    assert(L->isLoopSimplifyForm());
575
  assert(SubLoop->isLoopSimplifyForm());
576
  assert(DT->verify());
577
#endif
578
579
30
  // Update LoopInfo if the loop is completely removed.
580
30
  if (CompletelyUnroll)
581
3
    LI->erase(L);
582
30
583
30
  return CompletelyUnroll ? 
LoopUnrollResult::FullyUnrolled3
584
30
                          : 
LoopUnrollResult::PartiallyUnrolled27
;
585
30
}
586
587
static bool getLoadsAndStores(BasicBlockSet &Blocks,
588
138
                              SmallVector<Value *, 4> &MemInstr) {
589
138
  // Scan the BBs and collect legal loads and stores.
590
138
  // Returns false if non-simple loads/stores are found.
591
150
  for (BasicBlock *BB : Blocks) {
592
889
    for (Instruction &I : *BB) {
593
889
      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
594
53
        if (!Ld->isSimple())
595
0
          return false;
596
53
        MemInstr.push_back(&I);
597
836
      } else if (auto *St = dyn_cast<StoreInst>(&I)) {
598
66
        if (!St->isSimple())
599
0
          return false;
600
66
        MemInstr.push_back(&I);
601
770
      } else if (I.mayReadOrWriteMemory()) {
602
0
        return false;
603
0
      }
604
889
    }
605
150
  }
606
138
  return true;
607
138
}
608
609
static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
610
                              SmallVector<Value *, 4> &Later,
611
                              unsigned LoopDepth, bool InnerLoop,
612
168
                              DependenceInfo &DI) {
613
168
  // Use DA to check for dependencies between loads and stores that make unroll
614
168
  // and jam invalid
615
168
  for (Value *I : Earlier) {
616
162
    for (Value *J : Later) {
617
162
      Instruction *Src = cast<Instruction>(I);
618
162
      Instruction *Dst = cast<Instruction>(J);
619
162
      if (Src == Dst)
620
48
        continue;
621
114
      // Ignore Input dependencies.
622
114
      if (isa<LoadInst>(Src) && 
isa<LoadInst>(Dst)63
)
623
14
        continue;
624
100
625
100
      // Track dependencies, and if we find them take a conservative approach
626
100
      // by allowing only = or < (not >), altough some > would be safe
627
100
      // (depending upon unroll width).
628
100
      // For the inner loop, we need to disallow any (> <) dependencies
629
100
      // FIXME: Allow > so long as distance is less than unroll width
630
100
      if (auto D = DI.depends(Src, Dst, true)) {
631
28
        assert(D->isOrdered() && "Expected an output, flow or anti dep.");
632
28
633
28
        if (D->isConfused()) {
634
2
          LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
635
2
                            << "  " << *Src << "\n"
636
2
                            << "  " << *Dst << "\n");
637
2
          return false;
638
2
        }
639
26
        if (!InnerLoop) {
640
20
          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
641
7
            LLVM_DEBUG(dbgs() << "  > dependency between:\n"
642
7
                              << "  " << *Src << "\n"
643
7
                              << "  " << *Dst << "\n");
644
7
            return false;
645
7
          }
646
6
        } else {
647
6
          assert(LoopDepth + 1 <= D->getLevels());
648
6
          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
649
6
              
D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT3
) {
650
3
            LLVM_DEBUG(dbgs() << "  < > dependency between:\n"
651
3
                              << "  " << *Src << "\n"
652
3
                              << "  " << *Dst << "\n");
653
3
            return false;
654
3
          }
655
6
        }
656
26
      }
657
100
    }
658
145
  }
659
168
  
return true156
;
660
168
}
661
662
static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
663
                              BasicBlockSet &SubLoopBlocks,
664
46
                              BasicBlockSet &AftBlocks, DependenceInfo &DI) {
665
46
  // Get all loads/store pairs for each blocks
666
46
  SmallVector<Value *, 4> ForeMemInstr;
667
46
  SmallVector<Value *, 4> SubLoopMemInstr;
668
46
  SmallVector<Value *, 4> AftMemInstr;
669
46
  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
670
46
      !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
671
46
      !getLoadsAndStores(AftBlocks, AftMemInstr))
672
0
    return false;
673
46
674
46
  // Check for dependencies between any blocks that may change order
675
46
  unsigned LoopDepth = L->getLoopDepth();
676
46
  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
677
46
                           DI) &&
678
46
         
checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI)45
&&
679
46
         checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
680
40
                           DI) &&
681
46
         checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
682
37
                           DI);
683
46
}
684
685
bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
686
53
                                DependenceInfo &DI) {
687
53
  /* We currently handle outer loops like this:
688
53
        |
689
53
    ForeFirst    <----\    }
690
53
     Blocks           |    } ForeBlocks
691
53
    ForeLast          |    }
692
53
        |             |
693
53
    SubLoopFirst  <\  |    }
694
53
     Blocks        |  |    } SubLoopBlocks
695
53
    SubLoopLast   -/  |    }
696
53
        |             |
697
53
    AftFirst          |    }
698
53
     Blocks           |    } AftBlocks
699
53
    AftLast     ------/    }
700
53
        |
701
53
702
53
    There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
703
53
    and AftBlocks, providing that there is one edge from Fores to SubLoops,
704
53
    one edge from SubLoops to Afts and a single outer loop exit (from Afts).
705
53
    In practice we currently limit Aft blocks to a single block, and limit
706
53
    things further in the profitablility checks of the unroll and jam pass.
707
53
708
53
    Because of the way we rearrange basic blocks, we also require that
709
53
    the Fore blocks on all unrolled iterations are safe to move before the
710
53
    SubLoop blocks of all iterations. So we require that the phi node looping
711
53
    operands of ForeHeader can be moved to at least the end of ForeEnd, so that
712
53
    we can arrange cloned Fore Blocks before the subloop and match up Phi's
713
53
    correctly.
714
53
715
53
    i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
716
53
    It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
717
53
718
53
    There are then a number of checks along the lines of no calls, no
719
53
    exceptions, inner loop IV is consistent, etc. Note that for loops requiring
720
53
    runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
721
53
    UnrollAndJamLoop if the trip count cannot be easily calculated.
722
53
  */
723
53
724
53
  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
725
0
    return false;
726
53
  Loop *SubLoop = L->getSubLoops()[0];
727
53
  if (!SubLoop->isLoopSimplifyForm())
728
0
    return false;
729
53
730
53
  BasicBlock *Header = L->getHeader();
731
53
  BasicBlock *Latch = L->getLoopLatch();
732
53
  BasicBlock *Exit = L->getExitingBlock();
733
53
  BasicBlock *SubLoopHeader = SubLoop->getHeader();
734
53
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
735
53
  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
736
53
737
53
  if (Latch != Exit)
738
0
    return false;
739
53
  if (SubLoopLatch != SubLoopExit)
740
0
    return false;
741
53
742
53
  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
743
0
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
744
0
    return false;
745
0
  }
746
53
747
53
  // Split blocks into Fore/SubLoop/Aft based on dominators
748
53
  BasicBlockSet SubLoopBlocks;
749
53
  BasicBlockSet ForeBlocks;
750
53
  BasicBlockSet AftBlocks;
751
53
  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
752
53
                                AftBlocks, &DT)) {
753
1
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
754
1
    return false;
755
1
  }
756
52
757
52
  // Aft blocks may need to move instructions to fore blocks, which becomes more
758
52
  // difficult if there are multiple (potentially conditionally executed)
759
52
  // blocks. For now we just exclude loops with multiple aft blocks.
760
52
  if (AftBlocks.size() != 1) {
761
1
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
762
1
                         "multiple blocks after the loop\n");
763
1
    return false;
764
1
  }
765
51
766
51
  // Check inner loop backedge count is consistent on all iterations of the
767
51
  // outer loop
768
51
  if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
769
1
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
770
1
                         "not consistent on each iteration\n");
771
1
    return false;
772
1
  }
773
50
774
50
  // Check the loop safety info for exceptions.
775
50
  SimpleLoopSafetyInfo LSI;
776
50
  LSI.computeLoopSafetyInfo(L);
777
50
  if (LSI.anyBlockMayThrow()) {
778
2
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
779
2
    return false;
780
2
  }
781
48
782
48
  // We've ruled out the easy stuff and now need to check that there are no
783
48
  // interdependencies which may prevent us from moving the:
784
48
  //  ForeBlocks before Subloop and AftBlocks.
785
48
  //  Subloop before AftBlocks.
786
48
  //  ForeBlock phi operands before the subloop
787
48
788
48
  // Make sure we can move all instructions we need to before the subloop
789
48
  if (!processHeaderPhiOperands(
790
108
          Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
791
108
            if (SubLoop->contains(I->getParent()))
792
0
              return false;
793
108
            if (AftBlocks.count(I->getParent())) {
794
52
              // If we hit a phi node in afts we know we are done (probably
795
52
              // LCSSA)
796
52
              if (isa<PHINode>(I))
797
1
                return false;
798
51
              // Can't move instructions with side effects or memory
799
51
              // reads/writes
800
51
              if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
801
1
                return false;
802
106
            }
803
106
            // Keep going
804
106
            return true;
805
106
          })) {
806
2
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
807
2
                         "instructions after subloop to before it\n");
808
2
    return false;
809
2
  }
810
46
811
46
  // Check for memory dependencies which prohibit the unrolling we are doing.
812
46
  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
813
46
  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
814
46
  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
815
12
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
816
12
    return false;
817
12
  }
818
34
819
34
  return true;
820
34
}