Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Transforms/Scalar/LoopSink.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
2
//
3
//                     The LLVM Compiler Infrastructure
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
// This pass does the inverse transformation of what LICM does.
11
// It traverses all of the instructions in the loop's preheader and sinks
12
// them to the loop body where frequency is lower than the loop's preheader.
13
// This pass is a reverse-transformation of LICM. It differs from the Sink
14
// pass in the following ways:
15
//
16
// * It only handles sinking of instructions from the loop's preheader to the
17
//   loop's body
18
// * It uses alias set tracker to get more accurate alias info
19
// * It uses block frequency info to find the optimal sinking locations
20
//
21
// Overall algorithm:
22
//
23
// For I in Preheader:
24
//   InsertBBs = BBs that uses I
25
//   For BB in sorted(LoopBBs):
26
//     DomBBs = BBs in InsertBBs that are dominated by BB
27
//     if freq(DomBBs) > freq(BB)
28
//       InsertBBs = UseBBs - DomBBs + BB
29
//   For BB in InsertBBs:
30
//     Insert I at BB's beginning
31
//
32
//===----------------------------------------------------------------------===//
33
34
#include "llvm/Transforms/Scalar/LoopSink.h"
35
#include "llvm/ADT/Statistic.h"
36
#include "llvm/Analysis/AliasAnalysis.h"
37
#include "llvm/Analysis/AliasSetTracker.h"
38
#include "llvm/Analysis/BasicAliasAnalysis.h"
39
#include "llvm/Analysis/BlockFrequencyInfo.h"
40
#include "llvm/Analysis/Loads.h"
41
#include "llvm/Analysis/LoopInfo.h"
42
#include "llvm/Analysis/LoopPass.h"
43
#include "llvm/Analysis/ScalarEvolution.h"
44
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
45
#include "llvm/IR/Dominators.h"
46
#include "llvm/IR/Instructions.h"
47
#include "llvm/IR/LLVMContext.h"
48
#include "llvm/IR/Metadata.h"
49
#include "llvm/Support/CommandLine.h"
50
#include "llvm/Transforms/Scalar.h"
51
#include "llvm/Transforms/Scalar/LoopPassManager.h"
52
#include "llvm/Transforms/Utils/Local.h"
53
#include "llvm/Transforms/Utils/LoopUtils.h"
54
using namespace llvm;
55
56
#define DEBUG_TYPE "loopsink"
57
58
STATISTIC(NumLoopSunk, "Number of instructions sunk into loop");
59
STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop");
60
61
static cl::opt<unsigned> SinkFrequencyPercentThreshold(
62
    "sink-freq-percent-threshold", cl::Hidden, cl::init(90),
63
    cl::desc("Do not sink instructions that require cloning unless they "
64
             "execute less than this percent of the time."));
65
66
static cl::opt<unsigned> MaxNumberOfUseBBsForSinking(
67
    "max-uses-for-sinking", cl::Hidden, cl::init(30),
68
    cl::desc("Do not sink instructions that have too many uses."));
69
70
/// Return adjusted total frequency of \p BBs.
71
///
72
/// * If there is only one BB, sinking instruction will not introduce code
73
///   size increase. Thus there is no need to adjust the frequency.
74
/// * If there are more than one BB, sinking would lead to code size increase.
75
///   In this case, we add some "tax" to the total frequency to make it harder
76
///   to sink. E.g.
77
///     Freq(Preheader) = 100
78
///     Freq(BBs) = sum(50, 49) = 99
79
///   Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheade to
80
///   BBs as the difference is too small to justify the code size increase.
81
///   To model this, The adjusted Freq(BBs) will be:
82
///     AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
83
static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs,
84
24
                                      BlockFrequencyInfo &BFI) {
85
24
  BlockFrequency T = 0;
86
24
  for (BasicBlock *B : BBs)
87
30
    T += BFI.getBlockFreq(B);
88
24
  if (BBs.size() > 1)
89
6
    T /= BranchProbability(SinkFrequencyPercentThreshold, 100);
90
24
  return T;
91
24
}
92
93
/// Return a set of basic blocks to insert sinked instructions.
94
///
95
/// The returned set of basic blocks (BBsToSinkInto) should satisfy:
96
///
97
/// * Inside the loop \p L
98
/// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto
99
///   that domintates the UseBB
100
/// * Has minimum total frequency that is no greater than preheader frequency
101
///
102
/// The purpose of the function is to find the optimal sinking points to
103
/// minimize execution cost, which is defined as "sum of frequency of
104
/// BBsToSinkInto".
105
/// As a result, the returned BBsToSinkInto needs to have minimum total
106
/// frequency.
107
/// Additionally, if the total frequency of BBsToSinkInto exceeds preheader
108
/// frequency, the optimal solution is not sinking (return empty set).
109
///
110
/// \p ColdLoopBBs is used to help find the optimal sinking locations.
111
/// It stores a list of BBs that is:
112
///
113
/// * Inside the loop \p L
114
/// * Has a frequency no larger than the loop's preheader
115
/// * Sorted by BB frequency
116
///
117
/// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()).
118
/// To avoid expensive computation, we cap the maximum UseBBs.size() in its
119
/// caller.
120
static SmallPtrSet<BasicBlock *, 2>
121
findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
122
                  const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
123
10
                  DominatorTree &DT, BlockFrequencyInfo &BFI) {
124
10
  SmallPtrSet<BasicBlock *, 2> BBsToSinkInto;
125
10
  if (UseBBs.size() == 0)
126
0
    return BBsToSinkInto;
127
10
128
10
  BBsToSinkInto.insert(UseBBs.begin(), UseBBs.end());
129
10
  SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB;
130
10
131
10
  // For every iteration:
132
10
  //   * Pick the ColdestBB from ColdLoopBBs
133
10
  //   * Find the set BBsDominatedByColdestBB that satisfy:
134
10
  //     - BBsDominatedByColdestBB is a subset of BBsToSinkInto
135
10
  //     - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB
136
10
  //   * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove
137
10
  //     BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to
138
10
  //     BBsToSinkInto
139
18
  for (BasicBlock *ColdestBB : ColdLoopBBs) {
140
18
    BBsDominatedByColdestBB.clear();
141
18
    for (BasicBlock *SinkedBB : BBsToSinkInto)
142
32
      
if (32
DT.dominates(ColdestBB, SinkedBB)32
)
143
16
        BBsDominatedByColdestBB.insert(SinkedBB);
144
18
    if (BBsDominatedByColdestBB.size() == 0)
145
4
      continue;
146
14
    
if (14
adjustedSumFreq(BBsDominatedByColdestBB, BFI) >
147
14
        BFI.getBlockFreq(ColdestBB)) {
148
4
      for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) {
149
4
        BBsToSinkInto.erase(DominatedBB);
150
4
      }
151
2
      BBsToSinkInto.insert(ColdestBB);
152
2
    }
153
18
  }
154
10
155
10
  // If the total frequency of BBsToSinkInto is larger than preheader frequency,
156
10
  // do not sink.
157
10
  if (adjustedSumFreq(BBsToSinkInto, BFI) >
158
10
      BFI.getBlockFreq(L.getLoopPreheader()))
159
2
    BBsToSinkInto.clear();
160
10
  return BBsToSinkInto;
161
10
}
162
163
// Sinks \p I from the loop \p L's preheader to its uses. Returns true if
164
// sinking is successful.
165
// \p LoopBlockNumber is used to sort the insertion blocks to ensure
166
// determinism.
167
static bool sinkInstruction(Loop &L, Instruction &I,
168
                            const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
169
                            const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber,
170
                            LoopInfo &LI, DominatorTree &DT,
171
10
                            BlockFrequencyInfo &BFI) {
172
10
  // Compute the set of blocks in loop L which contain a use of I.
173
10
  SmallPtrSet<BasicBlock *, 2> BBs;
174
18
  for (auto &U : I.uses()) {
175
18
    Instruction *UI = cast<Instruction>(U.getUser());
176
18
    // We cannot sink I to PHI-uses.
177
18
    if (dyn_cast<PHINode>(UI))
178
0
      return false;
179
18
    // We cannot sink I if it has uses outside of the loop.
180
18
    
if (18
!L.contains(LI.getLoopFor(UI->getParent()))18
)
181
0
      return false;
182
18
    BBs.insert(UI->getParent());
183
18
  }
184
10
185
10
  // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
186
10
  // BBs.size() to avoid expensive computation.
187
10
  // FIXME: Handle code size growth for min_size and opt_size.
188
10
  
if (10
BBs.size() > MaxNumberOfUseBBsForSinking10
)
189
0
    return false;
190
10
191
10
  // Find the set of BBs that we should insert a copy of I.
192
10
  SmallPtrSet<BasicBlock *, 2> BBsToSinkInto =
193
10
      findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI);
194
10
  if (BBsToSinkInto.empty())
195
2
    return false;
196
8
197
8
  // Copy the final BBs into a vector and sort them using the total ordering
198
8
  // of the loop block numbers as iterating the set doesn't give a useful
199
8
  // order. No need to stable sort as the block numbers are a total ordering.
200
8
  SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
201
8
  SortedBBsToSinkInto.insert(SortedBBsToSinkInto.begin(), BBsToSinkInto.begin(),
202
8
                             BBsToSinkInto.end());
203
8
  std::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(),
204
2
            [&](BasicBlock *A, BasicBlock *B) {
205
2
              return *LoopBlockNumber.find(A) < *LoopBlockNumber.find(B);
206
2
            });
207
8
208
8
  BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
209
8
  // FIXME: Optimize the efficiency for cloned value replacement. The current
210
8
  //        implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
211
10
  for (BasicBlock *N : SortedBBsToSinkInto) {
212
10
    if (N == MoveBB)
213
8
      continue;
214
2
    // Clone I and replace its uses.
215
2
    Instruction *IC = I.clone();
216
2
    IC->setName(I.getName());
217
2
    IC->insertBefore(&*N->getFirstInsertionPt());
218
2
    // Replaces uses of I with IC in N
219
6
    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); 
UI != UE6
;) {
220
4
      Use &U = *UI++;
221
4
      auto *I = cast<Instruction>(U.getUser());
222
4
      if (I->getParent() == N)
223
2
        U.set(IC);
224
4
    }
225
2
    // Replaces uses of I with IC in blocks dominated by N
226
2
    replaceDominatedUsesWith(&I, IC, DT, N);
227
2
    DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
228
10
                 << '\n');
229
10
    NumLoopSunkCloned++;
230
10
  }
231
8
  DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
232
10
  NumLoopSunk++;
233
10
  I.moveBefore(&*MoveBB->getFirstInsertionPt());
234
10
235
10
  return true;
236
10
}
237
238
/// Sinks instructions from loop's preheader to the loop body if the
239
/// sum frequency of inserted copy is smaller than preheader's frequency.
240
static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
241
                                          DominatorTree &DT,
242
                                          BlockFrequencyInfo &BFI,
243
312k
                                          ScalarEvolution *SE) {
244
312k
  BasicBlock *Preheader = L.getLoopPreheader();
245
312k
  if (!Preheader)
246
0
    return false;
247
312k
248
312k
  // Enable LoopSink only when runtime profile is available.
249
312k
  // With static profile, the sinking decision may be sub-optimal.
250
312k
  
if (312k
!Preheader->getParent()->getEntryCount()312k
)
251
312k
    return false;
252
19
253
19
  const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
254
19
  // If there are no basic blocks with lower frequency than the preheader then
255
19
  // we can avoid the detailed analysis as we will never find profitable sinking
256
19
  // opportunities.
257
19
  if (
all_of(L.blocks(), [&](const BasicBlock *BB) 19
{
258
41
        return BFI.getBlockFreq(BB) > PreheaderFreq;
259
41
      }))
260
7
    return false;
261
12
262
12
  bool Changed = false;
263
12
  AliasSetTracker CurAST(AA);
264
12
265
12
  // Compute alias set.
266
12
  for (BasicBlock *BB : L.blocks())
267
64
    CurAST.add(*BB);
268
12
269
12
  // Sort loop's basic blocks by frequency
270
12
  SmallVector<BasicBlock *, 10> ColdLoopBBs;
271
12
  SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber;
272
12
  int i = 0;
273
12
  for (BasicBlock *B : L.blocks())
274
64
    
if (64
BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())64
) {
275
26
      ColdLoopBBs.push_back(B);
276
26
      LoopBlockNumber[B] = ++i;
277
26
    }
278
12
  std::stable_sort(ColdLoopBBs.begin(), ColdLoopBBs.end(),
279
20
                   [&](BasicBlock *A, BasicBlock *B) {
280
20
                     return BFI.getBlockFreq(A) < BFI.getBlockFreq(B);
281
20
                   });
282
12
283
12
  // Traverse preheader's instructions in reverse order becaue if A depends
284
12
  // on B (A appears after B), A needs to be sinked first before B can be
285
12
  // sinked.
286
36
  for (auto II = Preheader->rbegin(), E = Preheader->rend(); 
II != E36
;) {
287
24
    Instruction *I = &*II++;
288
24
    // No need to check for instruction's operands are loop invariant.
289
24
    assert(L.hasLoopInvariantOperands(I) &&
290
24
           "Insts in a loop's preheader should have loop invariant operands!");
291
24
    if (!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr))
292
14
      continue;
293
10
    
if (10
sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI)10
)
294
8
      Changed = true;
295
24
  }
296
12
297
12
  if (
Changed && 12
SE8
)
298
4
    SE->forgetLoopDispositions(&L);
299
312k
  return Changed;
300
312k
}
301
302
77
PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
303
77
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
304
77
  // Nothing to do if there are no loops.
305
77
  if (LI.empty())
306
49
    return PreservedAnalyses::all();
307
28
308
28
  AAResults &AA = FAM.getResult<AAManager>(F);
309
28
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
310
28
  BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
311
28
312
28
  // We want to do a postorder walk over the loops. Since loops are a tree this
313
28
  // is equivalent to a reversed preorder walk and preorder is easy to compute
314
28
  // without recursion. Since we reverse the preorder, we will visit siblings
315
28
  // in reverse program order. This isn't expected to matter at all but is more
316
28
  // consistent with sinking algorithms which generally work bottom-up.
317
28
  SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder();
318
28
319
28
  bool Changed = false;
320
29
  do {
321
29
    Loop &L = *PreorderLoops.pop_back_val();
322
29
323
29
    // Note that we don't pass SCEV here because it is only used to invalidate
324
29
    // loops in SCEV and we don't preserve (or request) SCEV at all making that
325
29
    // unnecessary.
326
29
    Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI,
327
29
                                             /*ScalarEvolution*/ nullptr);
328
29
  } while (!PreorderLoops.empty());
329
28
330
28
  if (!Changed)
331
24
    return PreservedAnalyses::all();
332
4
333
4
  PreservedAnalyses PA;
334
4
  PA.preserveSet<CFGAnalyses>();
335
4
  return PA;
336
4
}
337
338
namespace {
339
struct LegacyLoopSinkPass : public LoopPass {
340
  static char ID;
341
17.2k
  LegacyLoopSinkPass() : LoopPass(ID) {
342
17.2k
    initializeLegacyLoopSinkPassPass(*PassRegistry::getPassRegistry());
343
17.2k
  }
344
345
312k
  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
346
312k
    if (skipLoop(L))
347
23
      return false;
348
312k
349
312k
    auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
350
312k
    return sinkLoopInvariantInstructions(
351
312k
        *L, getAnalysis<AAResultsWrapperPass>().getAAResults(),
352
312k
        getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
353
312k
        getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
354
312k
        getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(),
355
312k
        SE ? 
&SE->getSE()312k
:
nullptr0
);
356
312k
  }
357
358
17.2k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
359
17.2k
    AU.setPreservesCFG();
360
17.2k
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
361
17.2k
    getLoopAnalysisUsage(AU);
362
17.2k
  }
363
};
364
}
365
366
char LegacyLoopSinkPass::ID = 0;
367
41.6k
INITIALIZE_PASS_BEGIN41.6k
(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false,
368
41.6k
                      false)
369
41.6k
INITIALIZE_PASS_DEPENDENCY(LoopPass)
370
41.6k
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
371
41.6k
INITIALIZE_PASS_END(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, false)
372
373
17.2k
Pass *llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); }