Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file implements the transformation that optimizes memory intrinsics
10
// such as memcpy using the size value profile. When memory intrinsic size
11
// value profile metadata is available, a single memory intrinsic is expanded
12
// to a sequence of guarded specialized versions that are called with the
13
// hottest size(s), for later expansion into more optimal inline sequences.
14
//
15
//===----------------------------------------------------------------------===//
16
17
#include "llvm/ADT/ArrayRef.h"
18
#include "llvm/ADT/Statistic.h"
19
#include "llvm/ADT/StringRef.h"
20
#include "llvm/ADT/Twine.h"
21
#include "llvm/Analysis/BlockFrequencyInfo.h"
22
#include "llvm/Analysis/DomTreeUpdater.h"
23
#include "llvm/Analysis/GlobalsModRef.h"
24
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
25
#include "llvm/IR/BasicBlock.h"
26
#include "llvm/IR/CallSite.h"
27
#include "llvm/IR/DerivedTypes.h"
28
#include "llvm/IR/Dominators.h"
29
#include "llvm/IR/Function.h"
30
#include "llvm/IR/IRBuilder.h"
31
#include "llvm/IR/InstVisitor.h"
32
#include "llvm/IR/InstrTypes.h"
33
#include "llvm/IR/Instruction.h"
34
#include "llvm/IR/Instructions.h"
35
#include "llvm/IR/LLVMContext.h"
36
#include "llvm/IR/PassManager.h"
37
#include "llvm/IR/Type.h"
38
#include "llvm/Pass.h"
39
#include "llvm/PassRegistry.h"
40
#include "llvm/PassSupport.h"
41
#include "llvm/ProfileData/InstrProf.h"
42
#include "llvm/Support/Casting.h"
43
#include "llvm/Support/CommandLine.h"
44
#include "llvm/Support/Debug.h"
45
#include "llvm/Support/ErrorHandling.h"
46
#include "llvm/Support/MathExtras.h"
47
#include "llvm/Transforms/Instrumentation.h"
48
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
49
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
50
#include <cassert>
51
#include <cstdint>
52
#include <vector>
53
54
using namespace llvm;
55
56
4
#define DEBUG_TYPE "pgo-memop-opt"
57
58
STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
59
STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
60
61
// The minimum call count to optimize memory intrinsic calls.
62
static cl::opt<unsigned>
63
    MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
64
                        cl::init(1000),
65
                        cl::desc("The minimum count to optimize memory "
66
                                 "intrinsic calls"));
67
68
// Command line option to disable memory intrinsic optimization. The default is
69
// false. This is for debug purpose.
70
static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
71
                                     cl::Hidden, cl::desc("Disable optimize"));
72
73
// The percent threshold to optimize memory intrinsic calls.
74
static cl::opt<unsigned>
75
    MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
76
                          cl::Hidden, cl::ZeroOrMore,
77
                          cl::desc("The percentage threshold for the "
78
                                   "memory intrinsic calls optimization"));
79
80
// Maximum number of versions for optimizing memory intrinsic call.
81
static cl::opt<unsigned>
82
    MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
83
                    cl::ZeroOrMore,
84
                    cl::desc("The max version for the optimized memory "
85
                             " intrinsic calls"));
86
87
// Scale the counts from the annotation using the BB count value.
88
static cl::opt<bool>
89
    MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
90
                    cl::desc("Scale the memop size counts using the basic "
91
                             " block count value"));
92
93
// This option sets the rangge of precise profile memop sizes.
94
extern cl::opt<std::string> MemOPSizeRange;
95
96
// This option sets the value that groups large memop sizes
97
extern cl::opt<unsigned> MemOPSizeLarge;
98
99
namespace {
100
class PGOMemOPSizeOptLegacyPass : public FunctionPass {
101
public:
102
  static char ID;
103
104
11.6k
  PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
105
11.6k
    initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
106
11.6k
  }
107
108
462k
  StringRef getPassName() const override { return "PGOMemOPSize"; }
109
110
private:
111
  bool runOnFunction(Function &F) override;
112
11.6k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
113
11.6k
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
114
11.6k
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
115
11.6k
    AU.addPreserved<GlobalsAAWrapperPass>();
116
11.6k
    AU.addPreserved<DominatorTreeWrapperPass>();
117
11.6k
  }
118
};
119
} // end anonymous namespace
120
121
char PGOMemOPSizeOptLegacyPass::ID = 0;
122
22.3k
INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
123
22.3k
                      "Optimize memory intrinsic using its size value profile",
124
22.3k
                      false, false)
125
22.3k
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
126
22.3k
INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
127
                    "Optimize memory intrinsic using its size value profile",
128
                    false, false)
129
130
11.6k
FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
131
11.6k
  return new PGOMemOPSizeOptLegacyPass();
132
11.6k
}
133
134
namespace {
135
class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
136
public:
137
  MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
138
               OptimizationRemarkEmitter &ORE, DominatorTree *DT)
139
462k
      : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) {
140
462k
    ValueDataArray =
141
462k
        llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
142
462k
    // Get the MemOPSize range information from option MemOPSizeRange,
143
462k
    getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
144
462k
                                PreciseRangeLast);
145
462k
  }
146
462k
  bool isChanged() const { return Changed; }
147
462k
  void perform() {
148
462k
    WorkList.clear();
149
462k
    visit(Func);
150
462k
151
462k
    for (auto &MI : WorkList) {
152
3.88k
      ++NumOfPGOMemOPAnnotate;
153
3.88k
      if (perform(MI)) {
154
10
        Changed = true;
155
10
        ++NumOfPGOMemOPOpt;
156
10
        LLVM_DEBUG(dbgs() << "MemOP call: "
157
10
                          << MI->getCalledFunction()->getName()
158
10
                          << "is Transformed.\n");
159
10
      }
160
3.88k
    }
161
462k
  }
162
163
46.9k
  void visitMemIntrinsic(MemIntrinsic &MI) {
164
46.9k
    Value *Length = MI.getLength();
165
46.9k
    // Not perform on constant length calls.
166
46.9k
    if (dyn_cast<ConstantInt>(Length))
167
43.0k
      return;
168
3.88k
    WorkList.push_back(&MI);
169
3.88k
  }
170
171
private:
172
  Function &Func;
173
  BlockFrequencyInfo &BFI;
174
  OptimizationRemarkEmitter &ORE;
175
  DominatorTree *DT;
176
  bool Changed;
177
  std::vector<MemIntrinsic *> WorkList;
178
  // Start of the previse range.
179
  int64_t PreciseRangeStart;
180
  // Last value of the previse range.
181
  int64_t PreciseRangeLast;
182
  // The space to read the profile annotation.
183
  std::unique_ptr<InstrProfValueData[]> ValueDataArray;
184
  bool perform(MemIntrinsic *MI);
185
186
  // This kind shows which group the value falls in. For PreciseValue, we have
187
  // the profile count for that value. LargeGroup groups the values that are in
188
  // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
189
  enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
190
191
20
  MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
192
20
    if (Value == MemOPSizeLarge && 
MemOPSizeLarge != 00
)
193
0
      return LargeGroup;
194
20
    if (Value == PreciseRangeLast + 1)
195
0
      return NonLargeGroup;
196
20
    return PreciseValue;
197
20
  }
198
};
199
200
4
static const char *getMIName(const MemIntrinsic *MI) {
201
4
  switch (MI->getIntrinsicID()) {
202
4
  case Intrinsic::memcpy:
203
4
    return "memcpy";
204
4
  case Intrinsic::memmove:
205
0
    return "memmove";
206
4
  case Intrinsic::memset:
207
0
    return "memset";
208
4
  default:
209
0
    return "unknown";
210
4
  }
211
4
}
212
213
20
static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
214
20
  assert(Count <= TotalCount);
215
20
  if (Count < MemOPCountThreshold)
216
8
    return false;
217
12
  if (Count < TotalCount * MemOPPercentThreshold / 100)
218
0
    return false;
219
12
  return true;
220
12
}
221
222
static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
223
20
                                      uint64_t Denom) {
224
20
  if (!MemOPScaleCount)
225
0
    return Count;
226
20
  bool Overflowed;
227
20
  uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
228
20
  return ScaleCount / Denom;
229
20
}
230
231
3.88k
bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
232
3.88k
  assert(MI);
233
3.88k
  if (MI->getIntrinsicID() == Intrinsic::memmove)
234
853
    return false;
235
3.03k
236
3.03k
  uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
237
3.03k
  uint64_t TotalCount;
238
3.03k
  if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
239
3.03k
                                ValueDataArray.get(), NumVals, TotalCount))
240
3.01k
    return false;
241
12
242
12
  uint64_t ActualCount = TotalCount;
243
12
  uint64_t SavedTotalCount = TotalCount;
244
12
  if (MemOPScaleCount) {
245
12
    auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
246
12
    if (!BBEdgeCount)
247
0
      return false;
248
12
    ActualCount = *BBEdgeCount;
249
12
  }
250
12
251
12
  ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
252
12
  LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
253
12
                    << ActualCount << "\n");
254
12
  LLVM_DEBUG(
255
12
      for (auto &VD
256
12
           : VDs) { dbgs() << "  (" << VD.Value << "," << VD.Count << ")\n"; });
257
12
258
12
  if (ActualCount < MemOPCountThreshold)
259
0
    return false;
260
12
  // Skip if the total value profiled count is 0, in which case we can't
261
12
  // scale up the counts properly (and there is no profitable transformation).
262
12
  if (TotalCount == 0)
263
2
    return false;
264
10
265
10
  TotalCount = ActualCount;
266
10
  if (MemOPScaleCount)
267
10
    LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
268
10
                      << " denominator = " << SavedTotalCount << "\n");
269
10
270
10
  // Keeping track of the count of the default case:
271
10
  uint64_t RemainCount = TotalCount;
272
10
  uint64_t SavedRemainCount = SavedTotalCount;
273
10
  SmallVector<uint64_t, 16> SizeIds;
274
10
  SmallVector<uint64_t, 16> CaseCounts;
275
10
  uint64_t MaxCount = 0;
276
10
  unsigned Version = 0;
277
10
  // Default case is in the front -- save the slot here.
278
10
  CaseCounts.push_back(0);
279
20
  for (auto &VD : VDs) {
280
20
    int64_t V = VD.Value;
281
20
    uint64_t C = VD.Count;
282
20
    if (MemOPScaleCount)
283
20
      C = getScaledCount(C, ActualCount, SavedTotalCount);
284
20
285
20
    // Only care precise value here.
286
20
    if (getMemOPSizeKind(V) != PreciseValue)
287
0
      continue;
288
20
289
20
    // ValueCounts are sorted on the count. Break at the first un-profitable
290
20
    // value.
291
20
    if (!isProfitable(C, RemainCount))
292
8
      break;
293
12
294
12
    SizeIds.push_back(V);
295
12
    CaseCounts.push_back(C);
296
12
    if (C > MaxCount)
297
10
      MaxCount = C;
298
12
299
12
    assert(RemainCount >= C);
300
12
    RemainCount -= C;
301
12
    assert(SavedRemainCount >= VD.Count);
302
12
    SavedRemainCount -= VD.Count;
303
12
304
12
    if (++Version > MemOPMaxVersion && 
MemOPMaxVersion != 00
)
305
0
      break;
306
12
  }
307
10
308
10
  if (Version == 0)
309
0
    return false;
310
10
311
10
  CaseCounts[0] = RemainCount;
312
10
  if (RemainCount > MaxCount)
313
8
    MaxCount = RemainCount;
314
10
315
10
  uint64_t SumForOpt = TotalCount - RemainCount;
316
10
317
10
  LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
318
10
                    << " Versions (covering " << SumForOpt << " out of "
319
10
                    << TotalCount << ")\n");
320
10
321
10
  // mem_op(..., size)
322
10
  // ==>
323
10
  // switch (size) {
324
10
  //   case s1:
325
10
  //      mem_op(..., s1);
326
10
  //      goto merge_bb;
327
10
  //   case s2:
328
10
  //      mem_op(..., s2);
329
10
  //      goto merge_bb;
330
10
  //   ...
331
10
  //   default:
332
10
  //      mem_op(..., size);
333
10
  //      goto merge_bb;
334
10
  // }
335
10
  // merge_bb:
336
10
337
10
  BasicBlock *BB = MI->getParent();
338
10
  LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
339
10
  LLVM_DEBUG(dbgs() << *BB << "\n");
340
10
  auto OrigBBFreq = BFI.getBlockFreq(BB);
341
10
342
10
  BasicBlock *DefaultBB = SplitBlock(BB, MI, DT);
343
10
  BasicBlock::iterator It(*MI);
344
10
  ++It;
345
10
  assert(It != DefaultBB->end());
346
10
  BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
347
10
  MergeBB->setName("MemOP.Merge");
348
10
  BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
349
10
  DefaultBB->setName("MemOP.Default");
350
10
351
10
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
352
10
  auto &Ctx = Func.getContext();
353
10
  IRBuilder<> IRB(BB);
354
10
  BB->getTerminator()->eraseFromParent();
355
10
  Value *SizeVar = MI->getLength();
356
10
  SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
357
10
358
10
  // Clear the value profile data.
359
10
  MI->setMetadata(LLVMContext::MD_prof, nullptr);
360
10
  // If all promoted, we don't need the MD.prof metadata.
361
10
  if (SavedRemainCount > 0 || 
Version != NumVals2
)
362
8
    // Otherwise we need update with the un-promoted records back.
363
8
    annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
364
8
                      SavedRemainCount, IPVK_MemOPSize, NumVals);
365
10
366
10
  LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
367
10
368
10
  std::vector<DominatorTree::UpdateType> Updates;
369
10
  if (DT)
370
10
    Updates.reserve(2 * SizeIds.size());
371
10
372
12
  for (uint64_t SizeId : SizeIds) {
373
12
    BasicBlock *CaseBB = BasicBlock::Create(
374
12
        Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
375
12
    Instruction *NewInst = MI->clone();
376
12
    // Fix the argument.
377
12
    MemIntrinsic * MemI = dyn_cast<MemIntrinsic>(NewInst);
378
12
    IntegerType *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType());
379
12
    assert(SizeType && "Expected integer type size argument.");
380
12
    ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
381
12
    MemI->setLength(CaseSizeId);
382
12
    CaseBB->getInstList().push_back(NewInst);
383
12
    IRBuilder<> IRBCase(CaseBB);
384
12
    IRBCase.CreateBr(MergeBB);
385
12
    SI->addCase(CaseSizeId, CaseBB);
386
12
    if (DT) {
387
12
      Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
388
12
      Updates.push_back({DominatorTree::Insert, BB, CaseBB});
389
12
    }
390
12
    LLVM_DEBUG(dbgs() << *CaseBB << "\n");
391
12
  }
392
10
  DTU.applyUpdates(Updates);
393
10
  Updates.clear();
394
10
395
10
  setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
396
10
397
10
  LLVM_DEBUG(dbgs() << *BB << "\n");
398
10
  LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
399
10
  LLVM_DEBUG(dbgs() << *MergeBB << "\n");
400
10
401
10
  ORE.emit([&]() {
402
4
    using namespace ore;
403
4
    return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI)
404
4
             << "optimized " << NV("Intrinsic", StringRef(getMIName(MI)))
405
4
             << " with count " << NV("Count", SumForOpt) << " out of "
406
4
             << NV("Total", TotalCount) << " for " << NV("Versions", Version)
407
4
             << " versions";
408
4
  });
409
10
410
10
  return true;
411
10
}
412
} // namespace
413
414
static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
415
                                OptimizationRemarkEmitter &ORE,
416
462k
                                DominatorTree *DT) {
417
462k
  if (DisableMemOPOPT)
418
0
    return false;
419
462k
420
462k
  if (F.hasFnAttribute(Attribute::OptimizeForSize))
421
1
    return false;
422
462k
  MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT);
423
462k
  MemOPSizeOpt.perform();
424
462k
  return MemOPSizeOpt.isChanged();
425
462k
}
426
427
462k
bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
428
462k
  BlockFrequencyInfo &BFI =
429
462k
      getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
430
462k
  auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
431
462k
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
432
462k
  DominatorTree *DT = DTWP ? 
&DTWP->getDomTree()462k
:
nullptr1
;
433
462k
  return PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
434
462k
}
435
436
namespace llvm {
437
char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
438
439
PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
440
6
                                       FunctionAnalysisManager &FAM) {
441
6
  auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
442
6
  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
443
6
  auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
444
6
  bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
445
6
  if (!Changed)
446
4
    return PreservedAnalyses::all();
447
2
  auto PA = PreservedAnalyses();
448
2
  PA.preserve<GlobalsAA>();
449
2
  PA.preserve<DominatorTreeAnalysis>();
450
2
  return PA;
451
2
}
452
} // namespace llvm