Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes whether a function is potentially memory bound and whether
/// a kernel may benefit from limiting the number of waves to reduce cache
/// thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/CallSite.h" // CallSite is used directly in visit() below.
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

char llvm::AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V;
    const Value *Base;
    int64_t Offset;
    MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is a large-stride memory access.
  /// The purpose is to identify memory access patterns like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory accesses will be
  /// marked as large-stride accesses.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isConstantAddr(const Value *V) const;
};

static const Value *getMemoryInstrPtr(const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst)) {
    return LI->getPointerOperand();
  }
  if (auto SI = dyn_cast<StoreInst>(Inst)) {
    return SI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
    return MI->getRawDest();
  }

  return nullptr;
}
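
// For example, for `store i32 %v, i32 addrspace(1)* %p` this returns %p, and
// for a memcpy intrinsic it returns the destination pointer. Non-memory
// instructions yield nullptr, which is how visit() below tells memory
// instructions apart from everything else.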

bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtr(Inst)) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << "  check: " << *V << '\n');

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
        LLVM_DEBUG(dbgs() << "    is IA\n");
        return true;
      }
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    LLVM_DEBUG(dbgs() << "    dropped\n");
  }

  LLVM_DEBUG(dbgs() << " is not IA\n");
  return false;
}
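
// An "indirect access" here is a global memory instruction whose address is
// itself produced by a load from global, local, or constant memory, e.g.
// (illustrative IR):
//   %ptr = load float*, float* addrspace(1)* %table ; address comes from memory
//   %val = load float, float* %ptr                  ; -> counted as indirect
// The worklist walk above only follows loads, GEPs, unary/cast instructions,
// binary operators, selects, and extractelement; any other producer ends the
// chain ("dropped").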

AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    for (auto &I : B) {
      if (getMemoryInstrPtr(&I)) {
        if (isIndirectAccess(&I))
          ++FI.IAMInstCount;
        if (isLargeStride(&I))
          ++FI.LSMInstCount;
        ++FI.MemInstCount;
        ++FI.InstCount;
        continue;
      }
      CallSite CS(const_cast<Instruction *>(&I));
      if (CS) {
        Function *Callee = CS.getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCount;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCount += Loc->second.MemInstCount;
        FI.InstCount += Loc->second.InstCount;
        FI.IAMInstCount += Loc->second.IAMInstCount;
        FI.LSMInstCount += Loc->second.LSMInstCount;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCount;
      } else {
        ++FI.InstCount;
      }
    }
  }

  return &FI;
}
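
// When a call to an already-analyzed callee is seen, the callee's counts are
// folded into the caller: e.g. a kernel whose only body is a call to a
// function with 40 memory instructions out of 50 total inherits those 40/50
// and is classified accordingly. Callee entries are available in FIM because
// the pass walks the call graph bottom-up (see runOnSCC below).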

bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
                    << '\n'
                    << " IAMInst: " << Info->IAMInstCount << '\n'
                    << " LSMInst: " << Info->LSMInstCount << '\n'
                    << " TotalInst: " << Info->InstCount << '\n');

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
  }

  return true;
}
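
// The results are communicated as function attributes ("amdgpu-memory-bound"
// and "amdgpu-wave-limiter") so that later stages of the AMDGPU backend can
// pick them up without re-running the analysis. The early-out above skips
// functions that already carry both attributes, e.g. when set externally.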

bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
}
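
// With the default threshold of 50, a function with 51 memory instructions
// out of 100 total yields 51 * 100 / 100 = 51 > 50 and is marked memory
// bound, while 50 out of 100 is not (the comparison is strict).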

bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
           FI.LSMInstCount * LSWeight) *
          100 / FI.InstCount) > LimitWaveThresh;
}
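
// With the default weights (IAWeight = LSWeight = 1000, threshold 50), a
// single indirect access in a 100-instruction kernel already trips the
// check: (1 + 1 * 1000) * 100 / 100 = 1001 > 50. Indirect and large-stride
// accesses are deliberately weighted far above plain memory instructions.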

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtr(Inst);

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}
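
// LastAccess is reset at every basic-block boundary in visit(), so the
// stride comparison is only ever made between two accesses in the same block
// whose addresses decompose to a base pointer plus a constant offset.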

bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    return As == AMDGPUAS::CONSTANT_ADDRESS ||
           As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  }
  return false;
}

bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
               << print() << "<=>\n"
               << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
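
// For example, two consecutive loads whose addresses decompose to the same
// base pointer at byte offsets 0 and 4000 (a[i] and a[i + 1000] with 4-byte
// elements) differ by 4000 > 64 (the default threshold), so the second load
// is counted as a large-stride access.
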
} // namespace

bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}
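
// As a CallGraphSCC pass, this is invoked on SCCs in bottom-up order, so by
// the time a caller is processed its (non-recursive) callees already have
// entries in FIM for visit() to aggregate.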

bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}
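
// A minimal sketch of how a client pass might query these results; the
// surrounding pass and the variable names are hypothetical, only
// isMemoryBound()/needsWaveLimiter() come from this file, and the client
// would need to addRequired<AMDGPUPerfHintAnalysis>() in its
// getAnalysisUsage():
//
//   AMDGPUPerfHintAnalysis &PHA = getAnalysis<AMDGPUPerfHintAnalysis>();
//   if (PHA.isMemoryBound(&F)) {
//     // e.g. bias scheduling toward latency hiding
//   }
//   if (PHA.needsWaveLimiter(&F)) {
//     // e.g. cap the number of waves to reduce cache thrashing
//   }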