Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
 Line|  Count|Source
    1|       |//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
    2|       |//
    3|       |// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    4|       |// See https://llvm.org/LICENSE.txt for license information.
    5|       |// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    6|       |//
    7|       |//===----------------------------------------------------------------------===//
    8|       |//
    9|       |/// \file This pass adds target attributes to functions which use intrinsics
   10|       |/// which will impact calling convention lowering.
   11|       |//
   12|       |//===----------------------------------------------------------------------===//
   13|       |
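
The \file comment above is the whole contract: the pass only attaches string function attributes, which calling-convention lowering later reads back. As a minimal sketch (the helper name is hypothetical; the attribute string and Function API are the ones used throughout the listing below), the per-function effect looks like:

    #include "llvm/IR/Function.h"

    // Hypothetical helper, not part of the pass: mark a function the way the
    // pass does when it sees a use of llvm.amdgcn.workitem.id.y.
    static void markNeedsWorkItemIdY(llvm::Function &F) {
      if (!F.hasFnAttribute("amdgpu-work-item-id-y"))
        F.addFnAttr("amdgpu-work-item-id-y");
    }
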
   14|       |#include "AMDGPU.h"
   15|       |#include "AMDGPUSubtarget.h"
   16|       |#include "Utils/AMDGPUBaseInfo.h"
   17|       |#include "llvm/ADT/SmallPtrSet.h"
   18|       |#include "llvm/ADT/SmallVector.h"
   19|       |#include "llvm/ADT/StringRef.h"
   20|       |#include "llvm/ADT/Triple.h"
   21|       |#include "llvm/Analysis/CallGraph.h"
   22|       |#include "llvm/Analysis/CallGraphSCCPass.h"
   23|       |#include "llvm/CodeGen/TargetPassConfig.h"
   24|       |#include "llvm/IR/CallSite.h"
   25|       |#include "llvm/IR/Constant.h"
   26|       |#include "llvm/IR/Constants.h"
   27|       |#include "llvm/IR/Function.h"
   28|       |#include "llvm/IR/Instruction.h"
   29|       |#include "llvm/IR/Instructions.h"
   30|       |#include "llvm/IR/Intrinsics.h"
   31|       |#include "llvm/IR/Module.h"
   32|       |#include "llvm/IR/Type.h"
   33|       |#include "llvm/IR/Use.h"
   34|       |#include "llvm/Pass.h"
   35|       |#include "llvm/Support/Casting.h"
   36|       |#include "llvm/Support/ErrorHandling.h"
   37|       |#include "llvm/Target/TargetMachine.h"
   38|       |
   39|       |#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
   40|       |
   41|       |using namespace llvm;
   42|       |
   43|       |namespace {
   44|       |
   45|       |class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
   46|       |private:
   47|       |  const TargetMachine *TM = nullptr;
   48|       |  SmallVector<CallGraphNode*, 8> NodeList;
   49|       |
   50|       |  bool addFeatureAttributes(Function &F);
   51|       |  bool processUniformWorkGroupAttribute();
   52|       |  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
   53|       |
   54|       |public:
   55|       |  static char ID;
   56|       |
   57|  2.45k|  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
   58|       |
   59|       |  bool doInitialization(CallGraph &CG) override;
   60|       |  bool runOnSCC(CallGraphSCC &SCC) override;
   61|       |
   62|      0|  StringRef getPassName() const override {
   63|      0|    return "AMDGPU Annotate Kernel Features";
   64|      0|  }
   65|       |
   66|  2.43k|  void getAnalysisUsage(AnalysisUsage &AU) const override {
   67|  2.43k|    AU.setPreservesAll();
   68|  2.43k|    CallGraphSCCPass::getAnalysisUsage(AU);
   69|  2.43k|  }
   70|       |
   71|       |  static bool visitConstantExpr(const ConstantExpr *CE);
   72|       |  static bool visitConstantExprsRecursively(
   73|       |    const Constant *EntryC,
   74|       |    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
   75|       |};
   76|       |
   77|       |} // end anonymous namespace
   78|       |
   79|       |char AMDGPUAnnotateKernelFeatures::ID = 0;
   80|       |
   81|       |char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
   82|       |
   83|       |INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
   84|       |                "Add AMDGPU function attributes", false, false)
   85|       |
   86|       |
   87|       |// The queue ptr is only needed when casting to flat, not from it.
   88|    100|static bool castRequiresQueuePtr(unsigned SrcAS) {
   89|    100|  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
   90|    100|}
   91|       |
   92|     75|static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
   93|     75|  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
   94|     75|}
   95|       |
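
The comment on line 87 carries the key fact: a cast *into* the flat address space from local or private needs the aperture base of the source segment, and when the subtarget lacks aperture registers that base is read via the queue pointer (addFeatureAttributes below skips this whole check when HasApertureRegs is set). A hedged restatement of the classifier (the function name is hypothetical; AMDGPUAS comes from "AMDGPU.h" as above):

    #include "AMDGPU.h"
    #include "llvm/IR/Instructions.h"

    // Hypothetical mirror of castRequiresQueuePtr: true only when the
    // *source* address space is local or private; casts from flat need nothing.
    static bool exampleNeedsQueuePtr(const llvm::AddrSpaceCastInst &ASC) {
      unsigned SrcAS = ASC.getSrcAddressSpace();
      return SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
             SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
    }
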
   96|    220|bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
   97|    220|  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
   98|     25|    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
   99|     25|    return castRequiresQueuePtr(SrcAS);
  100|     25|  }
  101|    195|
  102|    195|  return false;
  103|    195|}
  104|       |
  105|       |bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  106|       |  const Constant *EntryC,
  107|  57.8k|  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
  108|  57.8k|
  109|  57.8k|  if (!ConstantExprVisited.insert(EntryC).second)
  110|  19.0k|    return false;
  111|  38.7k|
  112|  38.7k|  SmallVector<const Constant *, 16> Stack;
  113|  38.7k|  Stack.push_back(EntryC);
  114|  38.7k|
  115|  78.8k|  while (!Stack.empty()) {
  116|  40.1k|    const Constant *C = Stack.pop_back_val();
  117|  40.1k|
  118|  40.1k|    // Check this constant expression.
  119|  40.1k|    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
  120|    220|      if (visitConstantExpr(CE))
  121|     15|        return true;
  122|  40.1k|    }
  123|  40.1k|
  124|  40.1k|    // Visit all sub-expressions.
  125|  40.1k|    for (const Use &U : C->operands()) {
  126|  3.75k|      const auto *OpC = dyn_cast<Constant>(U);
  127|  3.75k|      if (!OpC)
  128|      0|        continue;
  129|  3.75k|
  130|  3.75k|      if (!ConstantExprVisited.insert(OpC).second)
  131|  2.39k|        continue;
  132|  1.36k|
  133|  1.36k|      Stack.push_back(OpC);
  134|  1.36k|    }
  135|  40.1k|  }
  136|  38.7k|
  137|  38.7k|  return false;
  138|  38.7k|}
  139|       |
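
visitConstantExprsRecursively is recursive in name only: it pairs an explicit SmallVector stack with a SmallPtrSet visited set, so deeply nested or heavily shared constant expressions are each walked once and never overflow the call stack. A generic sketch of the same idiom, assuming a caller-supplied predicate (the template and all names here are hypothetical):

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Constants.h"

    // Hypothetical generalization of the visited-set + worklist walk above.
    template <typename PredT>
    static bool anyConstantMatches(const llvm::Constant *Root, PredT Pred) {
      llvm::SmallPtrSet<const llvm::Constant *, 8> Visited;
      llvm::SmallVector<const llvm::Constant *, 16> Stack;
      Stack.push_back(Root);
      while (!Stack.empty()) {
        const llvm::Constant *C = Stack.pop_back_val();
        if (!Visited.insert(C).second)
          continue; // shared subexpression: already walked once
        if (Pred(C))
          return true;
        for (const llvm::Use &U : C->operands())
          if (const auto *OpC = llvm::dyn_cast<llvm::Constant>(U))
            Stack.push_back(OpC);
      }
      return false;
    }
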
  140|       |// We do not need to note the x workitem or workgroup id because they are always
  141|       |// initialized.
  142|       |//
  143|       |// TODO: We should not add the attributes if the known compile time workgroup
  144|       |// size is 1 for y/z.
  145|       |static StringRef intrinsicToAttrName(Intrinsic::ID ID,
  146|       |                                     bool &NonKernelOnly,
  147|  16.5k|                                     bool &IsQueuePtr) {
  148|  16.5k|  switch (ID) {
  149|  16.5k|  case Intrinsic::amdgcn_workitem_id_x:
  150|  3.29k|    NonKernelOnly = true;
  151|  3.29k|    return "amdgpu-work-item-id-x";
  152|  16.5k|  case Intrinsic::amdgcn_workgroup_id_x:
  153|     64|    NonKernelOnly = true;
  154|     64|    return "amdgpu-work-group-id-x";
  155|  16.5k|  case Intrinsic::amdgcn_workitem_id_y:
  156|    167|  case Intrinsic::r600_read_tidig_y:
  157|    167|    return "amdgpu-work-item-id-y";
  158|    167|  case Intrinsic::amdgcn_workitem_id_z:
  159|     91|  case Intrinsic::r600_read_tidig_z:
  160|     91|    return "amdgpu-work-item-id-z";
  161|     91|  case Intrinsic::amdgcn_workgroup_id_y:
  162|     53|  case Intrinsic::r600_read_tgid_y:
  163|     53|    return "amdgpu-work-group-id-y";
  164|     53|  case Intrinsic::amdgcn_workgroup_id_z:
  165|     44|  case Intrinsic::r600_read_tgid_z:
  166|     44|    return "amdgpu-work-group-id-z";
  167|     53|  case Intrinsic::amdgcn_dispatch_ptr:
  168|     53|    return "amdgpu-dispatch-ptr";
  169|     44|  case Intrinsic::amdgcn_dispatch_id:
  170|     11|    return "amdgpu-dispatch-id";
  171|     44|  case Intrinsic::amdgcn_kernarg_segment_ptr:
  172|     30|    return "amdgpu-kernarg-segment-ptr";
  173|     57|  case Intrinsic::amdgcn_implicitarg_ptr:
  174|     57|    return "amdgpu-implicitarg-ptr";
  175|     51|  case Intrinsic::amdgcn_queue_ptr:
  176|     51|  case Intrinsic::trap:
  177|     51|  case Intrinsic::debugtrap:
  178|     51|    IsQueuePtr = true;
  179|     51|    return "amdgpu-queue-ptr";
  180|  12.6k|  default:
  181|  12.6k|    return "";
  182|  16.5k|  }
  183|  16.5k|}
  184|       |
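
A hedged usage sketch of intrinsicToAttrName (the wrapper is hypothetical): the return value is the attribute to add, "" means "no attribute", NonKernelOnly flags the x ids that kernels get implicitly, and IsQueuePtr is only set for the queue-ptr/trap cases:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/IR/Intrinsics.h"

    // Hypothetical caller, not in this file: query one intrinsic.
    static void exampleQuery() {
      bool NonKernelOnly = false;
      bool IsQueuePtr = false;
      llvm::StringRef Attr = intrinsicToAttrName(
          llvm::Intrinsic::amdgcn_workgroup_id_y, NonKernelOnly, IsQueuePtr);
      // Attr == "amdgpu-work-group-id-y"; both flags remain false here.
      (void)Attr;
    }
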
  185|       |static bool handleAttr(Function &Parent, const Function &Callee,
  186|  8.25k|                       StringRef Name) {
  187|  8.25k|  if (Callee.hasFnAttribute(Name)) {
  188|    151|    Parent.addFnAttr(Name);
  189|    151|    return true;
  190|    151|  }
  191|  8.09k|  return false;
  192|  8.09k|}
  193|       |
  194|       |static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
  195|    750|                                   bool &NeedQueuePtr) {
  196|    750|  // X ids unnecessarily propagated to kernels.
  197|    750|  static const StringRef AttrNames[] = {
  198|    750|    { "amdgpu-work-item-id-x" },
  199|    750|    { "amdgpu-work-item-id-y" },
  200|    750|    { "amdgpu-work-item-id-z" },
  201|    750|    { "amdgpu-work-group-id-x" },
  202|    750|    { "amdgpu-work-group-id-y" },
  203|    750|    { "amdgpu-work-group-id-z" },
  204|    750|    { "amdgpu-dispatch-ptr" },
  205|    750|    { "amdgpu-dispatch-id" },
  206|    750|    { "amdgpu-kernarg-segment-ptr" },
  207|    750|    { "amdgpu-implicitarg-ptr" }
  208|    750|  };
  209|    750|
  210|    750|  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
  211|     11|    NeedQueuePtr = true;
  212|    750|
  213|    750|  for (StringRef AttrName : AttrNames)
  214|  7.50k|    handleAttr(Parent, Callee, AttrName);
  215|    750|}
  216|       |
  217|  2.43k|bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  218|  2.43k|  bool Changed = false;
  219|  2.43k|
  220|  30.7k|  for (auto *Node : reverse(NodeList)) {
  221|  30.7k|    Function *Caller = Node->getFunction();
  222|  30.7k|
  223|  30.7k|    for (auto I : *Node) {
  224|  2.60k|      Function *Callee = std::get<1>(I)->getFunction();
  225|  2.60k|      if (Callee)
  226|    750|        Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
  227|  2.60k|    }
  228|  30.7k|  }
  229|  2.43k|
  230|  2.43k|  return Changed;
  231|  2.43k|}
  232|       |
  233|       |bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
  234|    750|       Function &Caller, Function &Callee) {
  235|    750|
  236|    750|  // Check for externally defined function
  237|    750|  if (!Callee.hasExactDefinition()) {
  238|    461|    Callee.addFnAttr("uniform-work-group-size", "false");
  239|    461|    if (!Caller.hasFnAttribute("uniform-work-group-size"))
  240|    443|      Caller.addFnAttr("uniform-work-group-size", "false");
  241|    461|
  242|    461|    return true;
  243|    461|  }
  244|    289|  // Check if the Caller has the attribute
  245|    289|  if (Caller.hasFnAttribute("uniform-work-group-size")) {
  246|     56|    // Check if the value of the attribute is true
  247|     56|    if (Caller.getFnAttribute("uniform-work-group-size")
  248|     56|        .getValueAsString().equals("true")) {
  249|      6|      // Propagate the attribute to the Callee, if it does not have it
  250|      6|      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
  251|      3|        Callee.addFnAttr("uniform-work-group-size", "true");
  252|      3|        return true;
  253|      3|      }
  254|     50|    } else {
  255|     50|      Callee.addFnAttr("uniform-work-group-size", "false");
  256|     50|      return true;
  257|     50|    }
  258|    233|  } else {
  259|    233|    // If the attribute is absent, set it as false
  260|    233|    Caller.addFnAttr("uniform-work-group-size", "false");
  261|    233|    Callee.addFnAttr("uniform-work-group-size", "false");
  262|    233|    return true;
  263|    233|  }
  264|      3|  return false;
  265|      3|}
  266|       |
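
The propagation above resolves "uniform-work-group-size" pairwise per call edge: an externally defined callee forces "false" onto both sides; a caller already marked "true" hands "true" to a callee that has no value yet; a caller marked "false" (or not marked at all) forces "false". A consumer would read the result roughly like this (hypothetical helper; the attribute accessors are the same ones used above):

    #include "llvm/IR/Function.h"

    // Hypothetical reader: the attribute is string-valued, so consumers
    // compare its value against "true".
    static bool hasUniformWorkGroupSize(const llvm::Function &F) {
      return F.hasFnAttribute("uniform-work-group-size") &&
             F.getFnAttribute("uniform-work-group-size")
                 .getValueAsString()
                 .equals("true");
    }
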
  267|  25.3k|bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  268|  25.3k|  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  269|  25.3k|  bool HasFlat = ST.hasFlatAddressSpace();
  270|  25.3k|  bool HasApertureRegs = ST.hasApertureRegs();
  271|  25.3k|  SmallPtrSet<const Constant *, 8> ConstantExprVisited;
  272|  25.3k|
  273|  25.3k|  bool Changed = false;
  274|  25.3k|  bool NeedQueuePtr = false;
  275|  25.3k|  bool HaveCall = false;
  276|  25.3k|  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
  277|  25.3k|
  278|  28.2k|  for (BasicBlock &BB : F) {
  279|   163k|    for (Instruction &I : BB) {
  280|   163k|      CallSite CS(&I);
  281|   163k|      if (CS) {
  282|  18.8k|        Function *Callee = CS.getCalledFunction();
  283|  18.8k|
  284|  18.8k|        // TODO: Do something with indirect calls.
  285|  18.8k|        if (!Callee) {
  286|  1.50k|          if (!CS.isInlineAsm())
  287|      1|            HaveCall = true;
  288|  1.50k|          continue;
  289|  1.50k|        }
  290|  17.3k|
  291|  17.3k|        Intrinsic::ID IID = Callee->getIntrinsicID();
  292|  17.3k|        if (IID == Intrinsic::not_intrinsic) {
  293|    750|          HaveCall = true;
  294|    750|          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
  295|    750|          Changed = true;
  296|  16.5k|        } else {
  297|  16.5k|          bool NonKernelOnly = false;
  298|  16.5k|          StringRef AttrName = intrinsicToAttrName(IID,
  299|  16.5k|                                                   NonKernelOnly, NeedQueuePtr);
  300|  16.5k|          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
  301|    598|            F.addFnAttr(AttrName);
  302|    598|            Changed = true;
  303|    598|          }
  304|  16.5k|        }
  305|  17.3k|      }
  306|   163k|
  307|   163k|      if (NeedQueuePtr || HasApertureRegs)
  308|  47.1k|        continue;
  309|   115k|
  310|   115k|      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
  311|     75|        if (castRequiresQueuePtr(ASC)) {
  312|     22|          NeedQueuePtr = true;
  313|     22|          continue;
  314|     22|        }
  315|   115k|      }
  316|   115k|
  317|   194k|      for (const Use &U : I.operands()) {
  318|   194k|        const auto *OpC = dyn_cast<Constant>(U);
  319|   194k|        if (!OpC)
  320|   136k|          continue;
  321|  57.8k|
  322|  57.8k|        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
  323|     15|          NeedQueuePtr = true;
  324|     15|          break;
  325|     15|        }
  326|  57.8k|      }
  327|   115k|    }
  328|  28.2k|  }
  329|  25.3k|
  330|  25.3k|  if (NeedQueuePtr) {
  331|     99|    F.addFnAttr("amdgpu-queue-ptr");
  332|     99|    Changed = true;
  333|     99|  }
  334|  25.3k|
  335|  25.3k|  // TODO: We could refine this to captured pointers that could possibly be
  336|  25.3k|  // accessed by flat instructions. For now this is mostly a poor way of
  337|  25.3k|  // estimating whether there are calls before argument lowering.
  338|  25.3k|  if (HasFlat && !IsFunc && HaveCall) {
  339|    485|    F.addFnAttr("amdgpu-flat-scratch");
  340|    485|    Changed = true;
  341|    485|  }
  342|  25.3k|
  343|  25.3k|  return Changed;
  344|  25.3k|}
  345|       |
  346|  33.1k|bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  347|  33.1k|  bool Changed = false;
  348|  33.1k|
  349|  33.1k|  for (CallGraphNode *I : SCC) {
  350|  33.1k|    // Build a list of CallGraphNodes from most number of uses to least
  351|  33.1k|    if (I->getNumReferences())
  352|  30.7k|      NodeList.push_back(I);
  353|  2.43k|    else {
  354|  2.43k|      processUniformWorkGroupAttribute();
  355|  2.43k|      NodeList.clear();
  356|  2.43k|    }
  357|  33.1k|
  358|  33.1k|    Function *F = I->getFunction();
  359|  33.1k|    // Add feature attributes
  360|  33.1k|    if (!F || F->isDeclaration())
  361|  7.87k|      continue;
  362|  25.3k|    Changed |= addFeatureAttributes(*F);
  363|  25.3k|  }
  364|  33.1k|
  365|  33.1k|  return Changed;
  366|  33.1k|}
  367|       |
  368|  2.43k|bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  369|  2.43k|  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  370|  2.43k|  if (!TPC)
  371|      0|    report_fatal_error("TargetMachine is required");
  372|  2.43k|
  373|  2.43k|  TM = &TPC->getTM<TargetMachine>();
  374|  2.43k|  return false;
  375|  2.43k|}
  376|       |
  377|  2.44k|Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  378|  2.44k|  return new AMDGPUAnnotateKernelFeatures();
  379|  2.44k|}
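
Finally, a hedged sketch of driving the pass through the factory above. doInitialization issues report_fatal_error("TargetMachine is required") when no TargetPassConfig is available, so in practice the pass runs inside the AMDGPU codegen pipeline; a bare legacy PassManager as sketched here only works if a TargetPassConfig is added first:

    #include "AMDGPU.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"

    // Minimal sketch, under the caveat above: running the annotation pass.
    void annotateKernelFeatures(llvm::Module &M) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createAMDGPUAnnotateKernelFeaturesPass());
      PM.run(M);
    }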