Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}
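
Because the user feature string FS is appended after all of the defaults above, a later entry overrides an earlier one when the combined string is parsed (the in-code comment on enable-prt-strict-null relies on exactly this). A minimal standalone sketch of that last-wins behavior; the strings here are hypothetical, not taken from this report:

#include <iostream>
#include <map>
#include <sstream>
#include <string>

int main() {
  // Defaults first, user string last, mirroring FullFS += FS above.
  std::string FullFS = "+promote-alloca,+load-store-opt,+sram-ecc,+xnack,";
  FullFS += "-xnack"; // hypothetical user-supplied FS
  std::map<std::string, bool> Features;
  std::stringstream SS(FullFS);
  std::string Tok;
  while (std::getline(SS, Tok, ','))
    if (Tok.size() > 1)
      Features[Tok.substr(1)] = (Tok[0] == '+');
  std::cout << "xnack=" << Features["xnack"] << '\n'; // xnack=0: "-" won
}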

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
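
A hedged numeric walk-through of the formula above, with assumed machine values (64 KiB of LDS, 10 waves per EU, 40 work-groups per CU; none of these numbers come from the report):

#include <iostream>

int main() {
  unsigned LocalMemorySize = 65536, MaxWaves = 10, WorkGroupsPerCu = 40;
  unsigned NWaves = 4;
  // Largest LDS footprint per work-group that still allows 4 waves:
  // 65536 * 10 / 40 / 4 = 4096 bytes.
  std::cout << LocalMemorySize * MaxWaves / WorkGroupsPerCu / NWaves << '\n';
}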

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
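
This is the inverse of the previous computation: given an LDS footprint, estimate the achievable waves per EU. Continuing the same assumed numbers (Limit = 65536 * 10 / 40 = 16384):

#include <algorithm>
#include <iostream>

int main() {
  unsigned Limit = 16384, MaxWaves = 10;
  unsigned Bytes = 4096; // LDS used by one work-group
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); // 16384 / 4096 = 4
  NumWaves = std::max(std::min(NumWaves, MaxWaves), 1u); // clamp to [1, 10]
  std::cout << NumWaves << '\n'; // prints 4
}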

// Coverage note: the following overload shows zero executions in this report.
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}
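
Evaluating these defaults for an assumed wavefront size of 64 (a sketch; the actual value is subtarget-dependent):

#include <algorithm>
#include <iostream>
#include <utility>

int main() {
  unsigned WS = 64;
  auto Kernel = std::make_pair(WS * 2, std::max(WS * 4, 256u)); // {128, 256}
  auto Shader = std::make_pair(1u, WS);                         // {1, 64}
  auto Other = std::make_pair(1u, 16 * WS);                     // {1, 1024}
  std::cout << Kernel.first << '-' << Kernel.second << ' '
            << Shader.second << ' ' << Other.second << '\n';
}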

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
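
The "amdgpu-flat-work-group-size" function attribute carries a "min,max" pair, e.g. "amdgpu-flat-work-group-size"="128,256". A minimal standalone sketch of the fallback policy above, with hypothetical hardware bounds:

#include <utility>

using Range = std::pair<unsigned, unsigned>;

// Accept Requested only if it is well-formed and within [HWMin, HWMax];
// otherwise fall back to the calling-convention default.
static Range pickFlatWorkGroupSize(Range Default, Range Requested,
                                   unsigned HWMin, unsigned HWMax) {
  if (Requested.first > Requested.second)
    return Default;
  if (Requested.first < HWMin || Requested.second > HWMax)
    return Default;
  return Requested;
}

int main() {
  Range Default{128, 256};
  Range R = pickFlatWorkGroupSize(Default, {300, 200}, 1, 1024);
  return R == Default ? 0 : 1; // malformed pair falls back to the default
}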

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
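
As a concrete illustration of the [Lo, Hi) adjustment, assume a kernel with reqd_work_group_size = {256, 1, 1} and a query in dimension x (the numbers are illustrative):

int main() {
  unsigned MinSize = 256, MaxSize = 256; // narrowed by the metadata
  bool IdQuery = true;                   // e.g. llvm.amdgcn.workitem.id.x
  if (IdQuery)
    MinSize = 0; // IDs get range [0, 256): values 0..255
  else
    ++MaxSize;   // sizes get range [256, 257): exactly 256
  return MaxSize - MinSize == 256 ? 0 : 1;
}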

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}
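
A hedged walk-through of the accumulation for a hypothetical kernel signature (i32, double), assuming the usual ABI alignments of 4 and 8 from the DataLayout:

#include <algorithm>
#include <cstdint>

// Local stand-in for llvm::alignTo, just to keep the sketch self-contained.
static uint64_t alignUp(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }

int main() {
  uint64_t Bytes = 0;
  unsigned MaxAlign = 1;
  Bytes = alignUp(Bytes, 4) + 4; // i32:    0 -> 4
  MaxAlign = std::max(MaxAlign, 4u);
  Bytes = alignUp(Bytes, 8) + 8; // double: align 4 up to 8, then 8 -> 16
  MaxAlign = std::max(MaxAlign, 8u);
  // getExplicitKernArgSize would return 16 with MaxAlign == 8.
  return (Bytes == 16 && MaxAlign == 8) ? 0 : 1;
}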

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
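
Continuing that example (16 explicit bytes) with assumed values of 56 implicit argument bytes and an 8-byte implicit-arg alignment:

#include <cstdint>

static uint64_t alignUp(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }

int main() {
  uint64_t ExplicitArgBytes = 16, ImplicitBytes = 56, Alignment = 8;
  uint64_t TotalSize = alignUp(ExplicitArgBytes, Alignment) + ImplicitBytes;
  // 16 is already 8-byte aligned, so TotalSize = 16 + 56 = 72, and 72 is
  // already a multiple of 4, so the segment size comes out as 72.
  return alignUp(TotalSize, 4) == 72 ? 0 : 1;
}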

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}
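
A hedged spot-check of the VI+ branch of this ladder, restated as a standalone pure function (thresholds copied from the code above):

#include <cassert>

static unsigned wavesForSGPRsVIPlus(unsigned SGPRs) {
  return SGPRs <= 80 ? 10 : SGPRs <= 88 ? 9 : SGPRs <= 100 ? 8 : 7;
}

int main() {
  assert(wavesForSGPRsVIPlus(84) == 9);  // 80 < 84 <= 88
  assert(wavesForSGPRsVIPlus(101) == 7); // past every threshold
  return 0;
}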

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(getTotalNumVGPRs() / RoundedRegs, MaxWaves);
}
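
Worked numbers for the granule rounding (all values assumed: allocation granule 4, 256 VGPRs total, at most 10 waves per EU):

#include <algorithm>

int main() {
  unsigned VGPRs = 70, Granule = 4, TotalVGPRs = 256, MaxWaves = 10;
  unsigned Rounded = ((VGPRs + Granule - 1) / Granule) * Granule; // 72
  unsigned Waves = std::min(TotalVGPRs / Rounded, MaxWaves);      // 256/72 = 3
  return Waves == 3 ? 0 : 1;
}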

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU instructions rather than
    // VALU is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}