Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2
//
3
//                     The LLVM Compiler Infrastructure
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
/// \file
11
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12
//
13
//===----------------------------------------------------------------------===//
14
15
#include "AMDGPUSubtarget.h"
16
#include "AMDGPU.h"
17
#include "AMDGPUTargetMachine.h"
18
#include "AMDGPUCallLowering.h"
19
#include "AMDGPUInstructionSelector.h"
20
#include "AMDGPULegalizerInfo.h"
21
#include "AMDGPURegisterBankInfo.h"
22
#include "SIMachineFunctionInfo.h"
23
#include "llvm/ADT/SmallString.h"
24
#include "llvm/CodeGen/MachineScheduler.h"
25
#include "llvm/IR/MDBuilder.h"
26
#include "llvm/Target/TargetFrameLowering.h"
27
#include <algorithm>
28
29
using namespace llvm;
30
31
#define DEBUG_TYPE "amdgpu-subtarget"
32
33
#define GET_SUBTARGETINFO_TARGET_DESC
34
#define GET_SUBTARGETINFO_CTOR
35
#include "AMDGPUGenSubtargetInfo.inc"
36
37
2.05k
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
38
39
AMDGPUSubtarget &
40
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
41
2.06k
                                                 StringRef GPU, StringRef FS) {
42
2.06k
  // Determine default and user-specified characteristics
43
2.06k
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
44
2.06k
  // enabled, but some instructions do not respect them and they run at the
45
2.06k
  // double precision rate, so don't enable by default.
46
2.06k
  //
47
2.06k
  // We want to be able to turn these off, but making this a subtarget feature
48
2.06k
  // for SI has the unhelpful behavior that it unsets everything else if you
49
2.06k
  // disable it.
50
2.06k
51
2.06k
  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
52
2.06k
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
53
350
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";
54
2.06k
55
2.06k
  FullFS += FS;
56
2.06k
57
2.06k
  ParseSubtargetFeatures(GPU, FullFS);
58
2.06k
59
2.06k
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
60
2.06k
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
61
2.06k
  // variants of MUBUF instructions.
62
2.06k
  if (
!hasAddr64() && 2.06k
!FS.contains("flat-for-global")783
) {
63
447
    FlatForGlobal = true;
64
447
  }
65
2.06k
66
2.06k
  // FIXME: I don't think Evergreen has any useful support for
67
2.06k
  // denormals, but should be checked. Should we issue a warning somewhere
68
2.06k
  // if someone tries to enable these?
69
2.06k
  if (
getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS2.06k
) {
70
271
    FP64FP16Denormals = false;
71
271
    FP32Denormals = false;
72
271
  }
73
2.06k
74
2.06k
  // Set defaults if needed.
75
2.06k
  if (MaxPrivateElementSize == 0)
76
2.04k
    MaxPrivateElementSize = 4;
77
2.06k
78
2.06k
  if (LDSBankCount == 0)
79
863
    LDSBankCount = 32;
80
2.06k
81
2.06k
  if (
TT.getArch() == Triple::amdgcn2.06k
) {
82
1.79k
    if (LocalMemorySize == 0)
83
592
      LocalMemorySize = 32768;
84
1.79k
85
1.79k
    // Do something sensible for unspecified target.
86
1.79k
    if (
!HasMovrel && 1.79k
!HasVGPRIndexMode724
)
87
592
      HasMovrel = true;
88
1.79k
  }
89
2.06k
90
2.06k
  return *this;
91
2.06k
}
92
93
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
94
                                 const TargetMachine &TM)
95
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
96
    TargetTriple(TT),
97
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
98
    IsaVersion(ISAVersion0_0_0),
99
    WavefrontSize(64),
100
    LocalMemorySize(0),
101
    LDSBankCount(0),
102
    MaxPrivateElementSize(0),
103
104
    FastFMAF32(false),
105
    HalfRate64Ops(false),
106
107
    FP32Denormals(false),
108
    FP64FP16Denormals(false),
109
    FPExceptions(false),
110
    DX10Clamp(false),
111
    FlatForGlobal(false),
112
    AutoWaitcntBeforeBarrier(false),
113
    UnalignedScratchAccess(false),
114
    UnalignedBufferAccess(false),
115
116
    HasApertureRegs(false),
117
    EnableXNACK(false),
118
    TrapHandler(false),
119
    DebuggerInsertNops(false),
120
    DebuggerReserveRegs(false),
121
    DebuggerEmitPrologue(false),
122
123
    EnableVGPRSpilling(false),
124
    EnablePromoteAlloca(false),
125
    EnableLoadStoreOpt(false),
126
    EnableUnsafeDSOffsetFolding(false),
127
    EnableSIScheduler(false),
128
    DumpCode(false),
129
130
    FP64(false),
131
    IsGCN(false),
132
    GCN3Encoding(false),
133
    CIInsts(false),
134
    GFX9Insts(false),
135
    SGPRInitBug(false),
136
    HasSMemRealTime(false),
137
    Has16BitInsts(false),
138
    HasIntClamp(false),
139
    HasVOP3PInsts(false),
140
    HasMovrel(false),
141
    HasVGPRIndexMode(false),
142
    HasScalarStores(false),
143
    HasInv2PiInlineImm(false),
144
    HasSDWA(false),
145
    HasSDWAOmod(false),
146
    HasSDWAScalar(false),
147
    HasSDWASdst(false),
148
    HasSDWAMac(false),
149
    HasSDWAOutModsVOPC(false),
150
    HasDPP(false),
151
    FlatAddressSpace(false),
152
    FlatInstOffsets(false),
153
    FlatGlobalInsts(false),
154
    FlatScratchInsts(false),
155
    AddNoCarryInsts(false),
156
157
    R600ALUInst(false),
158
    CaymanISA(false),
159
    CFALUBug(false),
160
    HasVertexCache(false),
161
    TexVTXClauseSize(0),
162
    ScalarizeGlobal(false),
163
164
    FeatureDisable(false),
165
2.06k
    InstrItins(getInstrItineraryForCPU(GPU)) {
166
2.06k
  AS = AMDGPU::getAMDGPUAS(TT);
167
2.06k
  initializeSubtargetDependencies(TT, GPU, FS);
168
2.06k
}
169
170
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
171
15.2k
  const Function &F) const {
172
15.2k
  if (NWaves == 1)
173
26
    return getLocalMemorySize();
174
15.2k
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
175
15.2k
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
176
15.2k
  unsigned MaxWaves = getMaxWavesPerEU();
177
15.2k
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
178
15.2k
}
179
180
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
181
125k
  const Function &F) const {
182
125k
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
183
125k
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
184
125k
  unsigned MaxWaves = getMaxWavesPerEU();
185
125k
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
186
125k
  unsigned NumWaves = Limit / (Bytes ? 
Bytes2.15k
:
1u123k
);
187
125k
  NumWaves = std::min(NumWaves, MaxWaves);
188
125k
  NumWaves = std::max(NumWaves, 1u);
189
125k
  return NumWaves;
190
125k
}
191
192
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
193
190k
  const Function &F) const {
194
190k
  // Default minimum/maximum flat work group sizes.
195
190k
  std::pair<unsigned, unsigned> Default =
196
190k
    AMDGPU::isCompute(F.getCallingConv()) ?
197
182k
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
198
182k
                                    getWavefrontSize() * 4) :
199
7.56k
      std::pair<unsigned, unsigned>(1, getWavefrontSize());
200
190k
201
190k
  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
202
190k
  // starts using "amdgpu-flat-work-group-size" attribute.
203
190k
  Default.second = AMDGPU::getIntegerAttribute(
204
190k
    F, "amdgpu-max-work-group-size", Default.second);
205
190k
  Default.first = std::min(Default.first, Default.second);
206
190k
207
190k
  // Requested minimum/maximum flat work group sizes.
208
190k
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
209
190k
    F, "amdgpu-flat-work-group-size", Default);
210
190k
211
190k
  // Make sure requested minimum is less than requested maximum.
212
190k
  if (Requested.first > Requested.second)
213
0
    return Default;
214
190k
215
190k
  // Make sure requested values do not violate subtarget's specifications.
216
190k
  
if (190k
Requested.first < getMinFlatWorkGroupSize()190k
)
217
0
    return Default;
218
190k
  
if (190k
Requested.second > getMaxFlatWorkGroupSize()190k
)
219
0
    return Default;
220
190k
221
190k
  return Requested;
222
190k
}
223
224
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
225
30.6k
  const Function &F) const {
226
30.6k
  // Default minimum/maximum number of waves per execution unit.
227
30.6k
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
228
30.6k
229
30.6k
  // Default/requested minimum/maximum flat work group sizes.
230
30.6k
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
231
30.6k
232
30.6k
  // If minimum/maximum flat work group sizes were explicitly requested using
233
30.6k
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
234
30.6k
  // number of waves per execution unit to values implied by requested
235
30.6k
  // minimum/maximum flat work group sizes.
236
30.6k
  unsigned MinImpliedByFlatWorkGroupSize =
237
30.6k
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
238
30.6k
  bool RequestedFlatWorkGroupSize = false;
239
30.6k
240
30.6k
  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
241
30.6k
  // starts using "amdgpu-flat-work-group-size" attribute.
242
30.6k
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
243
30.6k
      
F.hasFnAttribute("amdgpu-flat-work-group-size")30.5k
) {
244
103
    Default.first = MinImpliedByFlatWorkGroupSize;
245
103
    RequestedFlatWorkGroupSize = true;
246
103
  }
247
30.6k
248
30.6k
  // Requested minimum/maximum number of waves per execution unit.
249
30.6k
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
250
30.6k
    F, "amdgpu-waves-per-eu", Default, true);
251
30.6k
252
30.6k
  // Make sure requested minimum is less than requested maximum.
253
30.6k
  if (
Requested.second && 30.6k
Requested.first > Requested.second30.6k
)
254
0
    return Default;
255
30.6k
256
30.6k
  // Make sure requested values do not violate subtarget's specifications.
257
30.6k
  
if (30.6k
Requested.first < getMinWavesPerEU() ||
258
30.6k
      Requested.first > getMaxWavesPerEU())
259
0
    return Default;
260
30.6k
  
if (30.6k
Requested.second > getMaxWavesPerEU()30.6k
)
261
0
    return Default;
262
30.6k
263
30.6k
  // Make sure requested values are compatible with values implied by requested
264
30.6k
  // minimum/maximum flat work group sizes.
265
30.6k
  
if (30.6k
RequestedFlatWorkGroupSize &&
266
103
      Requested.first < MinImpliedByFlatWorkGroupSize)
267
2
    return Default;
268
30.6k
269
30.6k
  return Requested;
270
30.6k
}
271
272
3.25k
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
273
3.25k
  Function *Kernel = I->getParent()->getParent();
274
3.25k
  unsigned MinSize = 0;
275
3.25k
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
276
3.25k
  bool IdQuery = false;
277
3.25k
278
3.25k
  // If reqd_work_group_size is present it narrows value down.
279
3.25k
  if (auto *
CI3.25k
= dyn_cast<CallInst>(I)) {
280
3.21k
    const Function *F = CI->getCalledFunction();
281
3.21k
    if (
F3.21k
) {
282
3.21k
      unsigned Dim = UINT_MAX;
283
3.21k
      switch (F->getIntrinsicID()) {
284
2.71k
      case Intrinsic::amdgcn_workitem_id_x:
285
2.71k
      case Intrinsic::r600_read_tidig_x:
286
2.71k
        IdQuery = true;
287
2.71k
        LLVM_FALLTHROUGH;
288
2.73k
      case Intrinsic::r600_read_local_size_x:
289
2.73k
        Dim = 0;
290
2.73k
        break;
291
171
      case Intrinsic::amdgcn_workitem_id_y:
292
171
      case Intrinsic::r600_read_tidig_y:
293
171
        IdQuery = true;
294
171
        LLVM_FALLTHROUGH;
295
257
      case Intrinsic::r600_read_local_size_y:
296
257
        Dim = 1;
297
257
        break;
298
132
      case Intrinsic::amdgcn_workitem_id_z:
299
132
      case Intrinsic::r600_read_tidig_z:
300
132
        IdQuery = true;
301
132
        LLVM_FALLTHROUGH;
302
218
      case Intrinsic::r600_read_local_size_z:
303
218
        Dim = 2;
304
218
        break;
305
0
      default:
306
0
        break;
307
3.21k
      }
308
3.21k
      
if (3.21k
Dim <= 33.21k
) {
309
3.21k
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
310
6
          
if (6
Node->getNumOperands() == 36
)
311
6
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
312
6
                                  Node->getOperand(Dim))->getZExtValue();
313
3.21k
      }
314
3.21k
    }
315
3.21k
  }
316
3.25k
317
3.25k
  
if (3.25k
!MaxSize3.25k
)
318
0
    return false;
319
3.25k
320
3.25k
  // Range metadata is [Lo, Hi). For ID query we need to pass max size
321
3.25k
  // as Hi. For size query we need to pass Hi + 1.
322
3.25k
  
if (3.25k
IdQuery3.25k
)
323
3.02k
    MinSize = 0;
324
3.25k
  else
325
234
    ++MaxSize;
326
3.25k
327
3.25k
  MDBuilder MDB(I->getContext());
328
3.25k
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
329
3.25k
                                                  APInt(32, MaxSize));
330
3.25k
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
331
3.25k
  return true;
332
3.25k
}
333
334
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
335
                             const TargetMachine &TM) :
336
  AMDGPUSubtarget(TT, GPU, FS, TM),
337
  InstrInfo(*this),
338
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
339
253
  TLInfo(TM, *this) {}
340
341
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
342
                         const TargetMachine &TM)
343
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
344
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
345
1.81k
      TLInfo(TM, *this) {
346
1.81k
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
347
1.81k
  Legalizer.reset(new AMDGPULegalizerInfo());
348
1.81k
349
1.81k
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
350
1.81k
  InstSelector.reset(new AMDGPUInstructionSelector(
351
1.81k
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
352
1.81k
}
353
354
void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
355
35.9k
                                      unsigned NumRegionInstrs) const {
356
35.9k
  // Track register pressure so the scheduler can try to decrease
357
35.9k
  // pressure once register usage is above the threshold defined by
358
35.9k
  // SIRegisterInfo::getRegPressureSetLimit()
359
35.9k
  Policy.ShouldTrackPressure = true;
360
35.9k
361
35.9k
  // Enabling both top down and bottom up scheduling seems to give us less
362
35.9k
  // register spills than just using one of these approaches on its own.
363
35.9k
  Policy.OnlyTopDown = false;
364
35.9k
  Policy.OnlyBottomUp = false;
365
35.9k
366
35.9k
  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
367
35.9k
  if (!enableSIScheduler())
368
35.9k
    Policy.ShouldTrackLaneMasks = true;
369
35.9k
}
370
371
17.9k
bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
372
17.5k
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
373
17.9k
}
374
375
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
376
1.74k
                                            unsigned ExplicitArgBytes) const {
377
1.74k
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
378
1.74k
  if (ImplicitBytes == 0)
379
1.65k
    return ExplicitArgBytes;
380
95
381
95
  unsigned Alignment = getAlignmentForImplicitArgPtr();
382
95
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
383
95
}
384
385
981
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
386
981
  if (
getGeneration() >= SISubtarget::VOLCANIC_ISLANDS981
) {
387
495
    if (SGPRs <= 80)
388
412
      return 10;
389
83
    
if (83
SGPRs <= 8883
)
390
0
      return 9;
391
83
    
if (83
SGPRs <= 10083
)
392
11
      return 8;
393
72
    return 7;
394
72
  }
395
486
  
if (486
SGPRs <= 48486
)
396
358
    return 10;
397
128
  
if (128
SGPRs <= 56128
)
398
10
    return 9;
399
118
  
if (118
SGPRs <= 64118
)
400
4
    return 8;
401
114
  
if (114
SGPRs <= 72114
)
402
29
    return 7;
403
85
  
if (85
SGPRs <= 8085
)
404
0
    return 6;
405
85
  return 5;
406
85
}
407
408
981
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
409
981
  if (VGPRs <= 24)
410
118
    return 10;
411
863
  
if (863
VGPRs <= 28863
)
412
22
    return 9;
413
841
  
if (841
VGPRs <= 32841
)
414
217
    return 8;
415
624
  
if (624
VGPRs <= 36624
)
416
271
    return 7;
417
353
  
if (353
VGPRs <= 40353
)
418
35
    return 6;
419
318
  
if (318
VGPRs <= 48318
)
420
16
    return 5;
421
302
  
if (302
VGPRs <= 64302
)
422
37
    return 4;
423
265
  
if (265
VGPRs <= 84265
)
424
103
    return 3;
425
162
  
if (162
VGPRs <= 128162
)
426
26
    return 2;
427
136
  return 1;
428
136
}
429
430
107k
unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
431
107k
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
432
107k
  if (
MFI.hasFlatScratchInit()107k
) {
433
2.77k
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
434
1.59k
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
435
1.17k
    
if (1.17k
getGeneration() == AMDGPUSubtarget::SEA_ISLANDS1.17k
)
436
910
      return 4; // FLAT_SCRATCH, VCC (in that order).
437
104k
  }
438
104k
439
104k
  
if (104k
isXNACKEnabled()104k
)
440
4.69k
    return 4; // XNACK, VCC (in that order).
441
100k
  return 2; // VCC.
442
100k
}
443
444
107k
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
445
107k
  const Function &F = *MF.getFunction();
446
107k
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
447
107k
448
107k
  // Compute maximum number of SGPRs function can use using default/requested
449
107k
  // minimum number of waves per execution unit.
450
107k
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
451
107k
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
452
107k
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
453
107k
454
107k
  // Check if maximum number of SGPRs was explicitly requested using
455
107k
  // "amdgpu-num-sgpr" attribute.
456
107k
  if (
F.hasFnAttribute("amdgpu-num-sgpr")107k
) {
457
40
    unsigned Requested = AMDGPU::getIntegerAttribute(
458
40
      F, "amdgpu-num-sgpr", MaxNumSGPRs);
459
40
460
40
    // Make sure requested value does not violate subtarget's specifications.
461
40
    if (
Requested && 40
(Requested <= getReservedNumSGPRs(MF))40
)
462
0
      Requested = 0;
463
40
464
40
    // If more SGPRs are required to support the input user/system SGPRs,
465
40
    // increase to accommodate them.
466
40
    //
467
40
    // FIXME: This really ends up using the requested number of SGPRs + number
468
40
    // of reserved special registers in total. Theoretically you could re-use
469
40
    // the last input registers for these special registers, but this would
470
40
    // require a lot of complexity to deal with the weird aliasing.
471
40
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
472
40
    if (
Requested && 40
Requested < InputNumSGPRs40
)
473
0
      Requested = InputNumSGPRs;
474
40
475
40
    // Make sure requested value is compatible with values implied by
476
40
    // default/requested minimum/maximum number of waves per execution unit.
477
40
    if (
Requested && 40
Requested > getMaxNumSGPRs(WavesPerEU.first, false)40
)
478
0
      Requested = 0;
479
40
    if (WavesPerEU.second &&
480
40
        
Requested40
&&
Requested < getMinNumSGPRs(WavesPerEU.second)40
)
481
0
      Requested = 0;
482
40
483
40
    if (Requested)
484
40
      MaxNumSGPRs = Requested;
485
40
  }
486
107k
487
107k
  if (hasSGPRInitBug())
488
29.8k
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
489
107k
490
107k
  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
491
107k
                  MaxAddressableNumSGPRs);
492
107k
}
493
494
77.4k
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
495
77.4k
  const Function &F = *MF.getFunction();
496
77.4k
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
497
77.4k
498
77.4k
  // Compute maximum number of VGPRs function can use using default/requested
499
77.4k
  // minimum number of waves per execution unit.
500
77.4k
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
501
77.4k
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
502
77.4k
503
77.4k
  // Check if maximum number of VGPRs was explicitly requested using
504
77.4k
  // "amdgpu-num-vgpr" attribute.
505
77.4k
  if (
F.hasFnAttribute("amdgpu-num-vgpr")77.4k
) {
506
6
    unsigned Requested = AMDGPU::getIntegerAttribute(
507
6
      F, "amdgpu-num-vgpr", MaxNumVGPRs);
508
6
509
6
    // Make sure requested value does not violate subtarget's specifications.
510
6
    if (
Requested && 6
Requested <= getReservedNumVGPRs(MF)6
)
511
0
      Requested = 0;
512
6
513
6
    // Make sure requested value is compatible with values implied by
514
6
    // default/requested minimum/maximum number of waves per execution unit.
515
6
    if (
Requested && 6
Requested > getMaxNumVGPRs(WavesPerEU.first)6
)
516
0
      Requested = 0;
517
6
    if (WavesPerEU.second &&
518
6
        
Requested6
&&
Requested < getMinNumVGPRs(WavesPerEU.second)6
)
519
0
      Requested = 0;
520
6
521
6
    if (Requested)
522
6
      MaxNumVGPRs = Requested;
523
6
  }
524
77.4k
525
77.4k
  return MaxNumVGPRs - getReservedNumVGPRs(MF);
526
77.4k
}
527
528
struct MemOpClusterMutation : ScheduleDAGMutation {
529
  const SIInstrInfo *TII;
530
531
11.4k
  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
532
533
26.3k
  void apply(ScheduleDAGInstrs *DAGInstrs) override {
534
26.3k
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
535
26.3k
536
26.3k
    SUnit *SUa = nullptr;
537
26.3k
    // Search for two consecutive memory operations and link them
538
26.3k
    // to prevent scheduler from moving them apart.
539
26.3k
    // In DAG pre-process SUnits are in the original order of
540
26.3k
    // the instructions before scheduling.
541
176k
    for (SUnit &SU : DAG->SUnits) {
542
176k
      MachineInstr &MI2 = *SU.getInstr();
543
176k
      if (
!MI2.mayLoad() && 176k
!MI2.mayStore()139k
) {
544
120k
        SUa = nullptr;
545
120k
        continue;
546
120k
      }
547
55.4k
      
if (55.4k
!SUa55.4k
) {
548
29.5k
        SUa = &SU;
549
29.5k
        continue;
550
29.5k
      }
551
25.9k
552
25.9k
      MachineInstr &MI1 = *SUa->getInstr();
553
25.9k
      if (
(TII->isVMEM(MI1) && 25.9k
TII->isVMEM(MI2)10.3k
) ||
554
16.0k
          
(TII->isFLAT(MI1) && 16.0k
TII->isFLAT(MI2)2.59k
) ||
555
13.7k
          
(TII->isSMRD(MI1) && 13.7k
TII->isSMRD(MI2)11.3k
) ||
556
25.9k
          
(TII->isDS(MI1) && 2.72k
TII->isDS(MI2)1.39k
)) {
557
24.4k
        SU.addPredBarrier(SUa);
558
24.4k
559
220k
        for (const SDep &SI : SU.Preds) {
560
220k
          if (SI.getSUnit() != SUa)
561
181k
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
562
220k
        }
563
24.4k
564
24.4k
        if (
&SU != &DAG->ExitSU24.4k
) {
565
239k
          for (const SDep &SI : SUa->Succs) {
566
239k
            if (SI.getSUnit() != &SU)
567
201k
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
568
239k
          }
569
24.4k
        }
570
24.4k
      }
571
176k
572
176k
      SUa = &SU;
573
176k
    }
574
26.3k
  }
575
};
576
577
void SISubtarget::getPostRAMutations(
578
11.4k
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
579
11.4k
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
580
11.4k
}