Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Line|Count|Source
1||//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2||//
3||//                     The LLVM Compiler Infrastructure
4||//
5||// This file is distributed under the University of Illinois Open Source
6||// License. See LICENSE.TXT for details.
7||//
8||//===----------------------------------------------------------------------===//
9||//
10||/// \file
11||/// \brief SI implementation of the TargetRegisterInfo class.
12||//
13||//===----------------------------------------------------------------------===//
14||
15||#include "SIRegisterInfo.h"
16||#include "AMDGPUSubtarget.h"
17||#include "SIInstrInfo.h"
18||#include "SIMachineFunctionInfo.h"
19||#include "llvm/CodeGen/MachineFrameInfo.h"
20||#include "llvm/CodeGen/MachineInstrBuilder.h"
21||#include "llvm/CodeGen/RegisterScavenging.h"
22||#include "llvm/IR/Function.h"
23||#include "llvm/IR/LLVMContext.h"
24||
25||using namespace llvm;
26||
27|97.9k|static bool hasPressureSet(const int *PSets, unsigned PSetID) {
28|322k|  for (unsigned i = 0; PSets[i] != -1; ++i) {
29|234k|    if (PSets[i] == (int)PSetID)
30|9.07k|      return true;
31|234k|  }
32|88.8k|  return false;
33|97.9k|}
34||
35||void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
36|97.9k|                                         BitVector &PressureSets) const {
37|186k|  for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
38|97.9k|    const int *PSets = getRegUnitPressureSets(*U);
39|97.9k|    if (hasPressureSet(PSets, PSetID)) {
40|9.07k|      PressureSets.set(PSetID);
41|9.07k|      break;
42|9.07k|    }
43|97.9k|  }
44|97.9k|}
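
The two helpers above classify a register into pressure sets by walking a -1-terminated list of set IDs for each of its register units. A minimal standalone sketch of that scan, in plain C++ with a hypothetical RegUnitPSets array standing in for the data returned by getRegUnitPressureSets():

#include <cassert>

// Same -1-terminated scan as hasPressureSet() above, outside of LLVM.
static bool containsPressureSet(const int *PSets, int PSetID) {
  for (unsigned i = 0; PSets[i] != -1; ++i) {
    if (PSets[i] == PSetID)
      return true;
  }
  return false;
}

int main() {
  const int RegUnitPSets[] = {3, 7, 12, -1}; // hypothetical pressure sets for one register unit
  assert(containsPressureSet(RegUnitPSets, 7));
  assert(!containsPressureSet(RegUnitPSets, 5));
  return 0;
}
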
45||
46||static cl::opt<bool> EnableSpillSGPRToSMEM(
47||  "amdgpu-spill-sgpr-to-smem",
48||  cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
49||  cl::init(false));
50||
51||static cl::opt<bool> EnableSpillSGPRToVGPR(
52||  "amdgpu-spill-sgpr-to-vgpr",
53||  cl::desc("Enable spilling VGPRs to SGPRs"),
54||  cl::ReallyHidden,
55||  cl::init(true));
56||
57||SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) :
58||  AMDGPURegisterInfo(),
59||  SGPRPressureSets(getNumRegPressureSets()),
60||  VGPRPressureSets(getNumRegPressureSets()),
61||  SpillSGPRToVGPR(false),
62|1.81k|  SpillSGPRToSMEM(false) {
63|1.81k|  if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
64|5|    SpillSGPRToSMEM = true;
65|1.80k|  else if (EnableSpillSGPRToVGPR)
66|1.80k|    SpillSGPRToVGPR = true;
67|1.81k|
68|1.81k|  unsigned NumRegPressureSets = getNumRegPressureSets();
69|1.81k|
70|1.81k|  SGPRSetID = NumRegPressureSets;
71|1.81k|  VGPRSetID = NumRegPressureSets;
72|1.81k|
73|50.7k|  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
74|48.9k|    classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
75|48.9k|    classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
76|48.9k|  }
77|1.81k|
78|1.81k|  // Determine the number of reg units for each pressure set.
79|1.81k|  std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0);
80|3.15M|  for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
81|3.15M|    const int *PSets = getRegUnitPressureSets(i);
82|10.3M|    for (unsigned j = 0; PSets[j] != -1; ++j) {
83|7.21M|      ++PressureSetRegUnits[PSets[j]];
84|7.21M|    }
85|3.15M|  }
86|1.81k|
87|1.81k|  unsigned VGPRMax = 0, SGPRMax = 0;
88|50.7k|  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
89|48.9k|    if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
90|1.81k|      VGPRSetID = i;
91|1.81k|      VGPRMax = PressureSetRegUnits[i];
92|1.81k|      continue;
93|1.81k|    }
94|47.1k|    if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
95|7.25k|      SGPRSetID = i;
96|7.25k|      SGPRMax = PressureSetRegUnits[i];
97|7.25k|    }
98|48.9k|  }
99|1.81k|
100|1.81k|  assert(SGPRSetID < NumRegPressureSets &&
101|1.81k|         VGPRSetID < NumRegPressureSets);
102|1.81k|}
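
The constructor above then picks, among the pressure sets that apply to SGPR0 or VGPR0, the one covering the most register units as SGPRSetID/VGPRSetID. A standalone sketch of that selection under assumed data (the Sets table below is hypothetical, not real AMDGPU pressure-set information):

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical data: per pressure set, whether it applies to the register kind
  // being classified and how many register units it covers.
  struct PSet { bool AppliesToKind; unsigned NumRegUnits; };
  std::vector<PSet> Sets = {{true, 10}, {false, 400}, {true, 102}, {true, 24}};

  unsigned BestID = Sets.size(); // "invalid" sentinel, like NumRegPressureSets above
  unsigned BestUnits = 0;
  for (unsigned i = 0; i < Sets.size(); ++i) {
    if (Sets[i].AppliesToKind && Sets[i].NumRegUnits > BestUnits) {
      BestID = i;
      BestUnits = Sets[i].NumRegUnits;
    }
  }
  std::printf("chosen set %u with %u reg units\n", BestID, BestUnits); // chosen set 2 with 102 reg units
  return 0;
}
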
103||
104|713k|void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
105|713k|  MCRegAliasIterator R(Reg, this, true);
106|713k|
107|4.69M|  for (; R.isValid(); ++R)
108|3.98M|    Reserved.set(*R);
109|713k|}
110||
111||unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
112|14.4k|  const MachineFunction &MF) const {
113|14.4k|
114|14.4k|  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
115|14.4k|  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
116|14.4k|  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
117|14.4k|  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
118|14.4k|}
119||
120|14.4k|static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
121|14.4k|  unsigned Reg;
122|14.4k|
123|14.4k|  // Try to place it in a hole after PrivateSegmentBufferReg.
124|14.4k|  if (RegCount & 3) {
125|14.2k|    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
126|14.2k|    // alignment constraints, so we have a hole where can put the wave offset.
127|14.2k|    Reg = RegCount - 1;
128|14.4k|  } else {
129|173|    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
130|173|    // wave offset before it.
131|173|    Reg = RegCount - 5;
132|173|  }
133|14.4k|
134|14.4k|  return Reg;
135|14.4k|}
136||
137||unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
138|14.4k|  const MachineFunction &MF) const {
139|14.4k|  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
140|14.4k|  unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
141|14.4k|  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
142|14.4k|}
143||
144||unsigned SIRegisterInfo::reservedStackPtrOffsetReg(
145|1.19k|  const MachineFunction &MF) const {
146|1.19k|  return AMDGPU::SGPR32;
147|1.19k|}
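
findPrivateSegmentWaveByteOffsetRegIndex() above relies on the 4-register alignment of the 128-bit segment buffer. A small standalone illustration of that index arithmetic (plain C++; the register counts are made up):

#include <cassert>

// Same placement rule as the function above: when the SGPR count is not a multiple
// of 4, the wave offset fits in the alignment hole at the top; otherwise it goes
// just below the 4 registers reserved for the segment buffer.
static unsigned waveByteOffsetRegIndex(unsigned RegCount) {
  return (RegCount & 3) ? RegCount - 1 : RegCount - 5;
}

int main() {
  assert(waveByteOffsetRegIndex(102) == 101); // 102 % 4 != 0: use the hole at the top
  assert(waveByteOffsetRegIndex(96) == 91);   // 96 % 4 == 0: place it below the buffer's 4 SGPRs
  return 0;
}
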
148||
149|30.4k|BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
150|30.4k|  BitVector Reserved(getNumRegs());
151|30.4k|
152|30.4k|  // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
153|30.4k|  // this seems likely to result in bugs, so I'm marking them as reserved.
154|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
155|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
156|30.4k|
157|30.4k|  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
158|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::M0);
159|30.4k|
160|30.4k|  // Reserve the memory aperture registers.
161|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
162|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
163|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
164|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
165|30.4k|
166|30.4k|  // Reserve Trap Handler registers - support is not implemented in Codegen.
167|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::TBA);
168|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::TMA);
169|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
170|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
171|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
172|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
173|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
174|30.4k|  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
175|30.4k|
176|30.4k|  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
177|30.4k|
178|30.4k|  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
179|30.4k|  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
180|160k|  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
181|130k|    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
182|130k|    reserveRegisterTuples(Reserved, Reg);
183|130k|  }
184|30.4k|
185|30.4k|  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
186|30.4k|  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
187|35.4k|  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
188|5.04k|    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
189|5.04k|    reserveRegisterTuples(Reserved, Reg);
190|5.04k|  }
191|30.4k|
192|30.4k|  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
193|30.4k|
194|30.4k|  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
195|30.4k|  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
196|30.4k|    // Reserve 1 SGPR for scratch wave offset in case we need to spill.
197|30.4k|    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
198|30.4k|  }
199|30.4k|
200|30.4k|  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
201|30.4k|  if (ScratchRSrcReg != AMDGPU::NoRegister) {
202|30.4k|    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
203|30.4k|    // to spill.
204|30.4k|    // TODO: May need to reserve a VGPR if doing LDS spilling.
205|30.4k|    reserveRegisterTuples(Reserved, ScratchRSrcReg);
206|30.4k|    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
207|30.4k|  }
208|30.4k|
209|30.4k|  // We have to assume the SP is needed in case there are calls in the function,
210|30.4k|  // which is detected after the function is lowered. If we aren't really going
211|30.4k|  // to need SP, don't bother reserving it.
212|30.4k|  unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
213|30.4k|
214|30.4k|  if (StackPtrReg != AMDGPU::NoRegister) {
215|30.4k|    reserveRegisterTuples(Reserved, StackPtrReg);
216|30.4k|    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
217|30.4k|  }
218|30.4k|
219|30.4k|  unsigned FrameReg = MFI->getFrameOffsetReg();
220|30.4k|  if (FrameReg != AMDGPU::NoRegister) {
221|30.4k|    reserveRegisterTuples(Reserved, FrameReg);
222|30.4k|    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
223|30.4k|  }
224|30.4k|
225|30.4k|  return Reserved;
226|30.4k|}
227||
228|30.1k|bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
229|30.1k|  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
230|30.1k|  if (Info->isEntryFunction()) {
231|28.3k|    const MachineFrameInfo &MFI = Fn.getFrameInfo();
232|27.5k|    return MFI.hasStackObjects() || MFI.hasCalls();
233|28.3k|  }
234|1.73k|
235|1.73k|  // May need scavenger for dealing with callee saved registers.
236|1.73k|  return true;
237|1.73k|}
238||
239||bool SIRegisterInfo::requiresFrameIndexScavenging(
240|15.0k|  const MachineFunction &MF) const {
241|15.0k|  const MachineFrameInfo &MFI = MF.getFrameInfo();
242|15.0k|  if (MFI.hasStackObjects())
243|544|    return true;
244|14.5k|
245|14.5k|  // May need to deal with callee saved registers.
246|14.5k|  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
247|14.5k|  return !Info->isEntryFunction();
248|14.5k|}
249||
250||bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
251|14.7k|  const MachineFunction &MF) const {
252|14.7k|  // m0 is needed for the scalar store offset. m0 is unallocatable, so we can't
253|14.7k|  // create a virtual register for it during frame index elimination, so the
254|14.7k|  // scavenger is directly needed.
255|14.7k|  return MF.getFrameInfo().hasStackObjects() &&
256|544|         MF.getSubtarget<SISubtarget>().hasScalarStores() &&
257|259|         MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
258|14.7k|}
259||
260||bool SIRegisterInfo::requiresVirtualBaseRegisters(
261|15.0k|  const MachineFunction &) const {
262|15.0k|  // There are no special dedicated stack or frame pointers.
263|15.0k|  return true;
264|15.0k|}
265||
266|30.1k|bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
267|30.1k|  // This helps catch bugs as verifier errors.
268|30.1k|  return true;
269|30.1k|}
270||
271|4.40k|int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
272|4.40k|  assert(SIInstrInfo::isMUBUF(*MI));
273|4.40k|
274|4.40k|  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
275|4.40k|                                          AMDGPU::OpName::offset);
276|4.40k|  return MI->getOperand(OffIdx).getImm();
277|4.40k|}
278||
279||int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
280|4|                                                 int Idx) const {
281|4|  if (!SIInstrInfo::isMUBUF(*MI))
282|0|    return 0;
283|4|
284|4|  assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
285|4|                                           AMDGPU::OpName::vaddr) &&
286|4|         "Should never see frame index on non-address operand");
287|4|
288|4|  return getMUBUFInstrOffset(MI);
289|4|}
290||
291|4.74k|bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
292|4.74k|  if (!MI->mayLoadOrStore())
293|342|    return false;
294|4.40k|
295|4.40k|  int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);
296|4.40k|
297|4.40k|  return !isUInt<12>(FullOffset);
298|4.40k|}
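
needsFrameBaseReg() and the other offset checks in this file hinge on the MUBUF immediate offset fitting in 12 unsigned bits. A minimal standalone sketch of that legality test, assuming the 0..4095 range implied by isUInt<12>:

#include <cassert>
#include <cstdint>

// Offsets outside 0..4095 cannot be encoded as a MUBUF immediate and need a
// frame base register or an soffset adjustment instead.
static bool fitsInUnsigned12(int64_t Offset) {
  return Offset >= 0 && Offset < (int64_t(1) << 12);
}

int main() {
  assert(fitsInUnsigned12(4095));  // largest encodable immediate offset
  assert(!fitsInUnsigned12(4096)); // too big: a frame base register is required
  return 0;
}
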
299||
300||void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
301||                                                  unsigned BaseReg,
302||                                                  int FrameIdx,
303|0|                                                  int64_t Offset) const {
304|0|  MachineBasicBlock::iterator Ins = MBB->begin();
305|0|  DebugLoc DL; // Defaults to "unknown"
306|0|
307|0|  if (Ins != MBB->end())
308|0|    DL = Ins->getDebugLoc();
309|0|
310|0|  MachineFunction *MF = MBB->getParent();
311|0|  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
312|0|  const SIInstrInfo *TII = Subtarget.getInstrInfo();
313|0|
314|0|  if (Offset == 0) {
315|0|    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
316|0|      .addFrameIndex(FrameIdx);
317|0|    return;
318|0|  }
319|0|
320|0|  MachineRegisterInfo &MRI = MF->getRegInfo();
321|0|  unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
322|0|
323|0|  unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
324|0|
325|0|  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
326|0|    .addImm(Offset);
327|0|  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
328|0|    .addFrameIndex(FrameIdx);
329|0|
330|0|  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
331|0|    .addReg(OffsetReg, RegState::Kill)
332|0|    .addReg(FIReg);
333|0|}
334||
335||void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
336|0|                                       int64_t Offset) const {
337|0|
338|0|  MachineBasicBlock *MBB = MI.getParent();
339|0|  MachineFunction *MF = MBB->getParent();
340|0|  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
341|0|  const SIInstrInfo *TII = Subtarget.getInstrInfo();
342|0|
343||#ifndef NDEBUG
344||  // FIXME: Is it possible to be storing a frame index to itself?
345||  bool SeenFI = false;
346||  for (const MachineOperand &MO: MI.operands()) {
347||    if (MO.isFI()) {
348||      if (SeenFI)
349||        llvm_unreachable("should not see multiple frame indices");
350||
351||      SeenFI = true;
352||    }
353||  }
354||#endif
355||
356|0|  MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
357|0|  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
358|0|  assert(TII->isMUBUF(MI));
359|0|  assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
360|0|         MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
361|0|         "should only be seeing frame offset relative FrameIndex");
362|0|
363|0|
364|0|  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
365|0|  int64_t NewOffset = OffsetOp->getImm() + Offset;
366|0|  assert(isUInt<12>(NewOffset) && "offset should be legal");
367|0|
368|0|  FIOp->ChangeToRegister(BaseReg, false);
369|0|  OffsetOp->setImm(NewOffset);
370|0|}
371||
372||bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
373||                                        unsigned BaseReg,
374|0|                                        int64_t Offset) const {
375|0|  if (!SIInstrInfo::isMUBUF(*MI))
376|0|    return false;
377|0|
378|0|  int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);
379|0|
380|0|  return isUInt<12>(NewOffset);
381|0|}
382||
383||const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
384|0|  const MachineFunction &MF, unsigned Kind) const {
385|0|  // This is inaccurate. It depends on the instruction and address space. The
386|0|  // only place where we should hit this is for dealing with frame indexes /
387|0|  // private accesses, so this is correct in that case.
388|0|  return &AMDGPU::VGPR_32RegClass;
389|0|}
390||
391|1.16k|static unsigned getNumSubRegsForSpillOp(unsigned Op) {
392|1.16k|
393|1.16k|  switch (Op) {
394|0|  case AMDGPU::SI_SPILL_S512_SAVE:
395|0|  case AMDGPU::SI_SPILL_S512_RESTORE:
396|0|  case AMDGPU::SI_SPILL_V512_SAVE:
397|0|  case AMDGPU::SI_SPILL_V512_RESTORE:
398|0|    return 16;
399|0|  case AMDGPU::SI_SPILL_S256_SAVE:
400|0|  case AMDGPU::SI_SPILL_S256_RESTORE:
401|0|  case AMDGPU::SI_SPILL_V256_SAVE:
402|0|  case AMDGPU::SI_SPILL_V256_RESTORE:
403|0|    return 8;
404|657|  case AMDGPU::SI_SPILL_S128_SAVE:
405|657|  case AMDGPU::SI_SPILL_S128_RESTORE:
406|657|  case AMDGPU::SI_SPILL_V128_SAVE:
407|657|  case AMDGPU::SI_SPILL_V128_RESTORE:
408|657|    return 4;
409|0|  case AMDGPU::SI_SPILL_V96_SAVE:
410|0|  case AMDGPU::SI_SPILL_V96_RESTORE:
411|0|    return 3;
412|12|  case AMDGPU::SI_SPILL_S64_SAVE:
413|12|  case AMDGPU::SI_SPILL_S64_RESTORE:
414|12|  case AMDGPU::SI_SPILL_V64_SAVE:
415|12|  case AMDGPU::SI_SPILL_V64_RESTORE:
416|12|    return 2;
417|500|  case AMDGPU::SI_SPILL_S32_SAVE:
418|500|  case AMDGPU::SI_SPILL_S32_RESTORE:
419|500|  case AMDGPU::SI_SPILL_V32_SAVE:
420|500|  case AMDGPU::SI_SPILL_V32_RESTORE:
421|500|    return 1;
422|0|  default: llvm_unreachable("Invalid spill opcode");
423|0|  }
424|0|}
425||
426|3.05k|static int getOffsetMUBUFStore(unsigned Opc) {
427|3.05k|  switch (Opc) {
428|2.80k|  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
429|2.80k|    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
430|147|  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
431|147|    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
432|52|  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
433|52|    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
434|20|  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
435|20|    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
436|26|  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
437|26|    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
438|0|  default:
439|0|    return -1;
440|0|  }
441|0|}
442||
443|1.73k|static int getOffsetMUBUFLoad(unsigned Opc) {
444|1.73k|  switch (Opc) {
445|1.60k|  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
446|1.60k|    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
447|93|  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
448|93|    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
449|2|  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
450|2|    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
451|15|  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
452|15|    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
453|2|  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
454|2|    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
455|5|  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
456|5|    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
457|16|  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
458|16|    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
459|0|  default:
460|0|    return -1;
461|0|  }
462|0|}
463||
464||// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
465||// need to handle the case where an SGPR may need to be spilled while spilling.
466||static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
467||                                      MachineFrameInfo &MFI,
468||                                      MachineBasicBlock::iterator MI,
469||                                      int Index,
470|4.78k|                                      int64_t Offset) {
471|4.78k|  MachineBasicBlock *MBB = MI->getParent();
472|4.78k|  const DebugLoc &DL = MI->getDebugLoc();
473|4.78k|  bool IsStore = MI->mayStore();
474|4.78k|
475|4.78k|  unsigned Opc = MI->getOpcode();
476|4.78k|  int LoadStoreOp = IsStore ?
477|4.78k|    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
478|4.78k|  if (LoadStoreOp == -1)
479|0|    return false;
480|4.78k|
481|4.78k|  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
482|4.78k|  BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
483|4.78k|    .add(*Reg)
484|4.78k|    .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
485|4.78k|    .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
486|4.78k|    .addImm(Offset)
487|4.78k|    .addImm(0) // glc
488|4.78k|    .addImm(0) // slc
489|4.78k|    .addImm(0) // tfe
490|4.78k|    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
491|4.78k|  return true;
492|4.78k|}
493||
494||void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
495||                                         unsigned LoadStoreOp,
496||                                         int Index,
497||                                         unsigned ValueReg,
498||                                         bool IsKill,
499||                                         unsigned ScratchRsrcReg,
500||                                         unsigned ScratchOffsetReg,
501||                                         int64_t InstOffset,
502||                                         MachineMemOperand *MMO,
503|2.25k|                                         RegScavenger *RS) const {
504|2.25k|  MachineBasicBlock *MBB = MI->getParent();
505|2.25k|  MachineFunction *MF = MI->getParent()->getParent();
506|2.25k|  const SISubtarget &ST =  MF->getSubtarget<SISubtarget>();
507|2.25k|  const SIInstrInfo *TII = ST.getInstrInfo();
508|2.25k|  const MachineFrameInfo &MFI = MF->getFrameInfo();
509|2.25k|
510|2.25k|  const MCInstrDesc &Desc = TII->get(LoadStoreOp);
511|2.25k|  const DebugLoc &DL = MI->getDebugLoc();
512|2.25k|  bool IsStore = Desc.mayStore();
513|2.25k|
514|2.25k|  bool RanOutOfSGPRs = false;
515|2.25k|  bool Scavenged = false;
516|2.25k|  unsigned SOffset = ScratchOffsetReg;
517|2.25k|
518|2.25k|  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
519|2.25k|  unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32;
520|2.25k|  unsigned Size = NumSubRegs * 4;
521|2.25k|  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
522|2.25k|  const int64_t OriginalImmOffset = Offset;
523|2.25k|
524|2.25k|  unsigned Align = MFI.getObjectAlignment(Index);
525|2.25k|  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
526|2.25k|
527|2.25k|  if (!isUInt<12>(Offset + Size)) {
528|232|    SOffset = AMDGPU::NoRegister;
529|232|
530|232|    // We don't have access to the register scavenger if this function is called
531|232|    // during  PEI::scavengeFrameVirtualRegs().
532|232|    if (RS)
533|0|      SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);
534|232|
535|232|    if (SOffset == AMDGPU::NoRegister) {
536|232|      // There are no free SGPRs, and since we are in the process of spilling
537|232|      // VGPRs too.  Since we need a VGPR in order to spill SGPRs (this is true
538|232|      // on SI/CI and on VI it is true until we implement spilling using scalar
539|232|      // stores), we have no way to free up an SGPR.  Our solution here is to
540|232|      // add the offset directly to the ScratchOffset register, and then
541|232|      // subtract the offset after the spill to return ScratchOffset to it's
542|232|      // original value.
543|232|      RanOutOfSGPRs = true;
544|232|      SOffset = ScratchOffsetReg;
545|232|    } else {
546|0|      Scavenged = true;
547|0|    }
548|232|
549|232|    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
550|232|      .addReg(ScratchOffsetReg)
551|232|      .addImm(Offset);
552|232|
553|232|    Offset = 0;
554|232|  }
555|2.25k|
556|2.25k|  const unsigned EltSize = 4;
557|2.25k|
558|8.47k|  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
559|6.22k|    unsigned SubReg = NumSubRegs == 1 ?
560|6.22k|      ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
561|6.22k|
562|6.22k|    unsigned SOffsetRegState = 0;
563|6.22k|    unsigned SrcDstRegState = getDefRegState(!IsStore);
564|6.22k|    if (i + 1 == e) {
565|2.25k|      SOffsetRegState |= getKillRegState(Scavenged);
566|2.25k|      // The last implicit use carries the "Kill" flag.
567|2.25k|      SrcDstRegState |= getKillRegState(IsKill);
568|2.25k|    }
569|6.22k|
570|6.22k|    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
571|6.22k|    MachineMemOperand *NewMMO
572|6.22k|      = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
573|6.22k|                                 EltSize, MinAlign(Align, EltSize * i));
574|6.22k|
575|6.22k|    auto MIB = BuildMI(*MBB, MI, DL, Desc)
576|6.22k|      .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
577|6.22k|      .addReg(ScratchRsrcReg)
578|6.22k|      .addReg(SOffset, SOffsetRegState)
579|6.22k|      .addImm(Offset)
580|6.22k|      .addImm(0) // glc
581|6.22k|      .addImm(0) // slc
582|6.22k|      .addImm(0) // tfe
583|6.22k|      .addMemOperand(NewMMO);
584|6.22k|
585|6.22k|    if (NumSubRegs > 1)
586|5.31k|      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
587|6.22k|  }
588|2.25k|
589|2.25k|  if (RanOutOfSGPRs) {
590|232|    // Subtract the offset we added to the ScratchOffset register.
591|232|    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
592|232|      .addReg(ScratchOffsetReg)
593|232|      .addImm(OriginalImmOffset);
594|232|  }
595|2.25k|}
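
buildSpillLoadStore() above emits one 4-byte buffer access per 32-bit sub-register, and when the final immediate would overflow 12 bits it folds the offset into the scratch offset register and undoes that afterwards. A standalone sketch of just that offset bookkeeping (hypothetical values; the printed pseudo-instructions stand in for the BuildMI calls):

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t ObjectOffset = 4092; // hypothetical frame object offset
  const unsigned NumSubRegs = 4;     // e.g. a 128-bit register split into 4 dwords
  const unsigned EltSize = 4;

  int64_t Offset = ObjectOffset;
  // Same overflow test as the !isUInt<12>(Offset + Size) check above.
  bool FoldIntoSOffset = (Offset + NumSubRegs * EltSize) >= (1 << 12);
  if (FoldIntoSOffset) {
    std::printf("s_add_u32 soffset, soffset, %lld\n", (long long)Offset);
    Offset = 0;
  }
  for (unsigned i = 0; i < NumSubRegs; ++i, Offset += EltSize)
    std::printf("buffer access for sub-register %u at immediate offset %lld\n",
                i, (long long)Offset);
  if (FoldIntoSOffset)
    std::printf("s_sub_u32 soffset, soffset, %lld\n", (long long)ObjectOffset);
  return 0;
}
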
596||
597||static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
598|28|                                                     bool Store) {
599|28|  if (SuperRegSize % 16 == 0) {
600|3|    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
601|3|                         AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
602|6|  }
603|22|
604|22|  if (SuperRegSize % 8 == 0) {
605|8|    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
606|8|                        AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
607|16|  }
608|6|
609|6|  return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
610|3|                      AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
611|28|}
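
getSpillEltSize() above picks the widest scalar buffer operation whose size evenly divides the SGPR super-register. A standalone sketch of the same selection, with strings standing in for the S_BUFFER_* opcodes:

#include <cstdio>
#include <utility>

// 16, 8, or 4 bytes per element, mirroring the divisibility checks above.
static std::pair<unsigned, const char *> spillEltSize(unsigned SuperRegSizeBytes, bool Store) {
  if (SuperRegSizeBytes % 16 == 0)
    return {16, Store ? "s_buffer_store_dwordx4" : "s_buffer_load_dwordx4"};
  if (SuperRegSizeBytes % 8 == 0)
    return {8, Store ? "s_buffer_store_dwordx2" : "s_buffer_load_dwordx2"};
  return {4, Store ? "s_buffer_store_dword" : "s_buffer_load_dword"};
}

int main() {
  auto R = spillEltSize(32, /*Store=*/true); // a 256-bit SGPR tuple
  std::printf("element size %u bytes via %s\n", R.first, R.second); // 16 bytes via s_buffer_store_dwordx4
  return 0;
}
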
612||
613||bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
614||                               int Index,
615||                               RegScavenger *RS,
616|594|                               bool OnlyToVGPR) const {
617|594|  MachineBasicBlock *MBB = MI->getParent();
618|594|  MachineFunction *MF = MBB->getParent();
619|594|  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
620|594|
621|594|  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
622|594|    = MFI->getSGPRToVGPRSpills(Index);
623|594|  bool SpillToVGPR = !VGPRSpills.empty();
624|594|  if (OnlyToVGPR && !SpillToVGPR)
625|0|    return false;
626|594|
627|594|  MachineRegisterInfo &MRI = MF->getRegInfo();
628|594|  const SISubtarget &ST =  MF->getSubtarget<SISubtarget>();
629|594|  const SIInstrInfo *TII = ST.getInstrInfo();
630|594|
631|594|  unsigned SuperReg = MI->getOperand(0).getReg();
632|594|  bool IsKill = MI->getOperand(0).isKill();
633|594|  const DebugLoc &DL = MI->getDebugLoc();
634|594|
635|594|  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
636|594|
637|594|  bool SpillToSMEM = spillSGPRToSMEM();
638|594|  if (SpillToSMEM && OnlyToVGPR)
639|0|    return false;
640|594|
641|594|  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
642|594|
643|594|  unsigned OffsetReg = AMDGPU::M0;
644|594|  unsigned M0CopyReg = AMDGPU::NoRegister;
645|594|
646|594|  if (SpillToSMEM) {
647|14|    if (RS->isRegUsed(AMDGPU::M0)) {
648|14|      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
649|14|      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
650|14|        .addReg(AMDGPU::M0);
651|14|    }
652|14|  }
653|594|
654|594|  unsigned ScalarStoreOp;
655|594|  unsigned EltSize = 4;
656|594|  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
657|594|  if (SpillToSMEM && isSGPRClass(RC)) {
658|14|    // XXX - if private_element_size is larger than 4 it might be useful to be
659|14|    // able to spill wider vmem spills.
660|14|    std::tie(EltSize, ScalarStoreOp) =
661|14|          getSpillEltSize(getRegSizeInBits(*RC) / 8, true);
662|14|  }
663|594|
664|594|  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
665|594|  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
666|594|
667|594|  // SubReg carries the "Kill" flag when SubReg == SuperReg.
668|466|  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
669|1.61k|  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
670|1.02k|    unsigned SubReg = NumSubRegs == 1 ?
671|1.02k|      SuperReg : getSubReg(SuperReg, SplitParts[i]);
672|1.02k|
673|1.02k|    if (SpillToSMEM) {
674|15|      int64_t FrOffset = FrameInfo.getObjectOffset(Index);
675|15|
676|15|      // The allocated memory size is really the wavefront size * the frame
677|15|      // index size. The widest register class is 64 bytes, so a 4-byte scratch
678|15|      // allocation is enough to spill this in a single stack object.
679|15|      //
680|15|      // FIXME: Frame size/offsets are computed earlier than this, so the extra
681|15|      // space is still unnecessarily allocated.
682|15|
683|15|      unsigned Align = FrameInfo.getObjectAlignment(Index);
684|15|      MachinePointerInfo PtrInfo
685|15|        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
686|15|      MachineMemOperand *MMO
687|15|        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
688|15|                                   EltSize, MinAlign(Align, EltSize * i));
689|15|
690|15|      // SMEM instructions only support a single offset, so increment the wave
691|15|      // offset.
692|15|
693|15|      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
694|15|      if (Offset != 0) {
695|15|        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
696|15|          .addReg(MFI->getFrameOffsetReg())
697|15|          .addImm(Offset);
698|15|      } else {
699|0|        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
700|0|          .addReg(MFI->getFrameOffsetReg());
701|0|      }
702|15|
703|15|      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
704|15|        .addReg(SubReg, getKillRegState(IsKill)) // sdata
705|15|        .addReg(MFI->getScratchRSrcReg())        // sbase
706|15|        .addReg(OffsetReg, RegState::Kill)       // soff
707|15|        .addImm(0)                               // glc
708|15|        .addMemOperand(MMO);
709|15|
710|15|      continue;
711|15|    }
712|1.00k|
713|1.00k|    if (SpillToVGPR) {
714|925|      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
715|925|
716|925|      BuildMI(*MBB, MI, DL,
717|925|              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
718|925|              Spill.VGPR)
719|925|        .addReg(SubReg, getKillRegState(IsKill))
720|925|        .addImm(Spill.Lane);
721|925|
722|925|      // FIXME: Since this spills to another register instead of an actual
723|925|      // frame index, we should delete the frame index when all references to
724|925|      // it are fixed.
725|1.00k|    } else {
726|82|      // XXX - Can to VGPR spill fail for some subregisters but not others?
727|82|      if (OnlyToVGPR)
728|0|        return false;
729|82|
730|82|      // Spill SGPR to a frame index.
731|82|      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
732|82|      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
733|82|      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
734|82|
735|82|      MachineInstrBuilder Mov
736|82|        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
737|82|        .addReg(SubReg, SubKillState);
738|82|
739|82|
740|82|      // There could be undef components of a spilled super register.
741|82|      // TODO: Can we detect this and skip the spill?
742|82|      if (NumSubRegs > 1) {
743|76|        // The last implicit use of the SuperReg carries the "Kill" flag.
744|76|        unsigned SuperKillState = 0;
745|76|        if (i + 1 == e)
746|26|          SuperKillState |= getKillRegState(IsKill);
747|76|        Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
748|76|      }
749|82|
750|82|      unsigned Align = FrameInfo.getObjectAlignment(Index);
751|82|      MachinePointerInfo PtrInfo
752|82|        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
753|82|      MachineMemOperand *MMO
754|82|        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
755|82|                                   EltSize, MinAlign(Align, EltSize * i));
756|82|      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
757|82|        .addReg(TmpReg, RegState::Kill)    // src
758|82|        .addFrameIndex(Index)              // vaddr
759|82|        .addReg(MFI->getScratchRSrcReg())  // srrsrc
760|82|        .addReg(MFI->getFrameOffsetReg())  // soffset
761|82|        .addImm(i * 4)                     // offset
762|82|        .addMemOperand(MMO);
763|82|    }
764|1.02k|  }
765|594|
766|594|  if (M0CopyReg != AMDGPU::NoRegister) {
767|14|    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
768|14|      .addReg(M0CopyReg, RegState::Kill);
769|14|  }
770|594|
771|594|  MI->eraseFromParent();
772|594|  MFI->addToSpilledSGPRs(NumSubRegs);
773|594|  return true;
774|594|}
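
The SGPR-to-VGPR path in spillSGPR() writes each 32-bit piece of the super-register into an assigned lane of a spill VGPR (V_WRITELANE_B32); restoreSGPR() below reads it back with V_READLANE_B32. A standalone model of that lane bookkeeping, with a VGPR represented as 64 lanes and made-up lane assignments:

#include <array>
#include <cassert>
#include <cstdint>
#include <vector>

struct SpilledReg { unsigned VGPR; unsigned Lane; }; // same shape as the Spill records above

int main() {
  std::array<std::vector<uint32_t>, 2> VGPRs;    // two hypothetical spill VGPRs
  for (auto &V : VGPRs) V.assign(64, 0);         // 64 lanes per VGPR (wavefront size)

  std::vector<uint32_t> SGPRQuad = {1, 2, 3, 4}; // a 128-bit SGPR tuple to spill
  std::vector<SpilledReg> Slots = {{0, 5}, {0, 6}, {0, 7}, {1, 0}}; // assigned VGPR/lane per piece

  for (unsigned i = 0; i < SGPRQuad.size(); ++i) // models v_writelane_b32
    VGPRs[Slots[i].VGPR][Slots[i].Lane] = SGPRQuad[i];

  for (unsigned i = 0; i < SGPRQuad.size(); ++i) // models v_readlane_b32 on restore
    assert(VGPRs[Slots[i].VGPR][Slots[i].Lane] == SGPRQuad[i]);
  return 0;
}
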
775||
776||bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
777||                                 int Index,
778||                                 RegScavenger *RS,
779|582|                                 bool OnlyToVGPR) const {
780|582|  MachineFunction *MF = MI->getParent()->getParent();
781|582|  MachineRegisterInfo &MRI = MF->getRegInfo();
782|582|  MachineBasicBlock *MBB = MI->getParent();
783|582|  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
784|582|
785|582|  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
786|582|    = MFI->getSGPRToVGPRSpills(Index);
787|582|  bool SpillToVGPR = !VGPRSpills.empty();
788|582|  if (OnlyToVGPR && !SpillToVGPR)
789|0|    return false;
790|582|
791|582|  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
792|582|  const SISubtarget &ST =  MF->getSubtarget<SISubtarget>();
793|582|  const SIInstrInfo *TII = ST.getInstrInfo();
794|582|  const DebugLoc &DL = MI->getDebugLoc();
795|582|
796|582|  unsigned SuperReg = MI->getOperand(0).getReg();
797|582|  bool SpillToSMEM = spillSGPRToSMEM();
798|582|  if (SpillToSMEM && OnlyToVGPR)
799|0|    return false;
800|582|
801|582|  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
802|582|
803|582|  unsigned OffsetReg = AMDGPU::M0;
804|582|  unsigned M0CopyReg = AMDGPU::NoRegister;
805|582|
806|582|  if (SpillToSMEM) {
807|14|    if (RS->isRegUsed(AMDGPU::M0)) {
808|14|      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
809|14|      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
810|14|        .addReg(AMDGPU::M0);
811|14|    }
812|14|  }
813|582|
814|582|  unsigned EltSize = 4;
815|582|  unsigned ScalarLoadOp;
816|582|
817|582|  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
818|582|  if (SpillToSMEM && isSGPRClass(RC)) {
819|14|    // XXX - if private_element_size is larger than 4 it might be useful to be
820|14|    // able to spill wider vmem spills.
821|14|    std::tie(EltSize, ScalarLoadOp) =
822|14|          getSpillEltSize(getRegSizeInBits(*RC) / 8, false);
823|14|  }
824|582|
825|582|  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
826|582|  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
827|582|
828|582|  // SubReg carries the "Kill" flag when SubReg == SuperReg.
829|582|  int64_t FrOffset = FrameInfo.getObjectOffset(Index);
830|582|
831|1.58k|  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
832|1.00k|    unsigned SubReg = NumSubRegs == 1 ?
833|1.00k|      SuperReg : getSubReg(SuperReg, SplitParts[i]);
834|1.00k|
835|1.00k|    if (SpillToSMEM) {
836|15|      // FIXME: Size may be > 4 but extra bytes wasted.
837|15|      unsigned Align = FrameInfo.getObjectAlignment(Index);
838|15|      MachinePointerInfo PtrInfo
839|15|        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
840|15|      MachineMemOperand *MMO
841|15|        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
842|15|                                   EltSize, MinAlign(Align, EltSize * i));
843|15|
844|15|      // Add i * 4 offset
845|15|      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
846|15|      if (Offset != 0) {
847|15|        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
848|15|          .addReg(MFI->getFrameOffsetReg())
849|15|          .addImm(Offset);
850|15|      } else {
851|0|        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
852|0|          .addReg(MFI->getFrameOffsetReg());
853|0|      }
854|15|
855|15|      auto MIB =
856|15|        BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
857|15|        .addReg(MFI->getScratchRSrcReg()) // sbase
858|15|        .addReg(OffsetReg, RegState::Kill)                // soff
859|15|        .addImm(0)                        // glc
860|15|        .addMemOperand(MMO);
861|15|
862|15|      if (NumSubRegs > 1)
863|2|        MIB.addReg(SuperReg, RegState::ImplicitDefine);
864|15|
865|15|      continue;
866|15|    }
867|989|
868|989|    if (SpillToVGPR) {
869|907|      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
870|907|      auto MIB =
871|907|        BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
872|907|                SubReg)
873|907|        .addReg(Spill.VGPR)
874|907|        .addImm(Spill.Lane);
875|907|
876|907|      if (NumSubRegs > 1)
877|466|        MIB.addReg(SuperReg, RegState::ImplicitDefine);
878|989|    } else {
879|82|      if (OnlyToVGPR)
880|0|        return false;
881|82|
882|82|      // Restore SGPR from a stack slot.
883|82|      // FIXME: We should use S_LOAD_DWORD here for VI.
884|82|      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
885|82|      unsigned Align = FrameInfo.getObjectAlignment(Index);
886|82|
887|82|      MachinePointerInfo PtrInfo
888|82|        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
889|82|
890|82|      MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
891|82|        MachineMemOperand::MOLoad, EltSize,
892|82|        MinAlign(Align, EltSize * i));
893|82|
894|82|      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
895|82|        .addFrameIndex(Index)              // vaddr
896|82|        .addReg(MFI->getScratchRSrcReg())  // srsrc
897|82|        .addReg(MFI->getFrameOffsetReg())  // soffset
898|82|        .addImm(i * 4)                     // offset
899|82|        .addMemOperand(MMO);
900|82|
901|82|      auto MIB =
902|82|        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
903|82|        .addReg(TmpReg, RegState::Kill);
904|82|
905|82|      if (NumSubRegs > 1)
906|76|        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
907|82|    }
908|1.00k|  }
909|582|
910|582|  if (M0CopyReg != AMDGPU::NoRegister) {
911|14|    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
912|14|      .addReg(M0CopyReg, RegState::Kill);
913|14|  }
914|582|
915|582|  MI->eraseFromParent();
916|582|  return true;
917|582|}
918||
919||/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
920||/// a VGPR and the stack slot can be safely eliminated when all other users are
921||/// handled.
922||bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
923||  MachineBasicBlock::iterator MI,
924||  int FI,
925|1.08k|  RegScavenger *RS) const {
926|1.08k|  switch (MI->getOpcode()) {
927|548|  case AMDGPU::SI_SPILL_S512_SAVE:
928|548|  case AMDGPU::SI_SPILL_S256_SAVE:
929|548|  case AMDGPU::SI_SPILL_S128_SAVE:
930|548|  case AMDGPU::SI_SPILL_S64_SAVE:
931|548|  case AMDGPU::SI_SPILL_S32_SAVE:
932|548|    return spillSGPR(MI, FI, RS, true);
933|536|  case AMDGPU::SI_SPILL_S512_RESTORE:
934|536|  case AMDGPU::SI_SPILL_S256_RESTORE:
935|536|  case AMDGPU::SI_SPILL_S128_RESTORE:
936|536|  case AMDGPU::SI_SPILL_S64_RESTORE:
937|536|  case AMDGPU::SI_SPILL_S32_RESTORE:
938|536|    return restoreSGPR(MI, FI, RS, true);
939|0|  default:
940|0|    llvm_unreachable("not an SGPR spill instruction");
941|0|  }
942|0|}
943||
944||void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
945||                                        int SPAdj, unsigned FIOperandNum,
946|7.48k|                                        RegScavenger *RS) const {
947|7.48k|  MachineFunction *MF = MI->getParent()->getParent();
948|7.48k|  MachineRegisterInfo &MRI = MF->getRegInfo();
949|7.48k|  MachineBasicBlock *MBB = MI->getParent();
950|7.48k|  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
951|7.48k|  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
952|7.48k|  const SISubtarget &ST =  MF->getSubtarget<SISubtarget>();
953|7.48k|  const SIInstrInfo *TII = ST.getInstrInfo();
954|7.48k|  DebugLoc DL = MI->getDebugLoc();
955|7.48k|
956|7.48k|  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
957|7.48k|  int Index = MI->getOperand(FIOperandNum).getIndex();
958|7.48k|
959|7.48k|  switch (MI->getOpcode()) {
960|7.48k|    // SGPR register spill
961|46|    case AMDGPU::SI_SPILL_S512_SAVE:
962|46|    case AMDGPU::SI_SPILL_S256_SAVE:
963|46|    case AMDGPU::SI_SPILL_S128_SAVE:
964|46|    case AMDGPU::SI_SPILL_S64_SAVE:
965|46|    case AMDGPU::SI_SPILL_S32_SAVE: {
966|46|      spillSGPR(MI, Index, RS);
967|46|      break;
968|46|    }
969|46|
970|46|    // SGPR register restore
971|46|    case AMDGPU::SI_SPILL_S512_RESTORE:
972|46|    case AMDGPU::SI_SPILL_S256_RESTORE:
973|46|    case AMDGPU::SI_SPILL_S128_RESTORE:
974|46|    case AMDGPU::SI_SPILL_S64_RESTORE:
975|46|    case AMDGPU::SI_SPILL_S32_RESTORE: {
976|46|      restoreSGPR(MI, Index, RS);
977|46|      break;
978|46|    }
979|46|
980|46|    // VGPR register spill
981|1.16k|    case AMDGPU::SI_SPILL_V512_SAVE:
982|1.16k|    case AMDGPU::SI_SPILL_V256_SAVE:
983|1.16k|    case AMDGPU::SI_SPILL_V128_SAVE:
984|1.16k|    case AMDGPU::SI_SPILL_V96_SAVE:
985|1.16k|    case AMDGPU::SI_SPILL_V64_SAVE:
986|1.16k|    case AMDGPU::SI_SPILL_V32_SAVE: {
987|1.16k|      const MachineOperand *VData = TII->getNamedOperand(*MI,
988|1.16k|                                                         AMDGPU::OpName::vdata);
989|1.16k|      buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
990|1.16k|            Index,
991|1.16k|            VData->getReg(), VData->isKill(),
992|1.16k|            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
993|1.16k|            TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
994|1.16k|            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
995|1.16k|            *MI->memoperands_begin(),
996|1.16k|            RS);
997|1.16k|      MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
998|1.16k|      MI->eraseFromParent();
999|1.16k|      break;
1000|1.16k|    }
1001|1.08k|    case AMDGPU::SI_SPILL_V32_RESTORE:
1002|1.08k|    case AMDGPU::SI_SPILL_V64_RESTORE:
1003|1.08k|    case AMDGPU::SI_SPILL_V96_RESTORE:
1004|1.08k|    case AMDGPU::SI_SPILL_V128_RESTORE:
1005|1.08k|    case AMDGPU::SI_SPILL_V256_RESTORE:
1006|1.08k|    case AMDGPU::SI_SPILL_V512_RESTORE: {
1007|1.08k|      const MachineOperand *VData = TII->getNamedOperand(*MI,
1008|1.08k|                                                         AMDGPU::OpName::vdata);
1009|1.08k|
1010|1.08k|      buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
1011|1.08k|            Index,
1012|1.08k|            VData->getReg(), VData->isKill(),
1013|1.08k|            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
1014|1.08k|            TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
1015|1.08k|            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1016|1.08k|            *MI->memoperands_begin(),
1017|1.08k|            RS);
1018|1.08k|      MI->eraseFromParent();
1019|1.08k|      break;
1020|1.08k|    }
1021|1.08k|
1022|5.14k|    default: {
1023|5.14k|      const DebugLoc &DL = MI->getDebugLoc();
1024|5.14k|      bool IsMUBUF = TII->isMUBUF(*MI);
1025|5.14k|
1026|5.14k|      if (!IsMUBUF &&
1027|5.14k|          MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) {
1028|21|        // Convert to an absolute stack address by finding the offset from the
1029|21|        // scratch wave base and scaling by the wave size.
1030|21|        //
1031|21|        // In an entry function/kernel the stack address is already the absolute
1032|21|        // address relative to the the scratch wave offset.
1033|21|
1034|21|        unsigned DiffReg
1035|21|          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
1036|21|
1037|21|        bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
1038|21|        unsigned ResultReg = IsCopy ?
1039|21|          MI->getOperand(0).getReg() :
1040|0|          MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1041|21|
1042|21|        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
1043|21|          .addReg(MFI->getFrameOffsetReg())
1044|21|          .addReg(MFI->getScratchWaveOffsetReg());
1045|21|
1046|21|        int64_t Offset = FrameInfo.getObjectOffset(Index);
1047|21|        if (Offset == 0) {
1048|0|          // XXX - This never happens because of emergency scavenging slot at 0?
1049|0|          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
1050|0|            .addImm(Log2_32(ST.getWavefrontSize()))
1051|0|            .addReg(DiffReg);
1052|21|        } else {
1053|21|          unsigned CarryOut
1054|21|            = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
1055|21|          unsigned ScaledReg
1056|21|            = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1057|21|
1058|21|          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
1059|21|            .addImm(Log2_32(ST.getWavefrontSize()))
1060|21|            .addReg(DiffReg, RegState::Kill);
1061|21|
1062|21|          // TODO: Fold if use instruction is another add of a constant.
1063|21|          if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
1064|19|            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
1065|19|              .addReg(CarryOut, RegState::Define | RegState::Dead)
1066|19|              .addImm(Offset)
1067|19|              .addReg(ScaledReg, RegState::Kill);
1068|21|          } else {
1069|2|            unsigned ConstOffsetReg
1070|2|              = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
1071|2|
1072|2|            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
1073|2|              .addImm(Offset);
1074|2|            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
1075|2|              .addReg(CarryOut, RegState::Define | RegState::Dead)
1076|2|              .addReg(ConstOffsetReg, RegState::Kill)
1077|2|              .addReg(ScaledReg, RegState::Kill);
1078|2|          }
1079|21|
1080|21|          MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC);
1081|21|        }
1082|21|
1083|21|        // Don't introduce an extra copy if we're just materializing in a mov.
1084|21|        if (IsCopy)
1085|21|          MI->eraseFromParent();
1086|21|        else
1087|0|          FIOp.ChangeToRegister(ResultReg, false, false, true);
1088|21|        return;
1089|21|      }
1090|5.12k|
1091|5.12k|      if (IsMUBUF) {
1092|4.80k|        // Disable offen so we don't need a 0 vgpr base.
1093|4.80k|        assert(static_cast<int>(FIOperandNum) ==
1094|4.80k|               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1095|4.80k|                                          AMDGPU::OpName::vaddr));
1096|4.80k|
1097|4.80k|        assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg()
1098|4.80k|               == MFI->getFrameOffsetReg());
1099|4.80k|
1100|4.80k|        int64_t Offset = FrameInfo.getObjectOffset(Index);
1101|4.80k|        int64_t OldImm
1102|4.80k|          = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
1103|4.80k|        int64_t NewOffset = OldImm + Offset;
1104|4.80k|
1105|4.80k|        if (isUInt<12>(NewOffset) &&
1106|4.80k|            buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
1107|4.78k|          MI->eraseFromParent();
1108|4.78k|          return;
1109|4.78k|        }
1110|335|      }
1111|335|
1112|335|      // If the offset is simply too big, don't convert to a scratch wave offset
1113|335|      // relative index.
1114|335|
1115|335|      int64_t Offset = FrameInfo.getObjectOffset(Index);
1116|335|      FIOp.ChangeToImmediate(Offset);
1117|335|      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
1118|16|        unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1119|16|        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
1120|16|          .addImm(Offset);
1121|16|        FIOp.ChangeToRegister(TmpReg, false, false, true);
1122|16|      }
1123|46|    }
1124|7.48k|  }
1125|7.48k|}
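
The non-MUBUF path in eliminateFrameIndex() above converts a frame index into an absolute per-lane stack address: (frame offset register - scratch wave offset register), scaled down by the wave size, plus the object's frame offset. A standalone sketch of that arithmetic with hypothetical register values:

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t FrameOffsetReg = 1 << 20;       // hypothetical wave-scaled frame offset
  const int64_t ScratchWaveOffsetReg = 1 << 18; // hypothetical scratch wave base
  const int64_t ObjectOffset = 16;              // frame object offset
  const unsigned WaveSizeLog2 = 6;              // log2 of a 64-lane wavefront

  int64_t Diff = FrameOffsetReg - ScratchWaveOffsetReg; // models the S_SUB_U32
  int64_t Scaled = Diff >> WaveSizeLog2;                // models the V_LSHRREV_B32
  int64_t Result = Scaled + ObjectOffset;               // models the V_ADD_I32
  std::printf("per-lane stack address: %lld\n", (long long)Result);
  return 0;
}
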
1126||
1127|6.56M|StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
1128|6.56M|  #define AMDGPU_REG_ASM_NAMES
1129|6.56M|  #include "AMDGPURegAsmNames.inc.cpp"
1130|6.56M|
1131|6.56M|  #define REG_RANGE(BeginReg, EndReg, RegTable)            \
1132|49.3M|    if (Reg >= BeginReg && Reg <= EndReg) { \
1133|3.04M|      unsigned Index = Reg - BeginReg;                     \
1134|3.04M|      assert(Index < array_lengthof(RegTable));            \
1135|3.04M|      return RegTable[Index];                              \
1136|3.04M|    }
1137|6.56M|
1138|6.56M|  REG_RANGE(AMDGPU::VGPR0, AMDGPU::VGPR255, VGPR32RegNames);
1139|5.73M|  REG_RANGE(AMDGPU::SGPR0, AMDGPU::SGPR103, SGPR32RegNames);
1140|4.99M|  REG_RANGE(AMDGPU::VGPR0_VGPR1, AMDGPU::VGPR254_VGPR255, VGPR64RegNames);
1141|4.50M|  REG_RANGE(AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR102_SGPR103, SGPR64RegNames);
1142|4.30M|  REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2, AMDGPU::VGPR253_VGPR254_VGPR255,
1143|4.30M|            VGPR96RegNames);
1144|4.30M|
1145|4.30M|  REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3,
1146|4.30M|            AMDGPU::VGPR252_VGPR253_VGPR254_VGPR255,
1147|4.06M|            VGPR128RegNames);
1148|4.06M|  REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
1149|4.06M|            AMDGPU::SGPR100_SGPR101_SGPR102_SGPR103,
1150|4.02M|            SGPR128RegNames);
1151|4.02M|
1152|4.02M|  REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7,
1153|4.02M|            AMDGPU::VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255,
1154|3.78M|            VGPR256RegNames);
1155|3.78M|
1156|3.78M|  REG_RANGE(
1157|3.78M|    AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15,
1158|3.78M|    AMDGPU::VGPR240_VGPR241_VGPR242_VGPR243_VGPR244_VGPR245_VGPR246_VGPR247_VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255,
1159|3.56M|    VGPR512RegNames);
1160|3.56M|
1161|3.56M|  REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7,
1162|3.56M|            AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103,
1163|3.54M|            SGPR256RegNames);
1164|3.54M|
1165|3.54M|  REG_RANGE(
1166|3.54M|    AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7_SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15,
1167|3.54M|    AMDGPU::SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95_SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103,
1168|3.54M|    SGPR512RegNames
1169|3.51M|  );
1170|3.51M|
1171|3.51M|#undef REG_RANGE
1172|3.51M|
1173|3.51M|  // FIXME: Rename flat_scr so we don't need to special case this.
1174|3.51M|  switch (Reg) {
1175|2.85k|  case AMDGPU::FLAT_SCR:
1176|2.85k|    return "flat_scratch";
1177|5.98k|  case AMDGPU::FLAT_SCR_LO:
1178|5.98k|    return "flat_scratch_lo";
1179|5.98k|  case AMDGPU::FLAT_SCR_HI:
1180|5.98k|    return "flat_scratch_hi";
1181|3.50M|  default:
1182|3.50M|    // For the special named registers the default is fine.
1183|3.50M|    return TargetRegisterInfo::getRegAsmName(Reg);
1184|0|  }
1185|0|}
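
getRegAsmName() above names contiguous register ranges by indexing a generated table with (Reg - BeginReg). A standalone sketch of that pattern with a hypothetical enum and name table (not the generated AMDGPU definitions):

#include <cassert>
#include <cstring>

int main() {
  enum HypotheticalReg { VGPR0 = 100, VGPR1, VGPR2, VGPR3 };        // contiguous enum values
  static const char *const VGPR32Names[] = {"v0", "v1", "v2", "v3"}; // per-class name table

  unsigned Reg = VGPR2;
  if (Reg >= VGPR0 && Reg <= VGPR3) {
    unsigned Index = Reg - VGPR0;                    // same index computation as REG_RANGE
    assert(std::strcmp(VGPR32Names[Index], "v2") == 0);
  }
  return 0;
}
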
1186
1187
// FIXME: This is very slow. It might be worth creating a map from physreg to
1188
// register class.
1189
4.42M
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
1190
4.42M
  assert(!TargetRegisterInfo::isVirtualRegister(Reg));
1191
4.42M
1192
4.42M
  static const TargetRegisterClass *const BaseClasses[] = {
1193
4.42M
    &AMDGPU::VGPR_32RegClass,
1194
4.42M
    &AMDGPU::SReg_32RegClass,
1195
4.42M
    &AMDGPU::VReg_64RegClass,
1196
4.42M
    &AMDGPU::SReg_64RegClass,
1197
4.42M
    &AMDGPU::VReg_96RegClass,
1198
4.42M
    &AMDGPU::VReg_128RegClass,
1199
4.42M
    &AMDGPU::SReg_128RegClass,
1200
4.42M
    &AMDGPU::VReg_256RegClass,
1201
4.42M
    &AMDGPU::SReg_256RegClass,
1202
4.42M
    &AMDGPU::VReg_512RegClass,
1203
4.42M
    &AMDGPU::SReg_512RegClass,
1204
4.42M
    &AMDGPU::SCC_CLASSRegClass,
1205
4.42M
  };
1206
4.42M
1207
14.0M
  for (const TargetRegisterClass *BaseClass : BaseClasses) {
1208
14.0M
    if (
BaseClass->contains(Reg)14.0M
) {
1209
4.42M
      return BaseClass;
1210
4.42M
    }
1211
0
  }
1212
0
  return nullptr;
1213
0
}
1214
1215
// TODO: It might be helpful to have some target specific flags in
1216
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
1217
9.31M
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
1218
9.31M
  unsigned Size = getRegSizeInBits(*RC);
1219
9.31M
  if (Size < 32)
1220
3.85k
    return false;
1221
9.31M
  switch (Size) {
1222
4.94M
  case 32:
1223
4.94M
    return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
1224
3.18M
  case 64:
1225
3.18M
    return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
1226
1.19k
  case 96:
1227
1.19k
    return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
1228
1.07M
  case 128:
1229
1.07M
    return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
1230
74.6k
  case 256:
1231
74.6k
    return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
1232
37.2k
  case 512:
1233
37.2k
    return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
1234
0
  default:
1235
0
    llvm_unreachable("Invalid register class size");
1236
0
  }
1237
0
}
1238
1239
const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
1240
154k
                                         const TargetRegisterClass *SRC) const {
1241
154k
  switch (getRegSizeInBits(*SRC)) {
1242
129k
  case 32:
1243
129k
    return &AMDGPU::VGPR_32RegClass;
1244
20.0k
  case 64:
1245
20.0k
    return &AMDGPU::VReg_64RegClass;
1246
0
  case 96:
1247
0
    return &AMDGPU::VReg_96RegClass;
1248
4.65k
  case 128:
1249
4.65k
    return &AMDGPU::VReg_128RegClass;
1250
53
  case 256:
1251
53
    return &AMDGPU::VReg_256RegClass;
1252
51
  case 512:
1253
51
    return &AMDGPU::VReg_512RegClass;
1254
0
  default:
1255
0
    llvm_unreachable("Invalid register class size");
1256
0
  }
1257
0
}
1258
1259
const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
1260
1.76k
                                         const TargetRegisterClass *VRC) const {
1261
1.76k
  switch (getRegSizeInBits(*VRC)) {
1262
1.27k
  case 32:
1263
1.27k
    return &AMDGPU::SGPR_32RegClass;
1264
473
  case 64:
1265
473
    return &AMDGPU::SReg_64RegClass;
1266
8
  case 128:
1267
8
    return &AMDGPU::SReg_128RegClass;
1268
2
  case 256:
1269
2
    return &AMDGPU::SReg_256RegClass;
1270
0
  case 512:
1271
0
    return &AMDGPU::SReg_512RegClass;
1272
0
  default:
1273
0
    llvm_unreachable("Invalid register class size");
1274
0
  }
1275
0
}
1276
1277
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
1278
335k
                         const TargetRegisterClass *RC, unsigned SubIdx) const {
1279
335k
  if (SubIdx == AMDGPU::NoSubRegister)
1280
285k
    return RC;
1281
49.8k
1282
49.8k
  // We can assume that each lane corresponds to one 32-bit register.
1283
49.8k
  unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes();
1284
49.8k
  if (isSGPRClass(RC)) {
1285
22.8k
    switch (Count) {
1286
22.8k
    case 1:
1287
22.8k
      return &AMDGPU::SGPR_32RegClass;
1288
0
    case 2:
1289
0
      return &AMDGPU::SReg_64RegClass;
1290
0
    case 4:
1291
0
      return &AMDGPU::SReg_128RegClass;
1292
0
    case 8:
1293
0
      return &AMDGPU::SReg_256RegClass;
1294
0
    case 16: /* fall-through */
1295
0
    default:
1296
0
      llvm_unreachable("Invalid sub-register class size");
1297
49.8k
    }
1298
26.9k
  } else {
1299
26.9k
    switch (Count) {
1300
26.9k
    case 1:
1301
26.9k
      return &AMDGPU::VGPR_32RegClass;
1302
46
    case 2:
1303
46
      return &AMDGPU::VReg_64RegClass;
1304
0
    case 3:
1305
0
      return &AMDGPU::VReg_96RegClass;
1306
0
    case 4:
1307
0
      return &AMDGPU::VReg_128RegClass;
1308
0
    case 8:
1309
0
      return &AMDGPU::VReg_256RegClass;
1310
0
    case 16: /* fall-through */
1311
0
    default:
1312
0
      llvm_unreachable("Invalid sub-register class size");
1313
0
    }
1314
0
  }
1315
335k
}
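
getSubRegClass above sizes its result by the number of 32-bit lanes covered by the sub-register index. A standalone sketch of that mapping, with the lane count taken as a plain argument rather than derived from getSubRegIndexLaneMask(); the helper is illustrative only.

#include <cstdio>

namespace {
// Each lane stands for one 32-bit register, so a sub-register covering
// N lanes needs a 32*N-bit class.
const char *subRegClassName(unsigned LaneCount, bool IsSGPR) {
  switch (LaneCount) {
  case 1:  return IsSGPR ? "SGPR_32"  : "VGPR_32";
  case 2:  return IsSGPR ? "SReg_64"  : "VReg_64";
  case 3:  return IsSGPR ? nullptr    : "VReg_96";  // no 3-dword SGPR tuple
  case 4:  return IsSGPR ? "SReg_128" : "VReg_128";
  case 8:  return IsSGPR ? "SReg_256" : "VReg_256";
  default: return nullptr;                          // unhandled lane count
  }
}
} // namespace

int main() {
  // e.g. an index like sub0_sub1 covers two lanes -> a 64-bit class.
  std::printf("%s\n", subRegClassName(2, /*IsSGPR=*/false)); // VReg_64
  std::printf("%s\n", subRegClassName(1, /*IsSGPR=*/true));  // SGPR_32
  return 0;
}
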
1316
1317
bool SIRegisterInfo::shouldRewriteCopySrc(
1318
  const TargetRegisterClass *DefRC,
1319
  unsigned DefSubReg,
1320
  const TargetRegisterClass *SrcRC,
1321
355k
  unsigned SrcSubReg) const {
1322
355k
  // We want to prefer the smallest register class possible, so we don't want to
1323
355k
  // stop and rewrite on anything that looks like a subregister
1324
355k
  // extract. Operations mostly don't care about the super register class, so we
1325
355k
  // only want to stop on the most basic of copies between the same register
1326
355k
  // class.
1327
355k
  //
1328
355k
  // e.g. if we have something like
1329
355k
  // vreg0 = ...
1330
355k
  // vreg1 = ...
1331
355k
  // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
1332
355k
  // vreg3 = COPY vreg2, sub0
1333
355k
  //
1334
355k
  // We want to look through the COPY to find:
1335
355k
  //  => vreg3 = COPY vreg0
1336
355k
1337
355k
  // Plain copy.
1338
355k
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
1339
355k
}
1340
1341
/// \brief Returns a register that is not used at any point in the function.
1342
///        If all registers are used, then this function will return
1343
///        AMDGPU::NoRegister.
1344
unsigned
1345
SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
1346
                                   const TargetRegisterClass *RC,
1347
133
                                   const MachineFunction &MF) const {
1348
133
1349
133
  for (unsigned Reg : *RC)
1350
4.21k
    if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
1351
129
      return Reg;
1352
4
  return AMDGPU::NoRegister;
1353
4
}
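
findUnusedRegister above is a linear scan: the first register of the class that is allocatable and never used anywhere in the function wins, otherwise NoRegister is returned. A minimal sketch with plain bit vectors standing in for the MachineRegisterInfo queries.

#include <cstdio>
#include <vector>

namespace {
constexpr unsigned NoRegister = ~0u;  // stand-in for AMDGPU::NoRegister

// Allocatable/Used mimic MRI.isAllocatable() and MRI.isPhysRegUsed() for the
// registers of one class, indexed in allocation order.
unsigned findUnusedRegisterSketch(const std::vector<bool> &Allocatable,
                                  const std::vector<bool> &Used) {
  unsigned E = static_cast<unsigned>(Allocatable.size());
  for (unsigned Reg = 0; Reg != E; ++Reg)
    if (Allocatable[Reg] && !Used[Reg])
      return Reg;            // first free register in class order
  return NoRegister;         // every register is reserved or already used
}
} // namespace

int main() {
  std::vector<bool> Alloc = {true, true, true, true};
  std::vector<bool> Used  = {true, true, false, false};
  std::printf("%u\n", findUnusedRegisterSketch(Alloc, Used)); // prints 2
  return 0;
}
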
1354
1355
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
1356
5.44k
                                                   unsigned EltSize) const {
1357
5.44k
  if (EltSize == 4) {
1358
5.28k
    static const int16_t Sub0_15[] = {
1359
5.28k
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1360
5.28k
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1361
5.28k
      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1362
5.28k
      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1363
5.28k
    };
1364
5.28k
1365
5.28k
    static const int16_t Sub0_7[] = {
1366
5.28k
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1367
5.28k
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1368
5.28k
    };
1369
5.28k
1370
5.28k
    static const int16_t Sub0_3[] = {
1371
5.28k
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1372
5.28k
    };
1373
5.28k
1374
5.28k
    static const int16_t Sub0_2[] = {
1375
5.28k
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
1376
5.28k
    };
1377
5.28k
1378
5.28k
    static const int16_t Sub0_1[] = {
1379
5.28k
      AMDGPU::sub0, AMDGPU::sub1,
1380
5.28k
    };
1381
5.28k
1382
5.28k
    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1383
906
    case 32:
1384
906
      return {};
1385
4.13k
    case 64:
1386
4.13k
      return makeArrayRef(Sub0_1);
1387
0
    case 96:
1388
0
      return makeArrayRef(Sub0_2);
1389
163
    case 128:
1390
163
      return makeArrayRef(Sub0_3);
1391
60
    case 256:
1392
60
      return makeArrayRef(Sub0_7);
1393
21
    case 512:
1394
21
      return makeArrayRef(Sub0_15);
1395
0
    default:
1396
0
      llvm_unreachable("unhandled register size");
1397
158
    }
1398
158
  }
1399
158
1400
158
  if (EltSize == 8) {
1401
152
    static const int16_t Sub0_15_64[] = {
1402
152
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1403
152
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1404
152
      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1405
152
      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15
1406
152
    };
1407
152
1408
152
    static const int16_t Sub0_7_64[] = {
1409
152
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1410
152
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
1411
152
    };
1412
152
1413
152
1414
152
    static const int16_t Sub0_3_64[] = {
1415
152
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
1416
152
    };
1417
152
1418
152
    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1419
16
    case 64:
1420
16
      return {};
1421
136
    case 128:
1422
136
      return makeArrayRef(Sub0_3_64);
1423
0
    case 256:
1424
0
      return makeArrayRef(Sub0_7_64);
1425
0
    case 512:
1426
0
      return makeArrayRef(Sub0_15_64);
1427
0
    default:
1428
0
      llvm_unreachable("unhandled register size");
1429
6
    }
1430
6
  }
1431
6
1432
158
  assert(EltSize == 16 && "unhandled register spill split size");
1433
6
1434
6
  static const int16_t Sub0_15_128[] = {
1435
6
    AMDGPU::sub0_sub1_sub2_sub3,
1436
6
    AMDGPU::sub4_sub5_sub6_sub7,
1437
6
    AMDGPU::sub8_sub9_sub10_sub11,
1438
6
    AMDGPU::sub12_sub13_sub14_sub15
1439
6
  };
1440
6
1441
6
  static const int16_t Sub0_7_128[] = {
1442
6
    AMDGPU::sub0_sub1_sub2_sub3,
1443
6
    AMDGPU::sub4_sub5_sub6_sub7
1444
6
  };
1445
6
1446
6
  switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1447
4
  case 128:
1448
4
    return {};
1449
2
  case 256:
1450
2
    return makeArrayRef(Sub0_7_128);
1451
0
  case 512:
1452
0
    return makeArrayRef(Sub0_15_128);
1453
0
  default:
1454
0
    llvm_unreachable("unhandled register size");
1455
0
  }
1456
0
}
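
getRegSplitParts above selects a table of sub-register indices whose length is the register width divided by the split element size, and returns an empty list when the register is already a single element. The arithmetic alone, as a hedged sketch (EltSize is in bytes, as the switches above imply; the helper name is invented):

#include <cassert>

namespace {
unsigned numSplitParts(unsigned RegBits, unsigned EltSizeBytes) {
  unsigned EltBits = EltSizeBytes * 8;
  assert(RegBits % EltBits == 0 && "register not a multiple of element size");
  unsigned Parts = RegBits / EltBits;
  return Parts == 1 ? 0 : Parts;   // mirrors the empty-array cases above
}
} // namespace

int main() {
  assert(numSplitParts(256, 4) == 8);  // sub0 .. sub7
  assert(numSplitParts(128, 8) == 2);  // sub0_sub1, sub2_sub3
  assert(numSplitParts(64, 8) == 0);   // already one 64-bit element
  return 0;
}
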
1457
1458
const TargetRegisterClass*
1459
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
1460
3.41M
                                  unsigned Reg) const {
1461
3.41M
  if (TargetRegisterInfo::isVirtualRegister(Reg))
1462
112k
    return MRI.getRegClass(Reg);
1463
3.30M
1464
3.30M
  return getPhysRegClass(Reg);
1465
3.30M
}
1466
1467
bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
1468
3.25M
                            unsigned Reg) const {
1469
3.25M
  return hasVGPRs(getRegClassForReg(MRI, Reg));
1470
3.25M
}
1471
1472
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
1473
                                    const TargetRegisterClass *SrcRC,
1474
                                    unsigned SubReg,
1475
                                    const TargetRegisterClass *DstRC,
1476
                                    unsigned DstSubReg,
1477
145k
                                    const TargetRegisterClass *NewRC) const {
1478
145k
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
1479
145k
  unsigned DstSize = getRegSizeInBits(*DstRC);
1480
145k
  unsigned NewSize = getRegSizeInBits(*NewRC);
1481
145k
1482
145k
  // Do not increase the size of registers beyond a dword; we would need to allocate
1483
145k
  // adjacent registers and constrain regalloc more than needed.
1484
145k
1485
145k
  // Always allow dword coalescing.
1486
145k
  if (SrcSize <= 32 || DstSize <= 32)
1487
104k
    return true;
1488
41.2k
1489
41.2k
  return NewSize <= DstSize || NewSize <= SrcSize;
1490
145k
}
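
The coalescing test above reduces to a size predicate: copies involving a dword-or-smaller class always coalesce, and wider ones only when the merged class is no larger than one of the inputs. A standalone sketch of the same logic with the register classes reduced to their bit widths:

#include <cassert>

namespace {
bool shouldCoalesceSketch(unsigned SrcSize, unsigned DstSize, unsigned NewSize) {
  if (SrcSize <= 32 || DstSize <= 32)
    return true;                             // always allow dword coalescing
  // Refuse to grow beyond both inputs: that would demand more adjacent
  // registers and constrain the allocator further.
  return NewSize <= DstSize || NewSize <= SrcSize;
}
} // namespace

int main() {
  assert(shouldCoalesceSketch(32, 64, 64));    // dword source: allowed
  assert(shouldCoalesceSketch(64, 128, 128));  // result no wider than dst
  assert(!shouldCoalesceSketch(64, 64, 128));  // would grow past both inputs
  return 0;
}
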
1491
1492
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
1493
94.0k
                                             MachineFunction &MF) const {
1494
94.0k
1495
94.0k
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
1496
94.0k
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1497
94.0k
1498
94.0k
  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
1499
94.0k
                                                       *MF.getFunction());
1500
94.0k
  switch (RC->getID()) {
1501
0
  default:
1502
0
    return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
1503
47.0k
  case AMDGPU::VGPR_32RegClassID:
1504
47.0k
    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
1505
47.0k
  case AMDGPU::SGPR_32RegClassID:
1506
47.0k
    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
1507
0
  }
1508
0
}
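
The pressure limit above is the tighter of two bounds: the register budget implied by the achievable occupancy and the per-function maximum from the subtarget. A trivial sketch of that clamp with made-up numbers; the real values come from getMaxNumVGPRs/getMaxNumSGPRs.

#include <algorithm>
#include <cassert>

namespace {
// The limit reported to the scheduler is the smaller of the occupancy-derived
// budget and the function-level cap.
unsigned regPressureLimitSketch(unsigned MaxForOccupancy, unsigned MaxForFunction) {
  return std::min(MaxForOccupancy, MaxForFunction);
}
} // namespace

int main() {
  // e.g. occupancy would allow 64 VGPRs but the function is capped at 24.
  assert(regPressureLimitSketch(64, 24) == 24);
  return 0;
}
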
1509
1510
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
1511
871k
                                                unsigned Idx) const {
1512
871k
  if (Idx == getVGPRPressureSet())
1513
47.0k
    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
1514
47.0k
                               const_cast<MachineFunction &>(MF));
1515
824k
1516
824k
  if (Idx == getSGPRPressureSet())
1517
47.0k
    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
1518
47.0k
                               const_cast<MachineFunction &>(MF));
1519
777k
1520
777k
  return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
1521
777k
}
1522
1523
3.76M
const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
1524
3.76M
  static const int Empty[] = { -1 };
1525
3.76M
1526
3.76M
  if (hasRegUnit(AMDGPU::M0, RegUnit))
1527
1.81k
    return Empty;
1528
3.76M
  return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
1529
3.76M
}