Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Line | Count | Source
1
//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file
10
/// SI implementation of the TargetRegisterInfo class.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "SIRegisterInfo.h"
15
#include "AMDGPURegisterBankInfo.h"
16
#include "AMDGPUSubtarget.h"
17
#include "SIInstrInfo.h"
18
#include "SIMachineFunctionInfo.h"
19
#include "MCTargetDesc/AMDGPUInstPrinter.h"
20
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21
#include "llvm/CodeGen/LiveIntervals.h"
22
#include "llvm/CodeGen/MachineDominators.h"
23
#include "llvm/CodeGen/MachineFrameInfo.h"
24
#include "llvm/CodeGen/MachineInstrBuilder.h"
25
#include "llvm/CodeGen/RegisterScavenging.h"
26
#include "llvm/CodeGen/SlotIndexes.h"
27
#include "llvm/IR/Function.h"
28
#include "llvm/IR/LLVMContext.h"
29
30
using namespace llvm;
31
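// Returns true if pressure set PSetID appears in the -1-terminated list PSets.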
32
2.76M
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
33
97.6M
  for (unsigned i = 0; PSets[i] != -1; ++i) {
34
95.3M
    if (PSets[i] == (int)PSetID)
35
517k
      return true;
36
95.3M
  }
37
2.76M
  return false;
38
2.76M
}
39
40
void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
41
2.76M
                                         BitVector &PressureSets) const {
42
5.01M
  for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
43
2.76M
    const int *PSets = getRegUnitPressureSets(*U);
44
2.76M
    if (hasPressureSet(PSets, PSetID)) {
45
517k
      PressureSets.set(PSetID);
46
517k
      break;
47
517k
    }
48
2.76M
  }
49
2.76M
}
50
51
static cl::opt<bool> EnableSpillSGPRToSMEM(
52
  "amdgpu-spill-sgpr-to-smem",
53
  cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
54
  cl::init(false));
55
56
static cl::opt<bool> EnableSpillSGPRToVGPR(
57
  "amdgpu-spill-sgpr-to-vgpr",
58
  cl::desc("Enable spilling VGPRs to SGPRs"),
59
  cl::ReallyHidden,
60
  cl::init(true));
61
62
SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
63
  AMDGPURegisterInfo(),
64
  SGPRPressureSets(getNumRegPressureSets()),
65
  VGPRPressureSets(getNumRegPressureSets()),
66
  AGPRPressureSets(getNumRegPressureSets()),
67
  SpillSGPRToVGPR(false),
68
  SpillSGPRToSMEM(false),
69
3.64k
  isWave32(ST.isWave32()) {
70
3.64k
  if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
71
5
    SpillSGPRToSMEM = true;
72
3.63k
  else if (EnableSpillSGPRToVGPR)
73
3.63k
    SpillSGPRToVGPR = true;
74
3.64k
75
3.64k
  unsigned NumRegPressureSets = getNumRegPressureSets();
76
3.64k
77
3.64k
  SGPRSetID = NumRegPressureSets;
78
3.64k
  VGPRSetID = NumRegPressureSets;
79
3.64k
  AGPRSetID = NumRegPressureSets;
80
3.64k
81
924k
  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
82
921k
    classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
83
921k
    classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
84
921k
    classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets);
85
921k
  }
86
3.64k
87
3.64k
  // Determine the number of reg units for each pressure set.
88
3.64k
  std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0);
89
2.54M
  for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
90
2.54M
    const int *PSets = getRegUnitPressureSets(i);
91
58.6M
    for (unsigned j = 0; PSets[j] != -1; ++j) {
92
56.1M
      ++PressureSetRegUnits[PSets[j]];
93
56.1M
    }
94
2.54M
  }
95
3.64k
96
3.64k
  unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0;
97
924k
  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
98
921k
    if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
99
3.64k
      VGPRSetID = i;
100
3.64k
      VGPRMax = PressureSetRegUnits[i];
101
3.64k
      continue;
102
3.64k
    }
103
917k
    if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
104
218k
      SGPRSetID = i;
105
218k
      SGPRMax = PressureSetRegUnits[i];
106
218k
    }
107
917k
    if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) {
108
3.64k
      AGPRSetID = i;
109
3.64k
      AGPRMax = PressureSetRegUnits[i];
110
3.64k
      continue;
111
3.64k
    }
112
917k
  }
113
3.64k
114
3.64k
  assert(SGPRSetID < NumRegPressureSets &&
115
3.64k
         VGPRSetID < NumRegPressureSets &&
116
3.64k
         AGPRSetID < NumRegPressureSets);
117
3.64k
}
118
119
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
120
46.3k
  const MachineFunction &MF) const {
121
46.3k
122
46.3k
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
123
46.3k
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
124
46.3k
  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
125
46.3k
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
126
46.3k
}
127
128
45.1k
static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
129
45.1k
  unsigned Reg;
130
45.1k
131
45.1k
  // Try to place it in a hole after PrivateSegmentBufferReg.
132
45.1k
  if (RegCount & 3) {
133
37.9k
    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
134
37.9k
    // alignment constraints, so we have a hole where we can put the wave offset.
135
37.9k
    Reg = RegCount - 1;
136
37.9k
  } else {
137
7.23k
    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
138
7.23k
    // wave offset before it.
139
7.23k
    Reg = RegCount - 5;
140
7.23k
  }
141
45.1k
142
45.1k
  return Reg;
143
45.1k
}
144
145
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
146
45.1k
  const MachineFunction &MF) const {
147
45.1k
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
148
45.1k
  unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
149
45.1k
  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
150
45.1k
}
151
152
93.3k
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
153
93.3k
  BitVector Reserved(getNumRegs());
154
93.3k
155
93.3k
  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
156
93.3k
  // this seems likely to result in bugs, so I'm marking them as reserved.
157
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
158
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
159
93.3k
160
93.3k
  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
161
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::M0);
162
93.3k
163
93.3k
  // Reserve src_vccz, src_execz, src_scc.
164
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
165
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
166
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
167
93.3k
168
93.3k
  // Reserve the memory aperture registers.
169
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
170
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
171
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
172
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
173
93.3k
174
93.3k
  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
175
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
176
93.3k
177
93.3k
  // Reserve xnack_mask registers - support is not implemented in Codegen.
178
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
179
93.3k
180
93.3k
  // Reserve lds_direct register - support is not implemented in Codegen.
181
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
182
93.3k
183
93.3k
  // Reserve Trap Handler registers - support is not implemented in Codegen.
184
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
185
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
186
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
187
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
188
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
189
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
190
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
191
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
192
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
193
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
194
93.3k
195
93.3k
  // Reserve null register - it shall never be allocated
196
93.3k
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);
197
93.3k
198
93.3k
  // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
199
93.3k
  // will result in bugs.
200
93.3k
  if (isWave32) {
201
6.64k
    Reserved.set(AMDGPU::VCC);
202
6.64k
    Reserved.set(AMDGPU::VCC_HI);
203
6.64k
  }
204
93.3k
205
93.3k
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
206
93.3k
207
93.3k
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
208
93.3k
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
209
607k
  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
210
513k
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
211
513k
    reserveRegisterTuples(Reserved, Reg);
212
513k
  }
213
93.3k
214
93.3k
  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
215
93.3k
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
216
146k
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
217
52.9k
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
218
52.9k
    reserveRegisterTuples(Reserved, Reg);
219
52.9k
    Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
220
52.9k
    reserveRegisterTuples(Reserved, Reg);
221
52.9k
  }
222
93.3k
223
93.3k
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
224
93.3k
225
93.3k
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
226
93.3k
  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
227
93.3k
    // Reserve 1 SGPR for scratch wave offset in case we need to spill.
228
93.3k
    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
229
93.3k
  }
230
93.3k
231
93.3k
  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
232
93.3k
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
233
93.3k
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
234
93.3k
    // to spill.
235
93.3k
    // TODO: May need to reserve a VGPR if doing LDS spilling.
236
93.3k
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
237
93.3k
    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
238
93.3k
  }
239
93.3k
240
93.3k
  // We have to assume the SP is needed in case there are calls in the function,
241
93.3k
  // which is detected after the function is lowered. If we aren't really going
242
93.3k
  // to need SP, don't bother reserving it.
243
93.3k
  unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
244
93.3k
245
93.3k
  if (StackPtrReg != AMDGPU::NoRegister) {
246
93.3k
    reserveRegisterTuples(Reserved, StackPtrReg);
247
93.3k
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
248
93.3k
  }
249
93.3k
250
93.3k
  unsigned FrameReg = MFI->getFrameOffsetReg();
251
93.3k
  if (FrameReg != AMDGPU::NoRegister) {
252
93.3k
    reserveRegisterTuples(Reserved, FrameReg);
253
93.3k
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
254
93.3k
  }
255
93.3k
256
93.3k
  for (unsigned Reg : MFI->WWMReservedRegs) {
257
322
    reserveRegisterTuples(Reserved, Reg);
258
322
  }
259
93.3k
260
93.3k
  // FIXME: Stop using reserved registers for this.
261
93.3k
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
262
512
    reserveRegisterTuples(Reserved, Reg);
263
93.3k
264
93.3k
  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
265
32
    reserveRegisterTuples(Reserved, Reg);
266
93.3k
267
93.3k
  return Reserved;
268
93.3k
}
269
270
5.60k
bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
271
5.60k
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
272
5.60k
  // On entry, the base address is 0, so it can't possibly need any more
273
5.60k
  // alignment.
274
5.60k
275
5.60k
  // FIXME: Should be able to specify the entry frame alignment per calling
276
5.60k
  // convention instead.
277
5.60k
  if (Info->isEntryFunction())
278
5.51k
    return false;
279
95
280
95
  return TargetRegisterInfo::canRealignStack(MF);
281
95
}
282
283
50.8k
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
284
50.8k
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
285
50.8k
  if (Info->isEntryFunction()) {
286
46.3k
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
287
46.3k
    return MFI.hasStackObjects() || MFI.hasCalls();
288
46.3k
  }
289
4.52k
290
4.52k
  // May need scavenger for dealing with callee saved registers.
291
4.52k
  return true;
292
4.52k
}
293
294
bool SIRegisterInfo::requiresFrameIndexScavenging(
295
25.4k
  const MachineFunction &MF) const {
296
25.4k
  const MachineFrameInfo &MFI = MF.getFrameInfo();
297
25.4k
  if (MFI.hasStackObjects())
298
968
    return true;
299
24.4k
300
24.4k
  // May need to deal with callee saved registers.
301
24.4k
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
302
24.4k
  return !Info->isEntryFunction();
303
24.4k
}
304
305
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
306
983
  const MachineFunction &MF) const {
307
983
  const MachineFrameInfo &MFI = MF.getFrameInfo();
308
983
  if (!MFI.hasStackObjects())
309
0
    return false;
310
983
311
983
  // The scavenger is used for large frames which may require finding a free
312
983
  // register for large offsets.
313
983
  if (!isUInt<12>(MFI.getStackSize()))
314
74
    return true;
315
909
316
909
  // If using scalar stores, for spills, m0 is needed for the scalar store
317
909
  // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual
318
909
  // register for it during frame index elimination, so the scavenger is
319
909
  // directly needed.
320
909
  return MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
321
909
         MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
322
909
}
323
324
bool SIRegisterInfo::requiresVirtualBaseRegisters(
325
25.4k
  const MachineFunction &) const {
326
25.4k
  // There are no special dedicated stack or frame pointers.
327
25.4k
  return true;
328
25.4k
}
329
330
50.9k
bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
331
50.9k
  // This helps catch bugs as verifier errors.
332
50.9k
  return true;
333
50.9k
}
334
335
5.78k
int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
336
5.78k
  assert(SIInstrInfo::isMUBUF(*MI));
337
5.78k
338
5.78k
  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
339
5.78k
                                          AMDGPU::OpName::offset);
340
5.78k
  return MI->getOperand(OffIdx).getImm();
341
5.78k
}
342
343
int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
344
4
                                                 int Idx) const {
345
4
  if (!SIInstrInfo::isMUBUF(*MI))
346
0
    return 0;
347
4
348
4
  assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
349
4
                                           AMDGPU::OpName::vaddr) &&
350
4
         "Should never see frame index on non-address operand");
351
4
352
4
  return getMUBUFInstrOffset(MI);
353
4
}
354
355
6.18k
bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
356
6.18k
  if (!MI->mayLoadOrStore())
357
404
    return false;
358
5.77k
359
5.77k
  int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);
360
5.77k
361
5.77k
  return !isUInt<12>(FullOffset);
362
5.77k
}
363
364
void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
365
                                                  unsigned BaseReg,
366
                                                  int FrameIdx,
367
0
                                                  int64_t Offset) const {
368
0
  MachineBasicBlock::iterator Ins = MBB->begin();
369
0
  DebugLoc DL; // Defaults to "unknown"
370
0
371
0
  if (Ins != MBB->end())
372
0
    DL = Ins->getDebugLoc();
373
0
374
0
  MachineFunction *MF = MBB->getParent();
375
0
  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
376
0
  const SIInstrInfo *TII = Subtarget.getInstrInfo();
377
0
378
0
  if (Offset == 0) {
379
0
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
380
0
      .addFrameIndex(FrameIdx);
381
0
    return;
382
0
  }
383
0
384
0
  MachineRegisterInfo &MRI = MF->getRegInfo();
385
0
  unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
386
0
387
0
  unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
388
0
389
0
  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
390
0
    .addImm(Offset);
391
0
  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
392
0
    .addFrameIndex(FrameIdx);
393
0
394
0
  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
395
0
    .addReg(OffsetReg, RegState::Kill)
396
0
    .addReg(FIReg)
397
0
    .addImm(0); // clamp bit
398
0
}
399
400
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
401
0
                                       int64_t Offset) const {
402
0
403
0
  MachineBasicBlock *MBB = MI.getParent();
404
0
  MachineFunction *MF = MBB->getParent();
405
0
  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
406
0
  const SIInstrInfo *TII = Subtarget.getInstrInfo();
407
0
408
#ifndef NDEBUG
409
  // FIXME: Is it possible to be storing a frame index to itself?
410
  bool SeenFI = false;
411
  for (const MachineOperand &MO: MI.operands()) {
412
    if (MO.isFI()) {
413
      if (SeenFI)
414
        llvm_unreachable("should not see multiple frame indices");
415
416
      SeenFI = true;
417
    }
418
  }
419
#endif
420
421
0
  MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
422
0
  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
423
0
  assert(TII->isMUBUF(MI));
424
0
  assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
425
0
         MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
426
0
         "should only be seeing frame offset relative FrameIndex");
427
0
428
0
429
0
  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
430
0
  int64_t NewOffset = OffsetOp->getImm() + Offset;
431
0
  assert(isUInt<12>(NewOffset) && "offset should be legal");
432
0
433
0
  FIOp->ChangeToRegister(BaseReg, false);
434
0
  OffsetOp->setImm(NewOffset);
435
0
}
436
437
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
438
                                        unsigned BaseReg,
439
0
                                        int64_t Offset) const {
440
0
  if (!SIInstrInfo::isMUBUF(*MI))
441
0
    return false;
442
0
443
0
  int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);
444
0
445
0
  return isUInt<12>(NewOffset);
446
0
}
447
448
const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
449
0
  const MachineFunction &MF, unsigned Kind) const {
450
0
  // This is inaccurate. It depends on the instruction and address space. The
451
0
  // only place where we should hit this is for dealing with frame indexes /
452
0
  // private accesses, so this is correct in that case.
453
0
  return &AMDGPU::VGPR_32RegClass;
454
0
}
455
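// Returns the number of 32-bit subregisters covered by an SI_SPILL_*
// save/restore pseudo opcode.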
456
1.40k
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
457
1.40k
458
1.40k
  switch (Op) {
459
1.40k
  case AMDGPU::SI_SPILL_S1024_SAVE:
460
9
  case AMDGPU::SI_SPILL_S1024_RESTORE:
461
9
  case AMDGPU::SI_SPILL_V1024_SAVE:
462
9
  case AMDGPU::SI_SPILL_V1024_RESTORE:
463
9
  case AMDGPU::SI_SPILL_A1024_SAVE:
464
9
  case AMDGPU::SI_SPILL_A1024_RESTORE:
465
9
    return 32;
466
11
  case AMDGPU::SI_SPILL_S512_SAVE:
467
11
  case AMDGPU::SI_SPILL_S512_RESTORE:
468
11
  case AMDGPU::SI_SPILL_V512_SAVE:
469
11
  case AMDGPU::SI_SPILL_V512_RESTORE:
470
11
  case AMDGPU::SI_SPILL_A512_SAVE:
471
11
  case AMDGPU::SI_SPILL_A512_RESTORE:
472
11
    return 16;
473
11
  case AMDGPU::SI_SPILL_S256_SAVE:
474
0
  case AMDGPU::SI_SPILL_S256_RESTORE:
475
0
  case AMDGPU::SI_SPILL_V256_SAVE:
476
0
  case AMDGPU::SI_SPILL_V256_RESTORE:
477
0
    return 8;
478
0
  case AMDGPU::SI_SPILL_S160_SAVE:
479
0
  case AMDGPU::SI_SPILL_S160_RESTORE:
480
0
  case AMDGPU::SI_SPILL_V160_SAVE:
481
0
  case AMDGPU::SI_SPILL_V160_RESTORE:
482
0
    return 5;
483
690
  case AMDGPU::SI_SPILL_S128_SAVE:
484
690
  case AMDGPU::SI_SPILL_S128_RESTORE:
485
690
  case AMDGPU::SI_SPILL_V128_SAVE:
486
690
  case AMDGPU::SI_SPILL_V128_RESTORE:
487
690
  case AMDGPU::SI_SPILL_A128_SAVE:
488
690
  case AMDGPU::SI_SPILL_A128_RESTORE:
489
690
    return 4;
490
690
  case AMDGPU::SI_SPILL_S96_SAVE:
491
1
  case AMDGPU::SI_SPILL_S96_RESTORE:
492
1
  case AMDGPU::SI_SPILL_V96_SAVE:
493
1
  case AMDGPU::SI_SPILL_V96_RESTORE:
494
1
    return 3;
495
41
  case AMDGPU::SI_SPILL_S64_SAVE:
496
41
  case AMDGPU::SI_SPILL_S64_RESTORE:
497
41
  case AMDGPU::SI_SPILL_V64_SAVE:
498
41
  case AMDGPU::SI_SPILL_V64_RESTORE:
499
41
  case AMDGPU::SI_SPILL_A64_SAVE:
500
41
  case AMDGPU::SI_SPILL_A64_RESTORE:
501
41
    return 2;
502
657
  case AMDGPU::SI_SPILL_S32_SAVE:
503
657
  case AMDGPU::SI_SPILL_S32_RESTORE:
504
657
  case AMDGPU::SI_SPILL_V32_SAVE:
505
657
  case AMDGPU::SI_SPILL_V32_RESTORE:
506
657
  case AMDGPU::SI_SPILL_A32_SAVE:
507
657
  case AMDGPU::SI_SPILL_A32_RESTORE:
508
657
    return 1;
509
657
  default: llvm_unreachable("Invalid spill opcode");
510
1.40k
  }
511
1.40k
}
512
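// Map an OFFEN (register-offset) MUBUF store opcode to its OFFSET
// (immediate-offset) form, or return -1 if there is no equivalent.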
513
4.41k
static int getOffsetMUBUFStore(unsigned Opc) {
514
4.41k
  switch (Opc) {
515
4.41k
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
516
4.28k
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
517
4.41k
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
518
75
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
519
4.41k
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
520
12
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
521
4.41k
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
522
20
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
523
4.41k
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
524
18
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
525
4.41k
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
526
3
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
527
4.41k
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
528
1
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
529
4.41k
  default:
530
0
    return -1;
531
4.41k
  }
532
4.41k
}
533
534
1.82k
static int getOffsetMUBUFLoad(unsigned Opc) {
535
1.82k
  switch (Opc) {
536
1.82k
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
537
1.68k
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
538
1.82k
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
539
68
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
540
1.82k
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
541
14
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
542
1.82k
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
543
30
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
544
1.82k
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
545
2
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
546
1.82k
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
547
0
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
548
1.82k
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
549
8
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
550
1.82k
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
551
3
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
552
1.82k
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
553
3
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
554
1.82k
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
555
3
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
556
1.82k
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
557
3
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
558
1.82k
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
559
3
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
560
1.82k
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
561
5
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
562
1.82k
  default:
563
0
    return -1;
564
1.82k
  }
565
1.82k
}
566
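// Try to spill/restore a single 32-bit lane directly to/from the AGPR (or
// VGPR) assigned to (Index, Lane) instead of going through scratch memory.
// Returns a null MachineInstrBuilder if no register was assigned.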
567
static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
568
                                           int Index,
569
                                           unsigned Lane,
570
                                           unsigned ValueReg,
571
14.1k
                                           bool IsKill) {
572
14.1k
  MachineBasicBlock *MBB = MI->getParent();
573
14.1k
  MachineFunction *MF = MI->getParent()->getParent();
574
14.1k
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
575
14.1k
  const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
576
14.1k
  const SIInstrInfo *TII = ST.getInstrInfo();
577
14.1k
578
14.1k
  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
579
14.1k
580
14.1k
  if (Reg == AMDGPU::NoRegister)
581
13.4k
    return MachineInstrBuilder();
582
695
583
695
  bool IsStore = MI->mayStore();
584
695
  MachineRegisterInfo &MRI = MF->getRegInfo();
585
695
  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
586
695
587
695
  unsigned Dst = IsStore ? Reg : ValueReg;
588
695
  unsigned Src = IsStore ? ValueReg : Reg;
589
695
  unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
590
695
                                                   : AMDGPU::V_ACCVGPR_READ_B32;
591
695
592
695
  return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
593
695
           .addReg(Src, getKillRegState(IsKill));
594
695
}
595
596
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
597
// need to handle the case where an SGPR may need to be spilled while spilling.
598
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
599
                                      MachineFrameInfo &MFI,
600
                                      MachineBasicBlock::iterator MI,
601
                                      int Index,
602
6.23k
                                      int64_t Offset) {
603
6.23k
  MachineBasicBlock *MBB = MI->getParent();
604
6.23k
  const DebugLoc &DL = MI->getDebugLoc();
605
6.23k
  bool IsStore = MI->mayStore();
606
6.23k
607
6.23k
  unsigned Opc = MI->getOpcode();
608
6.23k
  int LoadStoreOp = IsStore ?
609
4.41k
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
610
6.23k
  if (LoadStoreOp == -1)
611
0
    return false;
612
6.23k
613
6.23k
  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
614
6.23k
  if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
615
0
    return true;
616
6.23k
617
6.23k
  MachineInstrBuilder NewMI =
618
6.23k
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
619
6.23k
          .add(*Reg)
620
6.23k
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
621
6.23k
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
622
6.23k
          .addImm(Offset)
623
6.23k
          .addImm(0) // glc
624
6.23k
          .addImm(0) // slc
625
6.23k
          .addImm(0) // tfe
626
6.23k
          .addImm(0) // dlc
627
6.23k
          .cloneMemRefs(*MI);
628
6.23k
629
6.23k
  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
630
6.23k
                                                       AMDGPU::OpName::vdata_in);
631
6.23k
  if (VDataIn)
632
20
    NewMI.add(*VDataIn);
633
6.23k
  return true;
634
6.23k
}
635
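// Expand a VGPR/AGPR spill or restore into one buffer load/store per 32-bit
// subregister, scavenging an SGPR for the scratch offset when the immediate
// offset does not fit in 12 bits.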
636
void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
637
                                         unsigned LoadStoreOp,
638
                                         int Index,
639
                                         unsigned ValueReg,
640
                                         bool IsKill,
641
                                         unsigned ScratchRsrcReg,
642
                                         unsigned ScratchOffsetReg,
643
                                         int64_t InstOffset,
644
                                         MachineMemOperand *MMO,
645
2.77k
                                         RegScavenger *RS) const {
646
2.77k
  MachineBasicBlock *MBB = MI->getParent();
647
2.77k
  MachineFunction *MF = MI->getParent()->getParent();
648
2.77k
  const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
649
2.77k
  const SIInstrInfo *TII = ST.getInstrInfo();
650
2.77k
  const MachineFrameInfo &MFI = MF->getFrameInfo();
651
2.77k
652
2.77k
  const MCInstrDesc &Desc = TII->get(LoadStoreOp);
653
2.77k
  const DebugLoc &DL = MI->getDebugLoc();
654
2.77k
  bool IsStore = Desc.mayStore();
655
2.77k
656
2.77k
  bool Scavenged = false;
657
2.77k
  unsigned SOffset = ScratchOffsetReg;
658
2.77k
659
2.77k
  const unsigned EltSize = 4;
660
2.77k
  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
661
2.77k
  unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
662
2.77k
  unsigned Size = NumSubRegs * EltSize;
663
2.77k
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
664
2.77k
  int64_t ScratchOffsetRegDelta = 0;
665
2.77k
666
2.77k
  unsigned Align = MFI.getObjectAlignment(Index);
667
2.77k
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
668
2.77k
669
2.77k
  Register TmpReg =
670
2.77k
    hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg()
671
2.77k
                 : Register();
672
2.77k
673
2.77k
  assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
674
2.77k
675
2.77k
  if (!isUInt<12>(Offset + Size - EltSize)) {
676
246
    SOffset = AMDGPU::NoRegister;
677
246
678
246
    // We currently only support spilling VGPRs to EltSize boundaries, meaning
679
246
    // we can simplify the adjustment of Offset here to just scale with
680
246
    // WavefrontSize.
681
246
    Offset *= ST.getWavefrontSize();
682
246
683
246
    // We don't have access to the register scavenger if this function is called
684
246
    // during  PEI::scavengeFrameVirtualRegs().
685
246
    if (RS)
686
246
      SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
687
246
688
246
    if (SOffset == AMDGPU::NoRegister) {
689
3
      // There are no free SGPRs, and since we are in the process of spilling
690
3
      // VGPRs too.  Since we need a VGPR in order to spill SGPRs (this is true
691
3
      // on SI/CI and on VI it is true until we implement spilling using scalar
692
3
      // stores), we have no way to free up an SGPR.  Our solution here is to
693
3
      // add the offset directly to the ScratchOffset register, and then
694
3
      // subtract the offset after the spill to return ScratchOffset to it's
695
3
      // original value.
696
3
      SOffset = ScratchOffsetReg;
697
3
      ScratchOffsetRegDelta = Offset;
698
243
    } else {
699
243
      Scavenged = true;
700
243
    }
701
246
702
246
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
703
246
      .addReg(ScratchOffsetReg)
704
246
      .addImm(Offset);
705
246
706
246
    Offset = 0;
707
246
  }
708
2.77k
709
10.6k
  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
710
7.91k
    unsigned SubReg = NumSubRegs == 1 ?
711
6.65k
      ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
712
7.91k
713
7.91k
    unsigned SOffsetRegState = 0;
714
7.91k
    unsigned SrcDstRegState = getDefRegState(!IsStore);
715
7.91k
    if (i + 1 == e) {
716
2.77k
      SOffsetRegState |= getKillRegState(Scavenged);
717
2.77k
      // The last implicit use carries the "Kill" flag.
718
2.77k
      SrcDstRegState |= getKillRegState(IsKill);
719
2.77k
    }
720
7.91k
721
7.91k
    auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);
722
7.91k
723
7.91k
    if (!MIB.getInstr()) {
724
7.21k
      unsigned FinalReg = SubReg;
725
7.21k
      if (TmpReg != AMDGPU::NoRegister) {
726
44
        if (IsStore)
727
22
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
728
22
            .addReg(SubReg, getKillRegState(IsKill));
729
44
        SubReg = TmpReg;
730
44
      }
731
7.21k
732
7.21k
      MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
733
7.21k
      MachineMemOperand *NewMMO
734
7.21k
        = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
735
7.21k
                                   EltSize, MinAlign(Align, EltSize * i));
736
7.21k
737
7.21k
      MIB = BuildMI(*MBB, MI, DL, Desc)
738
7.21k
        .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
739
7.21k
        .addReg(ScratchRsrcReg)
740
7.21k
        .addReg(SOffset, SOffsetRegState)
741
7.21k
        .addImm(Offset)
742
7.21k
        .addImm(0) // glc
743
7.21k
        .addImm(0) // slc
744
7.21k
        .addImm(0) // tfe
745
7.21k
        .addImm(0) // dlc
746
7.21k
        .addMemOperand(NewMMO);
747
7.21k
748
7.21k
      if (!IsStore && TmpReg != AMDGPU::NoRegister)
749
22
        MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
750
22
                      FinalReg)
751
22
          .addReg(TmpReg, RegState::Kill);
752
7.21k
    }
753
7.91k
754
7.91k
    if (NumSubRegs > 1)
755
6.65k
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
756
7.91k
  }
757
2.77k
758
2.77k
  if (ScratchOffsetRegDelta != 0) {
759
3
    // Subtract the offset we added to the ScratchOffset register.
760
3
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
761
3
        .addReg(ScratchOffsetReg)
762
3
        .addImm(ScratchOffsetRegDelta);
763
3
  }
764
2.77k
}
765
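// For SGPR spills through scalar buffer instructions, pick the widest element
// that evenly divides the register: returns {element size in bytes, opcode}.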
766
static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
767
32
                                                     bool Store) {
768
32
  if (SuperRegSize % 16 == 0) {
769
6
    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
770
6
                         AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
771
6
  }
772
26
773
26
  if (SuperRegSize % 8 == 0) {
774
16
    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
775
16
                        AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
776
16
  }
777
10
778
10
  return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
779
10
                      AMDGPU::S_BUFFER_LOAD_DWORD_SGPR };
780
10
}
781
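// Lower an SI_SPILL_S*_SAVE pseudo: each 32-bit piece of the SGPR is written
// either to a lane of a reserved VGPR, to scratch via scalar stores (SMEM),
// or to a stack slot through a temporary VGPR.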
782
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
783
                               int Index,
784
                               RegScavenger *RS,
785
878
                               bool OnlyToVGPR) const {
786
878
  MachineBasicBlock *MBB = MI->getParent();
787
878
  MachineFunction *MF = MBB->getParent();
788
878
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
789
878
  DenseSet<unsigned> SGPRSpillVGPRDefinedSet;
790
878
791
878
  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
792
878
    = MFI->getSGPRToVGPRSpills(Index);
793
878
  bool SpillToVGPR = !VGPRSpills.empty();
794
878
  if (OnlyToVGPR && !SpillToVGPR)
795
0
    return false;
796
878
797
878
  MachineRegisterInfo &MRI = MF->getRegInfo();
798
878
  const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
799
878
  const SIInstrInfo *TII = ST.getInstrInfo();
800
878
801
878
  unsigned SuperReg = MI->getOperand(0).getReg();
802
878
  bool IsKill = MI->getOperand(0).isKill();
803
878
  const DebugLoc &DL = MI->getDebugLoc();
804
878
805
878
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
806
878
807
878
  bool SpillToSMEM = spillSGPRToSMEM();
808
878
  if (SpillToSMEM && OnlyToVGPR)
809
0
    return false;
810
878
811
878
  Register FrameReg = getFrameRegister(*MF);
812
878
813
878
  assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
814
878
                         SuperReg != MFI->getFrameOffsetReg() &&
815
878
                         SuperReg != MFI->getScratchWaveOffsetReg()));
816
878
817
878
  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
818
878
819
878
  unsigned OffsetReg = AMDGPU::M0;
820
878
  unsigned M0CopyReg = AMDGPU::NoRegister;
821
878
822
878
  if (SpillToSMEM) {
823
16
    if (RS->isRegUsed(AMDGPU::M0)) {
824
16
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
825
16
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
826
16
        .addReg(AMDGPU::M0);
827
16
    }
828
16
  }
829
878
830
878
  unsigned ScalarStoreOp;
831
878
  unsigned EltSize = 4;
832
878
  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
833
878
  if (SpillToSMEM && isSGPRClass(RC)) {
834
16
    // XXX - if private_element_size is larger than 4 it might be useful to be
835
16
    // able to spill wider vmem spills.
836
16
    std::tie(EltSize, ScalarStoreOp) =
837
16
          getSpillEltSize(getRegSizeInBits(*RC) / 8, true);
838
16
  }
839
878
840
878
  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
841
878
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
842
878
843
878
  // SubReg carries the "Kill" flag when SubReg == SuperReg.
844
878
  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
845
2.59k
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
846
1.72k
    unsigned SubReg = NumSubRegs == 1 ?
847
1.20k
      SuperReg : getSubReg(SuperReg, SplitParts[i]);
848
1.72k
849
1.72k
    if (SpillToSMEM) {
850
23
      int64_t FrOffset = FrameInfo.getObjectOffset(Index);
851
23
852
23
      // The allocated memory size is really the wavefront size * the frame
853
23
      // index size. The widest register class is 64 bytes, so a 4-byte scratch
854
23
      // allocation is enough to spill this in a single stack object.
855
23
      //
856
23
      // FIXME: Frame size/offsets are computed earlier than this, so the extra
857
23
      // space is still unnecessarily allocated.
858
23
859
23
      unsigned Align = FrameInfo.getObjectAlignment(Index);
860
23
      MachinePointerInfo PtrInfo
861
23
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
862
23
      MachineMemOperand *MMO
863
23
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
864
23
                                   EltSize, MinAlign(Align, EltSize * i));
865
23
866
23
      // SMEM instructions only support a single offset, so increment the wave
867
23
      // offset.
868
23
869
23
      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
870
23
      if (Offset != 0) {
871
23
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
872
23
          .addReg(FrameReg)
873
23
          .addImm(Offset);
874
23
      } else {
875
0
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
876
0
          .addReg(FrameReg);
877
0
      }
878
23
879
23
      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
880
23
        .addReg(SubReg, getKillRegState(IsKill)) // sdata
881
23
        .addReg(MFI->getScratchRSrcReg())        // sbase
882
23
        .addReg(OffsetReg, RegState::Kill)       // soff
883
23
        .addImm(0)                               // glc
884
23
        .addImm(0)                               // dlc
885
23
        .addMemOperand(MMO);
886
23
887
23
      continue;
888
23
    }
889
1.69k
890
1.69k
    if (SpillToVGPR) {
891
1.62k
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
892
1.62k
893
1.62k
      // During SGPR spilling to VGPR, determine if the VGPR is defined. The
894
1.62k
      // only circumstance in which we say it is undefined is when it is the
895
1.62k
      // first spill to this VGPR in the first basic block.
896
1.62k
      bool VGPRDefined = true;
897
1.62k
      if (MBB == &MF->front())
898
1.54k
        VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;
899
1.62k
900
1.62k
      // Mark the "old value of vgpr" input undef only if this is the first sgpr
901
1.62k
      // spill to this specific vgpr in the first basic block.
902
1.62k
      BuildMI(*MBB, MI, DL,
903
1.62k
              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
904
1.62k
              Spill.VGPR)
905
1.62k
        .addReg(SubReg, getKillRegState(IsKill))
906
1.62k
        .addImm(Spill.Lane)
907
1.62k
        .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
908
1.62k
909
1.62k
      // FIXME: Since this spills to another register instead of an actual
910
1.62k
      // frame index, we should delete the frame index when all references to
911
1.62k
      // it are fixed.
912
1.62k
    } else {
913
76
      // XXX - Can the spill to VGPR fail for some subregisters but not others?
914
76
      if (OnlyToVGPR)
915
0
        return false;
916
76
917
76
      // Spill SGPR to a frame index.
918
76
      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
919
76
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
920
76
      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
921
76
922
76
      MachineInstrBuilder Mov
923
76
        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
924
76
        .addReg(SubReg, SubKillState);
925
76
926
76
927
76
      // There could be undef components of a spilled super register.
928
76
      // TODO: Can we detect this and skip the spill?
929
76
      if (NumSubRegs > 1) {
930
70
        // The last implicit use of the SuperReg carries the "Kill" flag.
931
70
        unsigned SuperKillState = 0;
932
70
        if (i + 1 == e)
933
28
          SuperKillState |= getKillRegState(IsKill);
934
70
        Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
935
70
      }
936
76
937
76
      unsigned Align = FrameInfo.getObjectAlignment(Index);
938
76
      MachinePointerInfo PtrInfo
939
76
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
940
76
      MachineMemOperand *MMO
941
76
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
942
76
                                   EltSize, MinAlign(Align, EltSize * i));
943
76
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
944
76
        .addReg(TmpReg, RegState::Kill)       // src
945
76
        .addFrameIndex(Index)                 // vaddr
946
76
        .addReg(MFI->getScratchRSrcReg())     // srsrc
947
76
        .addReg(MFI->getStackPtrOffsetReg())  // soffset
948
76
        .addImm(i * 4)                        // offset
949
76
        .addMemOperand(MMO);
950
76
    }
951
1.69k
  }
952
878
953
878
  if (M0CopyReg != AMDGPU::NoRegister) {
954
16
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
955
16
      .addReg(M0CopyReg, RegState::Kill);
956
16
  }
957
878
958
878
  MI->eraseFromParent();
959
878
  MFI->addToSpilledSGPRs(NumSubRegs);
960
878
  return true;
961
878
}
962
963
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
964
                                 int Index,
965
                                 RegScavenger *RS,
966
873
                                 bool OnlyToVGPR) const {
967
873
  MachineFunction *MF = MI->getParent()->getParent();
968
873
  MachineRegisterInfo &MRI = MF->getRegInfo();
969
873
  MachineBasicBlock *MBB = MI->getParent();
970
873
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
971
873
972
873
  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
973
873
    = MFI->getSGPRToVGPRSpills(Index);
974
873
  bool SpillToVGPR = !VGPRSpills.empty();
975
873
  if (OnlyToVGPR && !SpillToVGPR)
976
0
    return false;
977
873
978
873
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
979
873
  const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
980
873
  const SIInstrInfo *TII = ST.getInstrInfo();
981
873
  const DebugLoc &DL = MI->getDebugLoc();
982
873
983
873
  unsigned SuperReg = MI->getOperand(0).getReg();
984
873
  bool SpillToSMEM = spillSGPRToSMEM();
985
873
  if (SpillToSMEM && OnlyToVGPR)
986
0
    return false;
987
873
988
873
  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
989
873
990
873
  unsigned OffsetReg = AMDGPU::M0;
991
873
  unsigned M0CopyReg = AMDGPU::NoRegister;
992
873
993
873
  if (SpillToSMEM) {
994
16
    if (RS->isRegUsed(AMDGPU::M0)) {
995
16
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
996
16
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
997
16
        .addReg(AMDGPU::M0);
998
16
    }
999
16
  }
1000
873
1001
873
  unsigned EltSize = 4;
1002
873
  unsigned ScalarLoadOp;
1003
873
1004
873
  Register FrameReg = getFrameRegister(*MF);
1005
873
1006
873
  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
1007
873
  if (SpillToSMEM && isSGPRClass(RC)) {
1008
16
    // XXX - if private_element_size is larger than 4 it might be useful to be
1009
16
    // able to spill wider vmem spills.
1010
16
    std::tie(EltSize, ScalarLoadOp) =
1011
16
          getSpillEltSize(getRegSizeInBits(*RC) / 8, false);
1012
16
  }
1013
873
1014
873
  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
1015
873
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
1016
873
1017
873
  // SubReg carries the "Kill" flag when SubReg == SuperReg.
1018
873
  int64_t FrOffset = FrameInfo.getObjectOffset(Index);
1019
873
1020
2.59k
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
1021
1.71k
    unsigned SubReg = NumSubRegs == 1 ?
1022
1.20k
      SuperReg : getSubReg(SuperReg, SplitParts[i]);
1023
1.71k
1024
1.71k
    if (SpillToSMEM) {
1025
23
      // FIXME: Size may be > 4 but extra bytes wasted.
1026
23
      unsigned Align = FrameInfo.getObjectAlignment(Index);
1027
23
      MachinePointerInfo PtrInfo
1028
23
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
1029
23
      MachineMemOperand *MMO
1030
23
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
1031
23
                                   EltSize, MinAlign(Align, EltSize * i));
1032
23
1033
23
      // Add i * 4 offset
1034
23
      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
1035
23
      if (Offset != 0) {
1036
23
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
1037
23
          .addReg(FrameReg)
1038
23
          .addImm(Offset);
1039
23
      } else {
1040
0
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
1041
0
          .addReg(FrameReg);
1042
0
      }
1043
23
1044
23
      auto MIB =
1045
23
        BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
1046
23
        .addReg(MFI->getScratchRSrcReg())  // sbase
1047
23
        .addReg(OffsetReg, RegState::Kill) // soff
1048
23
        .addImm(0)                         // glc
1049
23
        .addImm(0)                         // dlc
1050
23
        .addMemOperand(MMO);
1051
23
1052
23
      if (NumSubRegs > 1 && i == 0)
1053
3
        MIB.addReg(SuperReg, RegState::ImplicitDefine);
1054
23
1055
23
      continue;
1056
23
    }
1057
1.69k
1058
1.69k
    if (SpillToVGPR) {
1059
1.61k
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
1060
1.61k
      auto MIB =
1061
1.61k
        BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
1062
1.61k
                SubReg)
1063
1.61k
        .addReg(Spill.VGPR)
1064
1.61k
        .addImm(Spill.Lane);
1065
1.61k
1066
1.61k
      if (NumSubRegs > 1 && i == 0)
1067
326
        MIB.addReg(SuperReg, RegState::ImplicitDefine);
1068
1.61k
    } else {
1069
78
      if (OnlyToVGPR)
1070
0
        return false;
1071
78
1072
78
      // Restore SGPR from a stack slot.
1073
78
      // FIXME: We should use S_LOAD_DWORD here for VI.
1074
78
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1075
78
      unsigned Align = FrameInfo.getObjectAlignment(Index);
1076
78
1077
78
      MachinePointerInfo PtrInfo
1078
78
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
1079
78
1080
78
      MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
1081
78
        MachineMemOperand::MOLoad, EltSize,
1082
78
        MinAlign(Align, EltSize * i));
1083
78
1084
78
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
1085
78
        .addFrameIndex(Index)                 // vaddr
1086
78
        .addReg(MFI->getScratchRSrcReg())     // srsrc
1087
78
        .addReg(MFI->getStackPtrOffsetReg())  // soffset
1088
78
        .addImm(i * 4)                        // offset
1089
78
        .addMemOperand(MMO);
1090
78
1091
78
      auto MIB =
1092
78
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
1093
78
        .addReg(TmpReg, RegState::Kill);
1094
78
1095
78
      if (NumSubRegs > 1)
1096
72
        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
1097
78
    }
1098
1.69k
  }
1099
873
1100
873
  if (M0CopyReg != AMDGPU::NoRegister) {
1101
16
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
1102
16
      .addReg(M0CopyReg, RegState::Kill);
1103
16
  }
1104
873
1105
873
  MI->eraseFromParent();
1106
873
  return true;
1107
873
}
1108
1109
/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
1110
/// a VGPR and the stack slot can be safely eliminated when all other users are
1111
/// handled.
1112
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
1113
  MachineBasicBlock::iterator MI,
1114
  int FI,
1115
1.65k
  RegScavenger *RS) const {
1116
1.65k
  switch (MI->getOpcode()) {
1117
1.65k
  case AMDGPU::SI_SPILL_S1024_SAVE:
1118
828
  case AMDGPU::SI_SPILL_S512_SAVE:
1119
828
  case AMDGPU::SI_SPILL_S256_SAVE:
1120
828
  case AMDGPU::SI_SPILL_S160_SAVE:
1121
828
  case AMDGPU::SI_SPILL_S128_SAVE:
1122
828
  case AMDGPU::SI_SPILL_S96_SAVE:
1123
828
  case AMDGPU::SI_SPILL_S64_SAVE:
1124
828
  case AMDGPU::SI_SPILL_S32_SAVE:
1125
828
    return spillSGPR(MI, FI, RS, true);
1126
828
  case AMDGPU::SI_SPILL_S1024_RESTORE:
1127
822
  case AMDGPU::SI_SPILL_S512_RESTORE:
1128
822
  case AMDGPU::SI_SPILL_S256_RESTORE:
1129
822
  case AMDGPU::SI_SPILL_S160_RESTORE:
1130
822
  case AMDGPU::SI_SPILL_S128_RESTORE:
1131
822
  case AMDGPU::SI_SPILL_S96_RESTORE:
1132
822
  case AMDGPU::SI_SPILL_S64_RESTORE:
1133
822
  case AMDGPU::SI_SPILL_S32_RESTORE:
1134
822
    return restoreSGPR(MI, FI, RS, true);
1135
822
  default:
1136
0
    llvm_unreachable("not an SGPR spill instruction");
1137
1.65k
  }
1138
1.65k
}
1139
1140
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
1141
                                        int SPAdj, unsigned FIOperandNum,
1142
9.53k
                                        RegScavenger *RS) const {
1143
9.53k
  MachineFunction *MF = MI->getParent()->getParent();
1144
9.53k
  MachineRegisterInfo &MRI = MF->getRegInfo();
1145
9.53k
  MachineBasicBlock *MBB = MI->getParent();
1146
9.53k
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1147
9.53k
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1148
9.53k
  const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
1149
9.53k
  const SIInstrInfo *TII = ST.getInstrInfo();
1150
9.53k
  DebugLoc DL = MI->getDebugLoc();
1151
9.53k
1152
9.53k
  assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
1153
9.53k
1154
9.53k
  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
1155
9.53k
  int Index = MI->getOperand(FIOperandNum).getIndex();
1156
9.53k
1157
9.53k
  Register FrameReg = getFrameRegister(*MF);
1158
9.53k
1159
9.53k
  switch (MI->getOpcode()) {
1160
9.53k
    // SGPR register spill
1161
9.53k
    case AMDGPU::SI_SPILL_S1024_SAVE:
1162
50
    case AMDGPU::SI_SPILL_S512_SAVE:
1163
50
    case AMDGPU::SI_SPILL_S256_SAVE:
1164
50
    case AMDGPU::SI_SPILL_S160_SAVE:
1165
50
    case AMDGPU::SI_SPILL_S128_SAVE:
1166
50
    case AMDGPU::SI_SPILL_S96_SAVE:
1167
50
    case AMDGPU::SI_SPILL_S64_SAVE:
1168
50
    case AMDGPU::SI_SPILL_S32_SAVE: {
1169
50
      spillSGPR(MI, Index, RS);
1170
50
      break;
1171
50
    }
1172
50
1173
50
    // SGPR register restore
1174
51
    case AMDGPU::SI_SPILL_S1024_RESTORE:
1175
51
    case AMDGPU::SI_SPILL_S512_RESTORE:
1176
51
    case AMDGPU::SI_SPILL_S256_RESTORE:
1177
51
    case AMDGPU::SI_SPILL_S160_RESTORE:
1178
51
    case AMDGPU::SI_SPILL_S128_RESTORE:
1179
51
    case AMDGPU::SI_SPILL_S96_RESTORE:
1180
51
    case AMDGPU::SI_SPILL_S64_RESTORE:
1181
51
    case AMDGPU::SI_SPILL_S32_RESTORE: {
1182
51
      restoreSGPR(MI, Index, RS);
1183
51
      break;
1184
51
    }
1185
51
1186
51
    // VGPR register spill
1187
1.40k
    case AMDGPU::SI_SPILL_V1024_SAVE:
1188
1.40k
    case AMDGPU::SI_SPILL_V512_SAVE:
1189
1.40k
    case AMDGPU::SI_SPILL_V256_SAVE:
1190
1.40k
    case AMDGPU::SI_SPILL_V160_SAVE:
1191
1.40k
    case AMDGPU::SI_SPILL_V128_SAVE:
1192
1.40k
    case AMDGPU::SI_SPILL_V96_SAVE:
1193
1.40k
    case AMDGPU::SI_SPILL_V64_SAVE:
1194
1.40k
    case AMDGPU::SI_SPILL_V32_SAVE:
1195
1.40k
    case AMDGPU::SI_SPILL_A1024_SAVE:
1196
1.40k
    case AMDGPU::SI_SPILL_A512_SAVE:
1197
1.40k
    case AMDGPU::SI_SPILL_A128_SAVE:
1198
1.40k
    case AMDGPU::SI_SPILL_A64_SAVE:
1199
1.40k
    case AMDGPU::SI_SPILL_A32_SAVE: {
1200
1.40k
      const MachineOperand *VData = TII->getNamedOperand(*MI,
1201
1.40k
                                                         AMDGPU::OpName::vdata);
1202
1.40k
      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1203
1.40k
             MFI->getStackPtrOffsetReg());
1204
1.40k
1205
1.40k
      buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
1206
1.40k
            Index,
1207
1.40k
            VData->getReg(), VData->isKill(),
1208
1.40k
            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
1209
1.40k
            FrameReg,
1210
1.40k
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1211
1.40k
            *MI->memoperands_begin(),
1212
1.40k
            RS);
1213
1.40k
      MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
1214
1.40k
      MI->eraseFromParent();
1215
1.40k
      break;
1216
1.40k
    }
1217
1.40k
    case AMDGPU::SI_SPILL_V32_RESTORE:
1218
1.36k
    case AMDGPU::SI_SPILL_V64_RESTORE:
1219
1.36k
    case AMDGPU::SI_SPILL_V96_RESTORE:
1220
1.36k
    case AMDGPU::SI_SPILL_V128_RESTORE:
1221
1.36k
    case AMDGPU::SI_SPILL_V160_RESTORE:
1222
1.36k
    case AMDGPU::SI_SPILL_V256_RESTORE:
1223
1.36k
    case AMDGPU::SI_SPILL_V512_RESTORE:
1224
1.36k
    case AMDGPU::SI_SPILL_V1024_RESTORE:
1225
1.36k
    case AMDGPU::SI_SPILL_A32_RESTORE:
1226
1.36k
    case AMDGPU::SI_SPILL_A64_RESTORE:
1227
1.36k
    case AMDGPU::SI_SPILL_A128_RESTORE:
1228
1.36k
    case AMDGPU::SI_SPILL_A512_RESTORE:
1229
1.36k
    case AMDGPU::SI_SPILL_A1024_RESTORE: {
1230
1.36k
      const MachineOperand *VData = TII->getNamedOperand(*MI,
1231
1.36k
                                                         AMDGPU::OpName::vdata);
1232
1.36k
      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1233
1.36k
             MFI->getStackPtrOffsetReg());
1234
1.36k
1235
1.36k
      buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
1236
1.36k
            Index,
1237
1.36k
            VData->getReg(), VData->isKill(),
1238
1.36k
            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
1239
1.36k
            FrameReg,
1240
1.36k
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1241
1.36k
            *MI->memoperands_begin(),
1242
1.36k
            RS);
1243
1.36k
      MI->eraseFromParent();
1244
1.36k
      break;
1245
1.36k
    }
1246
1.36k
1247
6.66k
    default: {
1248
6.66k
      const DebugLoc &DL = MI->getDebugLoc();
1249
6.66k
      bool IsMUBUF = TII->isMUBUF(*MI);
1250
6.66k
1251
6.66k
      if (!IsMUBUF && !MFI->isEntryFunction()) {
1252
52
        // Convert to an absolute stack address by finding the offset from the
1253
52
        // scratch wave base and scaling by the wave size.
1254
52
        //
1255
52
        // In an entry function/kernel the offset is already the absolute
1256
52
        // address relative to the frame register.
1257
52
1258
52
        unsigned DiffReg
1259
52
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
1260
52
1261
52
        bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
1262
52
        Register ResultReg = IsCopy ?
1263
48
          MI->getOperand(0).getReg() :
1264
52
          MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1265
52
1266
52
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
1267
52
          .addReg(FrameReg)
1268
52
          .addReg(MFI->getScratchWaveOffsetReg());
1269
52
1270
52
        int64_t Offset = FrameInfo.getObjectOffset(Index);
1271
52
        if (Offset == 0) {
1272
36
          // XXX - This never happens because of emergency scavenging slot at 0?
1273
36
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
1274
36
            .addImm(Log2_32(ST.getWavefrontSize()))
1275
36
            .addReg(DiffReg);
1276
36
        } else {
1277
16
          unsigned ScaledReg
1278
16
            = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1279
16
1280
16
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
1281
16
            .addImm(Log2_32(ST.getWavefrontSize()))
1282
16
            .addReg(DiffReg, RegState::Kill);
1283
16
1284
16
          // TODO: Fold if use instruction is another add of a constant.
1285
16
          if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
1286
12
            TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
1287
12
              .addImm(Offset)
1288
12
              .addReg(ScaledReg, RegState::Kill)
1289
12
              .addImm(0); // clamp bit
1290
12
          } else {
1291
4
            unsigned ConstOffsetReg
1292
4
              = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
1293
4
1294
4
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
1295
4
              .addImm(Offset);
1296
4
            TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
1297
4
              .addReg(ConstOffsetReg, RegState::Kill)
1298
4
              .addReg(ScaledReg, RegState::Kill)
1299
4
              .addImm(0); // clamp bit
1300
4
          }
1301
16
        }
1302
52
1303
52
        // Don't introduce an extra copy if we're just materializing in a mov.
1304
52
        if (IsCopy)
1305
48
          MI->eraseFromParent();
1306
4
        else
1307
4
          FIOp.ChangeToRegister(ResultReg, false, false, true);
1308
52
        return;
1309
52
      }
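Editorial note: for non-MUBUF users outside an entry function, the block above rewrites the frame index as ((FrameReg - ScratchWaveOffsetReg) >> log2(wavefront size)) + object offset. A minimal standalone sketch of that arithmetic, with plain integers standing in for the S_SUB_U32 / V_LSHRREV_B32 / add sequence; the values in main are hypothetical:

// Standalone model of the absolute-stack-address computation above.
// FrameReg and ScratchWaveOffsetReg hold byte offsets for the whole wave;
// shifting right by log2(wave size) converts to a per-lane byte offset.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint32_t log2u(uint32_t X) {
  assert(X && (X & (X - 1)) == 0 && "wave size must be a power of two");
  uint32_t L = 0;
  while (X >>= 1) ++L;
  return L;
}

static uint32_t absoluteStackAddress(uint32_t FrameReg,
                                     uint32_t ScratchWaveOffset,
                                     uint32_t WavefrontSize,
                                     int64_t ObjectOffset) {
  uint32_t Diff = FrameReg - ScratchWaveOffset;        // S_SUB_U32
  uint32_t Scaled = Diff >> log2u(WavefrontSize);      // V_LSHRREV_B32
  return Scaled + static_cast<uint32_t>(ObjectOffset); // add (no carry)
}

int main() {
  // Hypothetical values: wave64, a 16-byte object at the start of the frame.
  std::printf("%u\n", absoluteStackAddress(0x4000, 0x1000, 64, 16));
  return 0;
}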
1310
6.60k
1311
6.60k
      if (IsMUBUF) {
1312
6.25k
        // Disable offen so we don't need a 0 vgpr base.
1313
6.25k
        assert(static_cast<int>(FIOperandNum) ==
1314
6.25k
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1315
6.25k
                                          AMDGPU::OpName::vaddr));
1316
6.25k
1317
6.25k
        assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1318
6.25k
               MFI->getStackPtrOffsetReg());
1319
6.25k
1320
6.25k
        TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg);
1321
6.25k
1322
6.25k
        int64_t Offset = FrameInfo.getObjectOffset(Index);
1323
6.25k
        int64_t OldImm
1324
6.25k
          = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
1325
6.25k
        int64_t NewOffset = OldImm + Offset;
1326
6.25k
1327
6.25k
        if (isUInt<12>(NewOffset) &&
1328
6.25k
            buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
1329
6.23k
          MI->eraseFromParent();
1330
6.23k
          return;
1331
6.23k
        }
1332
370
      }
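Editorial note: for MUBUF users the frame index folds into the instruction's 12-bit unsigned immediate offset when the old immediate plus the object offset still fits; otherwise the code falls through and materializes the offset. A small standalone sketch of that legality check, with isUInt<12> re-expressed in plain C++ and hypothetical names:

// Standalone sketch of folding a frame-object offset into a MUBUF
// 12-bit unsigned immediate. Not LLVM API; illustrative only.
#include <cstdint>
#include <cstdio>

static bool isUInt12(int64_t V) { return V >= 0 && V < (int64_t{1} << 12); }

// Returns true when the combined offset can stay in the instruction.
static bool tryFoldMUBUFOffset(int64_t OldImm, int64_t ObjectOffset,
                               int64_t &NewOffset) {
  NewOffset = OldImm + ObjectOffset;
  return isUInt12(NewOffset);
}

int main() {
  int64_t New;
  std::printf("%d\n", tryFoldMUBUFOffset(8, 4000, New)); // 1: 4008 fits
  std::printf("%d\n", tryFoldMUBUFOffset(8, 5000, New)); // 0: 5008 > 4095
  return 0;
}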
1333
370
1334
370
      // If the offset is simply too big, don't convert to a scratch wave offset
1335
370
      // relative index.
1336
370
1337
370
      int64_t Offset = FrameInfo.getObjectOffset(Index);
1338
370
      FIOp.ChangeToImmediate(Offset);
1339
370
      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
1340
18
        unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1341
18
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
1342
18
          .addImm(Offset);
1343
18
        FIOp.ChangeToRegister(TmpReg, false, false, true);
1344
18
      }
1345
370
    }
1346
9.53k
  }
1347
9.53k
}
1348
1349
19.8M
StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
1350
19.8M
  return AMDGPUInstPrinter::getRegisterName(Reg);
1351
19.8M
}
1352
1353
// FIXME: This is very slow. It might be worth creating a map from physreg to
1354
// register class.
1355
8.50M
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
1356
8.50M
  assert(!TargetRegisterInfo::isVirtualRegister(Reg));
1357
8.50M
1358
8.50M
  static const TargetRegisterClass *const BaseClasses[] = {
1359
8.50M
    &AMDGPU::VGPR_32RegClass,
1360
8.50M
    &AMDGPU::SReg_32RegClass,
1361
8.50M
    &AMDGPU::AGPR_32RegClass,
1362
8.50M
    &AMDGPU::VReg_64RegClass,
1363
8.50M
    &AMDGPU::SReg_64RegClass,
1364
8.50M
    &AMDGPU::AReg_64RegClass,
1365
8.50M
    &AMDGPU::VReg_96RegClass,
1366
8.50M
    &AMDGPU::SReg_96RegClass,
1367
8.50M
    &AMDGPU::VReg_128RegClass,
1368
8.50M
    &AMDGPU::SReg_128RegClass,
1369
8.50M
    &AMDGPU::AReg_128RegClass,
1370
8.50M
    &AMDGPU::VReg_160RegClass,
1371
8.50M
    &AMDGPU::SReg_160RegClass,
1372
8.50M
    &AMDGPU::VReg_256RegClass,
1373
8.50M
    &AMDGPU::SReg_256RegClass,
1374
8.50M
    &AMDGPU::VReg_512RegClass,
1375
8.50M
    &AMDGPU::SReg_512RegClass,
1376
8.50M
    &AMDGPU::AReg_512RegClass,
1377
8.50M
    &AMDGPU::SReg_1024RegClass,
1378
8.50M
    &AMDGPU::VReg_1024RegClass,
1379
8.50M
    &AMDGPU::AReg_1024RegClass,
1380
8.50M
    &AMDGPU::SCC_CLASSRegClass,
1381
8.50M
    &AMDGPU::Pseudo_SReg_32RegClass,
1382
8.50M
    &AMDGPU::Pseudo_SReg_128RegClass,
1383
8.50M
  };
1384
8.50M
1385
33.0M
  for (const TargetRegisterClass *BaseClass : BaseClasses) {
1386
33.0M
    if (BaseClass->contains(Reg)) {
1387
8.50M
      return BaseClass;
1388
8.50M
    }
1389
33.0M
  }
1390
8.50M
  return nullptr;
1391
8.50M
}
1392
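Editorial note: the FIXME above getPhysRegClass says the linear scan over BaseClasses is slow and suggests a physreg-to-class map. A minimal standalone sketch of that suggestion, memoizing the first lookup in a hash map; this is generic C++ with illustrative types, not the backend's implementation, and it assumes a single fixed class table:

// Standalone sketch of the FIXME's suggestion: cache the register ->
// register-class answer so the linear scan runs once per register.
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

struct RegClass {
  std::string Name;
  std::vector<unsigned> Regs;   // registers contained in the class
  bool contains(unsigned R) const {
    for (unsigned X : Regs) if (X == R) return true;
    return false;
  }
};

static const RegClass *classOf(unsigned Reg,
                               const std::vector<RegClass> &BaseClasses) {
  // Assumes BaseClasses is the same table on every call.
  static std::unordered_map<unsigned, const RegClass *> Cache;
  auto It = Cache.find(Reg);
  if (It != Cache.end()) return It->second;  // hit: no scan
  const RegClass *Found = nullptr;
  for (const RegClass &RC : BaseClasses)     // miss: scan once
    if (RC.contains(Reg)) { Found = &RC; break; }
  Cache.emplace(Reg, Found);
  return Found;
}

int main() {
  std::vector<RegClass> Classes = {{"VGPR_32", {1, 2, 3}}, {"SReg_32", {10, 11}}};
  std::printf("%s\n", classOf(11, Classes)->Name.c_str());
  return 0;
}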
1393
// TODO: It might be helpful to have some target specific flags in
1394
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
1395
15.7M
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
1396
15.7M
  unsigned Size = getRegSizeInBits(*RC);
1397
15.7M
  if (Size < 32)
1398
10.1k
    return false;
1399
15.7M
  switch (Size) {
1400
15.7M
  case 32:
1401
8.38M
    return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
1402
15.7M
  case 64:
1403
5.00M
    return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
1404
15.7M
  case 96:
1405
11.2k
    return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
1406
15.7M
  case 128:
1407
1.91M
    return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
1408
15.7M
  case 160:
1409
4.93k
    return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr;
1410
15.7M
  case 256:
1411
279k
    return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
1412
15.7M
  case 512:
1413
86.7k
    return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
1414
15.7M
  case 1024:
1415
46.0k
    return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
1416
15.7M
  default:
1417
0
    llvm_unreachable("Invalid register class size");
1418
15.7M
  }
1419
15.7M
}
1420
1421
7.64M
bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
1422
7.64M
  unsigned Size = getRegSizeInBits(*RC);
1423
7.64M
  if (Size < 32)
1424
1
    return false;
1425
7.64M
  switch (Size) {
1426
7.64M
  case 32:
1427
3.50M
    return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr;
1428
7.64M
  case 64:
1429
2.72M
    return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr;
1430
7.64M
  case 96:
1431
3.42k
    return false;
1432
7.64M
  case 128:
1433
1.10M
    return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr;
1434
7.64M
  case 160:
1435
238k
  case 256:
1436
238k
    return false;
1437
238k
  case 512:
1438
49.8k
    return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr;
1439
238k
  case 1024:
1440
17.4k
    return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr;
1441
238k
  default:
1442
0
    llvm_unreachable("Invalid register class size");
1443
7.64M
  }
1444
7.64M
}
1445
1446
const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
1447
169k
                                         const TargetRegisterClass *SRC) const {
1448
169k
  switch (getRegSizeInBits(*SRC)) {
1449
169k
  case 32:
1450
145k
    return &AMDGPU::VGPR_32RegClass;
1451
169k
  case 64:
1452
17.6k
    return &AMDGPU::VReg_64RegClass;
1453
169k
  case 96:
1454
86
    return &AMDGPU::VReg_96RegClass;
1455
169k
  case 128:
1456
5.94k
    return &AMDGPU::VReg_128RegClass;
1457
169k
  case 160:
1458
0
    return &AMDGPU::VReg_160RegClass;
1459
169k
  case 256:
1460
94
    return &AMDGPU::VReg_256RegClass;
1461
169k
  case 512:
1462
60
    return &AMDGPU::VReg_512RegClass;
1463
169k
  case 1024:
1464
11
    return &AMDGPU::VReg_1024RegClass;
1465
169k
  default:
1466
0
    llvm_unreachable("Invalid register class size");
1467
169k
  }
1468
169k
}
1469
1470
const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass(
1471
950
                                         const TargetRegisterClass *SRC) const {
1472
950
  switch (getRegSizeInBits(*SRC)) {
1473
950
  case 32:
1474
948
    return &AMDGPU::AGPR_32RegClass;
1475
950
  case 64:
1476
2
    return &AMDGPU::AReg_64RegClass;
1477
950
  case 128:
1478
0
    return &AMDGPU::AReg_128RegClass;
1479
950
  case 512:
1480
0
    return &AMDGPU::AReg_512RegClass;
1481
950
  case 1024:
1482
0
    return &AMDGPU::AReg_1024RegClass;
1483
950
  default:
1484
0
    llvm_unreachable("Invalid register class size");
1485
950
  }
1486
950
}
1487
1488
const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
1489
2.51k
                                         const TargetRegisterClass *VRC) const {
1490
2.51k
  switch (getRegSizeInBits(*VRC)) {
1491
2.51k
  case 32:
1492
2.24k
    return &AMDGPU::SGPR_32RegClass;
1493
2.51k
  case 64:
1494
248
    return &AMDGPU::SReg_64RegClass;
1495
2.51k
  case 96:
1496
0
    return &AMDGPU::SReg_96RegClass;
1497
2.51k
  case 128:
1498
15
    return &AMDGPU::SReg_128RegClass;
1499
2.51k
  case 160:
1500
0
    return &AMDGPU::SReg_160RegClass;
1501
2.51k
  case 256:
1502
2
    return &AMDGPU::SReg_256RegClass;
1503
2.51k
  case 512:
1504
0
    return &AMDGPU::SReg_512RegClass;
1505
2.51k
  case 1024:
1506
0
    return &AMDGPU::SReg_1024RegClass;
1507
2.51k
  default:
1508
0
    llvm_unreachable("Invalid register class size");
1509
2.51k
  }
1510
2.51k
}
1511
1512
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
1513
499k
                         const TargetRegisterClass *RC, unsigned SubIdx) const {
1514
499k
  if (SubIdx == AMDGPU::NoSubRegister)
1515
420k
    return RC;
1516
78.6k
1517
78.6k
  // We can assume that each lane corresponds to one 32-bit register.
1518
78.6k
  unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes();
1519
78.6k
  if (isSGPRClass(RC)) {
1520
37.7k
    switch (Count) {
1521
37.7k
    case 1:
1522
37.6k
      return &AMDGPU::SGPR_32RegClass;
1523
37.7k
    case 2:
1524
60
      return &AMDGPU::SReg_64RegClass;
1525
37.7k
    case 3:
1526
0
      return &AMDGPU::SReg_96RegClass;
1527
37.7k
    case 4:
1528
0
      return &AMDGPU::SReg_128RegClass;
1529
37.7k
    case 5:
1530
0
      return &AMDGPU::SReg_160RegClass;
1531
37.7k
    case 8:
1532
0
      return &AMDGPU::SReg_256RegClass;
1533
37.7k
    case 16:
1534
0
      return &AMDGPU::SReg_512RegClass;
1535
37.7k
    case 32: /* fall-through */
1536
0
    default:
1537
0
      llvm_unreachable("Invalid sub-register class size");
1538
40.9k
    }
1539
40.9k
  } else if (hasAGPRs(RC)) {
1540
4
    switch (Count) {
1541
4
    case 1:
1542
4
      return &AMDGPU::AGPR_32RegClass;
1543
4
    case 2:
1544
0
      return &AMDGPU::AReg_64RegClass;
1545
4
    case 4:
1546
0
      return &AMDGPU::AReg_128RegClass;
1547
4
    case 16:
1548
0
      return &AMDGPU::AReg_512RegClass;
1549
4
    case 32: /* fall-through */
1550
0
    default:
1551
0
      llvm_unreachable("Invalid sub-register class size");
1552
40.9k
    }
1553
40.9k
  } else {
1554
40.9k
    switch (Count) {
1555
40.9k
    case 1:
1556
40.4k
      return &AMDGPU::VGPR_32RegClass;
1557
40.9k
    case 2:
1558
546
      return &AMDGPU::VReg_64RegClass;
1559
40.9k
    case 3:
1560
0
      return &AMDGPU::VReg_96RegClass;
1561
40.9k
    case 4:
1562
0
      return &AMDGPU::VReg_128RegClass;
1563
40.9k
    case 5:
1564
0
      return &AMDGPU::VReg_160RegClass;
1565
40.9k
    case 8:
1566
0
      return &AMDGPU::VReg_256RegClass;
1567
40.9k
    case 16:
1568
0
      return &AMDGPU::VReg_512RegClass;
1569
40.9k
    case 32: /* fall-through */
1570
0
    default:
1571
0
      llvm_unreachable("Invalid sub-register class size");
1572
40.9k
    }
1573
40.9k
  }
1574
78.6k
}
1575
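Editorial note: getSubRegClass sizes its result from the lane count of the sub-register index, on the assumption stated in the code that every lane is one 32-bit register, then picks the SGPR, AGPR or VGPR flavour of that width. A standalone sketch of the width computation; the class-name strings stand in for the real TargetRegisterClass objects:

// Standalone sketch: each lane of a sub-register index is a 32-bit
// register, so the sub-register class width is 32 * lane count.
#include <cstdio>
#include <string>

enum class Kind { SGPR, AGPR, VGPR };

static std::string subRegClassName(unsigned LaneCount, Kind K) {
  unsigned Bits = 32 * LaneCount;
  if (Bits == 32)   // the 32-bit classes use a different naming scheme
    return (K == Kind::SGPR) ? "SGPR_32"
         : (K == Kind::AGPR) ? "AGPR_32" : "VGPR_32";
  const char *Prefix = (K == Kind::SGPR) ? "SReg_"
                     : (K == Kind::AGPR) ? "AReg_" : "VReg_";
  return Prefix + std::to_string(Bits);
}

int main() {
  std::printf("%s\n", subRegClassName(1, Kind::VGPR).c_str()); // VGPR_32
  std::printf("%s\n", subRegClassName(2, Kind::SGPR).c_str()); // SReg_64
  return 0;
}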
1576
bool SIRegisterInfo::shouldRewriteCopySrc(
1577
  const TargetRegisterClass *DefRC,
1578
  unsigned DefSubReg,
1579
  const TargetRegisterClass *SrcRC,
1580
640k
  unsigned SrcSubReg) const {
1581
640k
  // We want to prefer the smallest register class possible, so we don't want to
1582
640k
  // stop and rewrite on anything that looks like a subregister
1583
640k
  // extract. Operations mostly don't care about the super register class, so we
1584
640k
  // only want to stop on the most basic of copies between the same register
1585
640k
  // class.
1586
640k
  //
1587
640k
  // e.g. if we have something like
1588
640k
  // %0 = ...
1589
640k
  // %1 = ...
1590
640k
  // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
1591
640k
  // %3 = COPY %2, sub0
1592
640k
  //
1593
640k
  // We want to look through the COPY to find:
1594
640k
  //  => %3 = COPY %0
1595
640k
1596
640k
  // Plain copy.
1597
640k
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
1598
640k
}
1599
1600
/// Returns a register that is not used at any point in the function.
1601
/// If all registers are used, then this function will return
1602
/// AMDGPU::NoRegister.
1603
unsigned
1604
SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
1605
                                   const TargetRegisterClass *RC,
1606
206
                                   const MachineFunction &MF) const {
1607
206
1608
206
  for (unsigned Reg : *RC)
1609
5.83k
    if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
1610
202
      return Reg;
1611
206
  return AMDGPU::NoRegister;
1612
206
}
1613
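Editorial note: findUnusedRegister above is a first-fit scan in the class's allocation order: the first register that is allocatable and never used anywhere in the function wins, otherwise NoRegister. A standalone analog using plain flags and a hypothetical register numbering:

// Standalone analog of the first-fit search above: walk the class's
// allocation order and return the first allocatable, never-used register.
#include <cstdio>
#include <vector>

constexpr unsigned NoRegister = 0; // stand-in for AMDGPU::NoRegister

static unsigned findUnusedRegister(const std::vector<unsigned> &ClassOrder,
                                   const std::vector<bool> &Allocatable,
                                   const std::vector<bool> &UsedInFunction) {
  for (unsigned Reg : ClassOrder)
    if (Allocatable[Reg] && !UsedInFunction[Reg])
      return Reg;
  return NoRegister;   // every candidate is blocked or already used
}

int main() {
  std::vector<unsigned> Order = {1, 2, 3};
  std::vector<bool> Alloc = {false, true, true, true};
  std::vector<bool> Used  = {false, true, false, true};
  std::printf("unused: %u\n", findUnusedRegister(Order, Alloc, Used)); // 2
  return 0;
}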
1614
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
1615
9.61k
                                                   unsigned EltSize) const {
1616
9.61k
  if (EltSize == 4) {
1617
9.31k
    static const int16_t Sub0_31[] = {
1618
9.31k
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1619
9.31k
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1620
9.31k
      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1621
9.31k
      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1622
9.31k
      AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
1623
9.31k
      AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
1624
9.31k
      AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
1625
9.31k
      AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31,
1626
9.31k
    };
1627
9.31k
1628
9.31k
    static const int16_t Sub0_15[] = {
1629
9.31k
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1630
9.31k
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1631
9.31k
      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1632
9.31k
      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1633
9.31k
    };
1634
9.31k
1635
9.31k
    static const int16_t Sub0_7[] = {
1636
9.31k
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1637
9.31k
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1638
9.31k
    };
1639
9.31k
1640
9.31k
    static const int16_t Sub0_4[] = {
1641
9.31k
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
1642
9.31k
    };
1643
9.31k
1644
9.31k
    static const int16_t Sub0_3[] = {
1645
9.31k
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1646
9.31k
    };
1647
9.31k
1648
9.31k
    static const int16_t Sub0_2[] = {
1649
9.31k
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
1650
9.31k
    };
1651
9.31k
1652
9.31k
    static const int16_t Sub0_1[] = {
1653
9.31k
      AMDGPU::sub0, AMDGPU::sub1,
1654
9.31k
    };
1655
9.31k
1656
9.31k
    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1657
9.31k
    case 32:
1658
1.01k
      return {};
1659
9.31k
    case 64:
1660
7.84k
      return makeArrayRef(Sub0_1);
1661
9.31k
    case 96:
1662
12
      return makeArrayRef(Sub0_2);
1663
9.31k
    case 128:
1664
314
      return makeArrayRef(Sub0_3);
1665
9.31k
    case 160:
1666
10
      return makeArrayRef(Sub0_4);
1667
9.31k
    case 256:
1668
64
      return makeArrayRef(Sub0_7);
1669
9.31k
    case 512:
1670
56
      return makeArrayRef(Sub0_15);
1671
9.31k
    case 1024:
1672
1
      return makeArrayRef(Sub0_31);
1673
9.31k
    default:
1674
0
      llvm_unreachable("unhandled register size");
1675
298
    }
1676
298
  }
1677
298
1678
298
  if (EltSize == 8) {
1679
282
    static const int16_t Sub0_31_64[] = {
1680
282
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1681
282
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1682
282
      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1683
282
      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1684
282
      AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
1685
282
      AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
1686
282
      AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
1687
282
      AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
1688
282
    };
1689
282
1690
282
    static const int16_t Sub0_15_64[] = {
1691
282
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1692
282
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1693
282
      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1694
282
      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15
1695
282
    };
1696
282
1697
282
    static const int16_t Sub0_7_64[] = {
1698
282
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1699
282
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
1700
282
    };
1701
282
1702
282
1703
282
    static const int16_t Sub0_3_64[] = {
1704
282
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
1705
282
    };
1706
282
1707
282
    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1708
282
    case 64:
1709
16
      return {};
1710
282
    case 128:
1711
257
      return makeArrayRef(Sub0_3_64);
1712
282
    case 256:
1713
4
      return makeArrayRef(Sub0_7_64);
1714
282
    case 512:
1715
5
      return makeArrayRef(Sub0_15_64);
1716
282
    case 1024:
1717
0
      return makeArrayRef(Sub0_31_64);
1718
282
    default:
1719
0
      llvm_unreachable("unhandled register size");
1720
16
    }
1721
16
  }
1722
16
1723
16
  if (EltSize == 16) {
1724
12
1725
12
    static const int16_t Sub0_31_128[] = {
1726
12
      AMDGPU::sub0_sub1_sub2_sub3,
1727
12
      AMDGPU::sub4_sub5_sub6_sub7,
1728
12
      AMDGPU::sub8_sub9_sub10_sub11,
1729
12
      AMDGPU::sub12_sub13_sub14_sub15,
1730
12
      AMDGPU::sub16_sub17_sub18_sub19,
1731
12
      AMDGPU::sub20_sub21_sub22_sub23,
1732
12
      AMDGPU::sub24_sub25_sub26_sub27,
1733
12
      AMDGPU::sub28_sub29_sub30_sub31
1734
12
    };
1735
12
1736
12
    static const int16_t Sub0_15_128[] = {
1737
12
      AMDGPU::sub0_sub1_sub2_sub3,
1738
12
      AMDGPU::sub4_sub5_sub6_sub7,
1739
12
      AMDGPU::sub8_sub9_sub10_sub11,
1740
12
      AMDGPU::sub12_sub13_sub14_sub15
1741
12
    };
1742
12
1743
12
    static const int16_t Sub0_7_128[] = {
1744
12
      AMDGPU::sub0_sub1_sub2_sub3,
1745
12
      AMDGPU::sub4_sub5_sub6_sub7
1746
12
    };
1747
12
1748
12
    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1749
12
    case 128:
1750
4
      return {};
1751
12
    case 256:
1752
7
      return makeArrayRef(Sub0_7_128);
1753
12
    case 512:
1754
1
      return makeArrayRef(Sub0_15_128);
1755
12
    case 1024:
1756
0
      return makeArrayRef(Sub0_31_128);
1757
12
    default:
1758
0
      llvm_unreachable("unhandled register size");
1759
4
    }
1760
4
  }
1761
4
1762
4
  assert(EltSize == 32 && "unhandled elt size");
1763
4
1764
4
  static const int16_t Sub0_31_256[] = {
1765
4
    AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
1766
4
    AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
1767
4
    AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23,
1768
4
    AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
1769
4
  };
1770
4
1771
4
  static const int16_t Sub0_15_256[] = {
1772
4
    AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
1773
4
    AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15
1774
4
  };
1775
4
1776
4
  switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1777
4
  case 256:
1778
0
    return {};
1779
4
  case 512:
1780
4
    return makeArrayRef(Sub0_15_256);
1781
4
  case 1024:
1782
0
    return makeArrayRef(Sub0_31_256);
1783
4
  default:
1784
0
    llvm_unreachable("unhandled register size");
1785
4
  }
1786
4
}
1787
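Editorial note: getRegSplitParts picks a static table of sub-register indices whose entries each cover EltSize bytes; the number of parts is the register width in bytes divided by EltSize, and a register no wider than one element returns the empty list. A standalone sketch of that arithmetic with a few worked cases; the sub-index names in the comments mirror the tables above:

// Standalone sketch: how many EltSize-byte pieces a register of a given
// bit width splits into, mirroring the table selection above.
#include <cassert>
#include <cstdio>

static unsigned numSplitParts(unsigned RegBitWidth, unsigned EltSizeBytes) {
  unsigned RegBytes = RegBitWidth / 8;
  assert(RegBytes % EltSizeBytes == 0 && "width must be a multiple of EltSize");
  unsigned Parts = RegBytes / EltSizeBytes;
  return Parts <= 1 ? 0 : Parts;   // a single-element register returns {}
}

int main() {
  std::printf("%u\n", numSplitParts(64, 4));    // 2 -> {sub0, sub1}
  std::printf("%u\n", numSplitParts(128, 8));   // 2 -> {sub0_sub1, sub2_sub3}
  std::printf("%u\n", numSplitParts(512, 16));  // 4 -> four 128-bit groups
  std::printf("%u\n", numSplitParts(32, 4));    // 0 -> empty list
  return 0;
}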
1788
const TargetRegisterClass*
1789
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
1790
7.32M
                                  unsigned Reg) const {
1791
7.32M
  if (TargetRegisterInfo::isVirtualRegister(Reg))
1792
480k
    return  MRI.getRegClass(Reg);
1793
6.84M
1794
6.84M
  return getPhysRegClass(Reg);
1795
6.84M
}
1796
1797
bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
1798
5.37M
                            unsigned Reg) const {
1799
5.37M
  const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
1800
5.37M
  assert(RC && "Register class for the reg not found");
1801
5.37M
  return hasVGPRs(RC);
1802
5.37M
}
1803
1804
bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
1805
1.67M
                            unsigned Reg) const {
1806
1.67M
  const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
1807
1.67M
  assert(RC && "Register class for the reg not found");
1808
1.67M
  return hasAGPRs(RC);
1809
1.67M
}
1810
1811
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
1812
                                    const TargetRegisterClass *SrcRC,
1813
                                    unsigned SubReg,
1814
                                    const TargetRegisterClass *DstRC,
1815
                                    unsigned DstSubReg,
1816
                                    const TargetRegisterClass *NewRC,
1817
225k
                                    LiveIntervals &LIS) const {
1818
225k
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
1819
225k
  unsigned DstSize = getRegSizeInBits(*DstRC);
1820
225k
  unsigned NewSize = getRegSizeInBits(*NewRC);
1821
225k
1822
225k
  // Do not increase size of registers beyond dword, we would need to allocate
1823
225k
  // adjacent registers and constraint regalloc more than needed.
1824
225k
1825
225k
  // Always allow dword coalescing.
1826
225k
  if (SrcSize <= 32 || DstSize <= 32)
1827
167k
    return true;
1828
57.5k
1829
57.5k
  return NewSize <= DstSize || NewSize <= SrcSize;
1830
57.5k
}
1831
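Editorial note: the coalescing policy above always allows merges that touch a register of at most a dword, and otherwise refuses to let the merged register grow beyond the larger of the two inputs. A standalone restatement of the predicate with a usage example:

// Standalone restatement of the shouldCoalesce size check above.
#include <cstdio>

static bool shouldCoalesce(unsigned SrcSize, unsigned DstSize,
                           unsigned NewSize) {
  if (SrcSize <= 32 || DstSize <= 32)   // always allow dword coalescing
    return true;
  // Otherwise don't create a register wider than both inputs.
  return NewSize <= DstSize || NewSize <= SrcSize;
}

int main() {
  std::printf("%d\n", shouldCoalesce(32, 128, 128)); // 1: dword source
  std::printf("%d\n", shouldCoalesce(64, 64, 128));  // 0: would grow to 128
  std::printf("%d\n", shouldCoalesce(128, 64, 128)); // 1: no growth past 128
  return 0;
}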
1832
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
1833
214k
                                             MachineFunction &MF) const {
1834
214k
1835
214k
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1836
214k
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1837
214k
1838
214k
  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
1839
214k
                                                       MF.getFunction());
1840
214k
  switch (RC->getID()) {
1841
214k
  default:
1842
0
    return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
1843
214k
  case AMDGPU::VGPR_32RegClassID:
1844
133k
    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
1845
214k
  case AMDGPU::SGPR_32RegClassID:
1846
80.3k
    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
1847
214k
  }
1848
214k
}
1849
1850
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
1851
13.5M
                                                unsigned Idx) const {
1852
13.5M
  if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet())
1853
133k
    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
1854
133k
                               const_cast<MachineFunction &>(MF));
1855
13.4M
1856
13.4M
  if (Idx == getSGPRPressureSet())
1857
80.3k
    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
1858
80.3k
                               const_cast<MachineFunction &>(MF));
1859
13.3M
1860
13.3M
  return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
1861
13.3M
}
1862
1863
6.27M
const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
1864
6.27M
  static const int Empty[] = { -1 };
1865
6.27M
1866
6.27M
  if (hasRegUnit(AMDGPU::M0, RegUnit))
1867
3.64k
    return Empty;
1868
6.26M
  return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
1869
6.26M
}
1870
1871
2.94k
unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
1872
2.94k
  // Not a callee saved register.
1873
2.94k
  return AMDGPU::SGPR30_SGPR31;
1874
2.94k
}
1875
1876
const TargetRegisterClass *
1877
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
1878
                                         const RegisterBank &RB,
1879
2.38k
                                         const MachineRegisterInfo &MRI) const {
1880
2.38k
  switch (Size) {
1881
2.38k
  case 1: {
1882
155
    switch (RB.getID()) {
1883
155
    case AMDGPU::VGPRRegBankID:
1884
43
      return &AMDGPU::VGPR_32RegClass;
1885
155
    case AMDGPU::VCCRegBankID:
1886
14
      return isWave32 ?
1887
8
        &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
1888
155
    case AMDGPU::SGPRRegBankID:
1889
51
      return &AMDGPU::SReg_32_XM0RegClass;
1890
155
    case AMDGPU::SCCRegBankID:
1891
47
      // This needs to return an allocatable class, so don't bother returning
1892
47
      // the dummy SCC class.
1893
47
      return &AMDGPU::SReg_32_XM0RegClass;
1894
155
    default:
1895
0
      llvm_unreachable("unknown register bank");
1896
0
    }
1897
0
  }
1898
1.11k
  case 32:
1899
1.11k
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
1900
1.11k
                                                 &AMDGPU::SReg_32_XM0RegClass;
1901
760
  case 64:
1902
760
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
1903
760
                                                 &AMDGPU::SReg_64_XEXECRegClass;
1904
39
  case 96:
1905
39
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
1906
39
                                                 &AMDGPU::SReg_96RegClass;
1907
96
  case 128:
1908
96
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
1909
96
                                                 &AMDGPU::SReg_128RegClass;
1910
8
  case 160:
1911
8
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
1912
8
                                                 &AMDGPU::SReg_160RegClass;
1913
36
  case 256:
1914
36
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass :
1915
36
                                                 &AMDGPU::SReg_256RegClass;
1916
46
  case 512:
1917
46
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
1918
46
                                                 &AMDGPU::SReg_512RegClass;
1919
129
  default:
1920
129
    if (Size < 32)
1921
123
      return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
1922
123
                                                   &AMDGPU::SReg_32_XM0RegClass;
1923
6
    return nullptr;
1924
2.38k
  }
1925
2.38k
}
1926
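Editorial note: getRegClassForSizeOnBank maps a value size and a register bank to a concrete class: 1-bit values become lane masks or scalar condition registers depending on the bank, everything else dispatches on size with a VGPR or SGPR flavour. A standalone sketch of that dispatch; strings stand in for the real TargetRegisterClass pointers and the bank kinds are reduced to a small enum:

// Standalone sketch of the size/bank dispatch above. Strings stand in for
// TargetRegisterClass pointers; bank kinds are reduced to an enum.
#include <cstdio>
#include <string>

enum class Bank { VGPR, SGPR, VCC, SCC };

static std::string classForSizeOnBank(unsigned Size, Bank B, bool IsWave32) {
  if (Size == 1) {
    switch (B) {
    case Bank::VGPR: return "VGPR_32";
    case Bank::VCC:  return IsWave32 ? "SReg_32_XM0_XEXEC" : "SReg_64_XEXEC";
    case Bank::SGPR:
    case Bank::SCC:  return "SReg_32_XM0";
    }
  }
  bool V = (B == Bank::VGPR);
  if (Size <= 32)  return V ? "VGPR_32"  : "SReg_32_XM0";
  if (Size == 64)  return V ? "VReg_64"  : "SReg_64_XEXEC";
  if (Size == 96)  return V ? "VReg_96"  : "SReg_96";
  if (Size == 128) return V ? "VReg_128" : "SReg_128";
  if (Size == 160) return V ? "VReg_160" : "SReg_160";
  if (Size == 256) return V ? "VReg_256" : "SReg_256";
  if (Size == 512) return V ? "VReg_512" : "SReg_512";
  return "<none>"; // unsupported size, mirrors the nullptr result above
}

int main() {
  std::printf("%s\n", classForSizeOnBank(64, Bank::VGPR, false).c_str());
  std::printf("%s\n", classForSizeOnBank(1, Bank::VCC, true).c_str());
  return 0;
}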
1927
const TargetRegisterClass *
1928
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
1929
4.82k
                                         const MachineRegisterInfo &MRI) const {
1930
4.82k
  if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()))
1931
2.05k
    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
1932
2.76k
  return nullptr;
1933
2.76k
}
1934
1935
1.18k
unsigned SIRegisterInfo::getVCC() const {
1936
1.18k
  return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
1937
1.18k
}
1938
1939
const TargetRegisterClass *
1940
59.8M
SIRegisterInfo::getRegClass(unsigned RCID) const {
1941
59.8M
  switch ((int)RCID) {
1942
59.8M
  case AMDGPU::SReg_1RegClassID:
1943
1.22M
    return getBoolRC();
1944
59.8M
  case AMDGPU::SReg_1_XEXECRegClassID:
1945
706k
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
1946
706k
      : &AMDGPU::SReg_64_XEXECRegClass;
1947
59.8M
  case -1:
1948
13.8M
    return nullptr;
1949
59.8M
  default:
1950
44.1M
    return AMDGPURegisterInfo::getRegClass(RCID);
1951
59.8M
  }
1952
59.8M
}
1953
1954
// Find reaching register definition
1955
MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
1956
                                              MachineInstr &Use,
1957
                                              MachineRegisterInfo &MRI,
1958
412
                                              LiveIntervals *LIS) const {
1959
412
  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
1960
412
  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
1961
412
  SlotIndex DefIdx;
1962
412
1963
412
  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
1964
59
    if (!LIS->hasInterval(Reg))
1965
0
      return nullptr;
1966
59
    LiveInterval &LI = LIS->getInterval(Reg);
1967
59
    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
1968
59
                                  : MRI.getMaxLaneMaskForVReg(Reg);
1969
59
    VNInfo *V = nullptr;
1970
59
    if (LI.hasSubRanges()) {
1971
17
      for (auto &S : LI.subranges()) {
1972
17
        if ((S.LaneMask & SubLanes) == SubLanes) {
1973
11
          V = S.getVNInfoAt(UseIdx);
1974
11
          break;
1975
11
        }
1976
17
      }
1977
47
    } else {
1978
47
      V = LI.getVNInfoAt(UseIdx);
1979
47
    }
1980
59
    if (!V)
1981
1
      return nullptr;
1982
58
    DefIdx = V->def;
1983
353
  } else {
1984
353
    // Find last def.
1985
1.01k
    for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) {
1986
661
      LiveRange &LR = LIS->getRegUnit(*Units);
1987
661
      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
1988
658
        if (!DefIdx.isValid() ||
1989
658
            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
1990
308
                          LIS->getInstructionFromIndex(V->def)))
1991
657
          DefIdx = V->def;
1992
658
      } else {
1993
3
        return nullptr;
1994
3
      }
1995
661
    }
1996
353
  }
1997
412
1998
412
  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
1999
408
2000
408
  if (!Def || !MDT.dominates(Def, &Use))
2001
2
    return nullptr;
2002
406
2003
406
  assert(Def->modifiesRegister(Reg, this));
2004
406
2005
406
  return Def;
2006
406
}
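Editorial note: findReachingDef asks LiveIntervals for the value live at the use and, for a physical register, keeps the def per register unit that is dominated by the one already picked, so the latest dominating def wins; it gives up if any unit has no live value or the final def does not dominate the use. A standalone sketch of that selection step over abstract def indices; the dominance relation is supplied by the caller and all names are illustrative:

// Standalone sketch of the per-register-unit selection in findReachingDef:
// keep the candidate def that the current pick dominates, i.e. the latest
// def in dominance order; fail if any unit has no live value.
#include <cstdio>
#include <functional>
#include <optional>
#include <vector>

using DefIdx = int;                       // stand-in for SlotIndex
using Dominates = std::function<bool(DefIdx, DefIdx)>;

static std::optional<DefIdx>
pickReachingDef(const std::vector<std::optional<DefIdx>> &PerUnitDefs,
                const Dominates &Dom) {
  std::optional<DefIdx> Pick;
  for (const auto &D : PerUnitDefs) {
    if (!D)
      return std::nullopt;                // some unit is undefined at the use
    if (!Pick || Dom(*Pick, *D))
      Pick = D;                           // later def dominated by the old pick
  }
  return Pick;
}

int main() {
  // In straight-line code "A dominates B" is simply A < B.
  Dominates Linear = [](DefIdx A, DefIdx B) { return A < B; };
  std::vector<std::optional<DefIdx>> Defs = {10, 30, 20};
  auto R = pickReachingDef(Defs, Linear);
  std::printf("%d\n", R ? *R : -1);       // 30: the latest dominating def
  return 0;
}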