Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                                         const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
                                       const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
                      ST.getMaxNumSGPRs(MF));
}
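
// Illustrative note (values assumed, not taken from this build): with
// ST.getMaxNumSGPRs(MF) == 104, these helpers return views over the first 104
// SGPR_32 registers and the first 104 / 4 == 26 aligned SGPR_128 tuples.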

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack pointer,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                 LivePhysRegs &LiveRegs,
                                                 const TargetRegisterClass &RC,
                                                 bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  if (Unused) {
    // We are looking for a register that can be used throughout the entire
    // function, so any use is unacceptable.
    for (unsigned Reg : RC) {
      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
        return Reg;
    }
  } else {
    for (unsigned Reg : RC) {
      if (LiveRegs.available(MRI, Reg))
        return Reg;
    }
  }

  // If we require an unused register, this is used in contexts where failure is
  // an option and has an alternative plan. In other contexts, this must
  // succeed.
  if (!Unused)
    report_fatal_error("failed to find free scratch register");

  return AMDGPU::NoRegister;
}
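
// Hypothetical call site, mirroring how the prologue code below uses this
// helper (a sketch, not part of the pass):
//
//   unsigned Tmp = findScratchNonCalleeSaveRegister(
//       MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
//
// With Unused == false this may return a register that is merely free at the
// insertion point; passing Unused == true additionally requires that the
// register is not used anywhere in the function, and allows a NoRegister
// result instead of a fatal error.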

static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
  LivePhysRegs LiveRegs;
  LiveRegs.init(*MRI.getTargetRegisterInfo());
  return findScratchNonCalleeSaveRegister(
    MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
}

// We need to specially emit stack operations here because a different frame
// register is used than in the rest of the function, as getFrameRegister would
// use.
static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I,
                             const SIInstrInfo *TII, unsigned SpillReg,
                             unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();

  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
      MFI.getObjectAlignment(FI));

  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
      .addReg(SpillReg, RegState::Kill)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addMemOperand(MMO);
    return;
  }

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
    .addReg(SpillReg, RegState::Kill)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addMemOperand(MMO);
}
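
// Worked example of the offset split above (illustrative numbers): the MUBUF
// _OFFSET form encodes a 12-bit unsigned immediate, so object offsets up to
// 4095 are folded directly into the store. An offset of 4096 fails
// isUInt<12>, so it is first materialized into a scratch VGPR with V_MOV_B32
// and the _OFFEN form is used instead.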

static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
                              const SIInstrInfo *TII, unsigned SpillReg,
                              unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
      MFI.getObjectAlignment(FI));

  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(),
            TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addMemOperand(MMO);
    return;
  }

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(),
          TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addMemOperand(MMO);
}

void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
                                          MachineFunction &MF,
                                          MachineBasicBlock &MBB) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo* TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  unsigned FlatScratchInitReg
    = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitLo).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitHi).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}
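
// Illustrative arithmetic for the non-pointer path above (assumed values):
// FLAT_SCR_HI holds the scratch base in 256-byte units, so a combined byte
// offset of 0x1200 becomes 0x1200 >> 8 == 0x12 after the S_LSHR_B32.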

unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
  const GCNSubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // We need to insert initialization of the scratch resource descriptor.
  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  if (ScratchRsrcReg == AMDGPU::NoRegister ||
      !MRI.isPhysRegUsed(ScratchRsrcReg))
    return AMDGPU::NoRegister;

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  // We find the resource first because it has an alignment requirement.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
  AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}
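
// Example of the rounding above (hypothetical preload count): with 6
// preloaded SGPRs, NumPreloaded = (6 + 3) / 4 == 2, so the scan for a free,
// allocatable descriptor starts at the third aligned tuple, s[8:11].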

// Shift down registers reserved for the scratch wave offset.
std::pair<unsigned, bool>
SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
    const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
    SIMachineFunctionInfo *MFI, MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  assert(MFI->isEntryFunction());

  // No replacement necessary.
  if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
      (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) {
    return std::make_pair(AMDGPU::NoRegister, false);
  }

  if (ST.hasSGPRInitBug())
    return std::make_pair(ScratchWaveOffsetReg, false);

  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();

  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
  if (NumPreloaded > AllSGPRs.size())
    return std::make_pair(ScratchWaveOffsetReg, false);

  AllSGPRs = AllSGPRs.slice(NumPreloaded);

  // We need to drop registers from the end of the list that we cannot use
  // for the scratch wave offset.
  // + 2 because s102 and s103 do not exist on VI.
  // + 2 for vcc
  // + 2 for xnack_mask
  // + 2 for flat_scratch
  // + 4 for registers reserved for scratch resource register
  // + 1 for register reserved for scratch wave offset.  (By excluding this
  //     register from the list to consider, it means that when this
  //     register is being used for the scratch wave offset and there
  //     are no other free SGPRs, then the value will stay in this register.
  // + 1 if stack pointer is used.
  // ----
  //  13 (+1)
  unsigned ReservedRegCount = 13;

  if (AllSGPRs.size() < ReservedRegCount)
    return std::make_pair(ScratchWaveOffsetReg, false);

  bool HandledScratchWaveOffsetReg =
    ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
  bool FPAdjusted = false;

  for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
    // scratch descriptor, since we haven't added its uses yet.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
      if (!HandledScratchWaveOffsetReg) {
        HandledScratchWaveOffsetReg = true;

        MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
        if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
          assert(!hasFP(MF));
          MFI->setStackPtrOffsetReg(Reg);
        }

        MFI->setScratchWaveOffsetReg(Reg);
        MFI->setFrameOffsetReg(Reg);
        ScratchWaveOffsetReg = Reg;
        FPAdjusted = true;
        break;
      }
    }
  }

  return std::make_pair(ScratchWaveOffsetReg, FPAdjusted);
}
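
// Checking the ReservedRegCount accounting in the comment above:
// 2 (s102/s103 on VI) + 2 (vcc) + 2 (xnack_mask) + 2 (flat_scratch)
// + 4 (scratch resource) + 1 (scratch wave offset) == 13, with one more
// implied when the stack pointer register is also in use.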

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If we only have SGPR spills, we won't actually be using scratch memory
  // since these spill to VGPRs.
  //
  // FIXME: We should be cleaning up these unused SGPR spill frame indices
  // somewhere.

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();

  // We need to do the replacement of the private segment buffer and wave offset
  // register even if there are no stack objects. There could be stores to undef
  // or a constant without an associated object.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  if (MFI->hasFlatScratchInit())
    emitFlatScratchInit(ST, MF, MBB);

  unsigned ScratchRsrcReg
    = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);

  unsigned ScratchWaveOffsetReg;
  bool FPAdjusted;
  std::tie(ScratchWaveOffsetReg, FPAdjusted) =
      getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);

  // We need to insert initialization of the scratch resource descriptor.
  unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
    AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedPrivateBufferReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
  }

  bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister &&
                       MRI.isPhysRegUsed(ScratchWaveOffsetReg);
  bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
                         MRI.isPhysRegUsed(ScratchRsrcReg);

  // FIXME: Hack to not crash in situations which emitted an error.
  if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
    return;

  // We added live-ins during argument lowering, but since they were not used
  // they were deleted. We're adding the uses now, so add them back.
  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);

  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
    assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
    MRI.addLiveIn(PreloadedPrivateBufferReg);
    MBB.addLiveIn(PreloadedPrivateBufferReg);
  }

  // Make the selected register live throughout the function.
  for (MachineBasicBlock &OtherBB : MF) {
    if (&OtherBB == &MBB)
      continue;

    if (OffsetRegUsed || FPAdjusted)
      OtherBB.addLiveIn(ScratchWaveOffsetReg);

    if (ResourceRegUsed)
      OtherBB.addLiveIn(ScratchRsrcReg);
  }

  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // If we reserved the original input registers, we don't need to copy to the
  // reserved registers.

  bool CopyBuffer = ResourceRegUsed &&
    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
    ST.isAmdHsaOrMesa(F) &&
    ScratchRsrcReg != PreloadedPrivateBufferReg;

  // This needs to be careful of the copying order to avoid overwriting one of
  // the input registers before it's been copied to its final
  // destination. Usually the offset should be copied first.
  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
                                              ScratchWaveOffsetReg);
  if (CopyBuffer && CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  unsigned SPReg = MFI->getStackPtrOffsetReg();
  assert(SPReg != AMDGPU::SP_REG);

  // FIXME: Remove the isPhysRegUsed checks
  const bool HasFP = hasFP(MF);

  if (HasFP || OffsetRegUsed) {
    assert(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
      .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
  }

  if (CopyBuffer && !CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  if (ResourceRegUsed) {
    emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
        PreloadedPrivateBufferReg, ScratchRsrcReg);
  }

  if (HasFP) {
    DebugLoc DL;
    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
    int64_t StackSize = FrameInfo.getStackSize();

    // On kernel entry, the private scratch wave offset is the SP value.
    if (StackSize == 0) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
        .addReg(MFI->getScratchWaveOffsetReg());
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
        .addReg(MFI->getScratchWaveOffsetReg())
        .addImm(StackSize * ST.getWavefrontSize());
    }
  }
}
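
// Note on the SP initialization above (illustrative numbers): StackSize is a
// per-lane byte count, so the wave-level SP advances by StackSize *
// wavefront size; e.g. 16 bytes per lane on a 64-lane wave gives
// s_add_u32 sp, offset, 1024.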

// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
      MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
      MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
      unsigned ScratchRsrcReg) const {

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const Function &Fn = MF.getFunction();
  DebugLoc DL;

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    if (MFI->getGITPtrHigh() != 0xffffffff) {
      BuildMI(MBB, I, DL, SMovB32, RsrcHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    } else {
      const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
      BuildMI(MBB, I, DL, GetPC64, Rsrc01);
    }
    auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
    if (ST.hasMergedShaders()) {
      switch (MF.getFunction().getCallingConv()) {
        case CallingConv::AMDGPU_HS:
        case CallingConv::AMDGPU_GS:
          // Low GIT address is passed in s8 rather than s0 for an LS+HS or
          // ES+GS merged shader on gfx9+.
          GitPtrLo = AMDGPU::SGPR8;
          break;
        default:
          break;
      }
    }
    MF.getRegInfo().addLiveIn(GitPtrLo);
    MBB.addLiveIn(GitPtrLo);
    BuildMI(MBB, I, DL, SMovB32, RsrcLo)
      .addReg(GitPtrLo)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    PointerType *PtrTy =
      PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
                       AMDGPUAS::CONSTANT_ADDRESS);
    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                       MachineMemOperand::MOInvariant |
                                       MachineMemOperand::MODereferenceable,
                                       16, 4);
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // glc
      .addImm(0) // dlc
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);
    return;
  }
  if (ST.isMesaGfxShader(Fn)
      || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        PointerType *PtrTy =
          PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
                           AMDGPUAS::CONSTANT_ADDRESS);
        MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
        auto MMO = MF.getMachineMemOperand(PtrInfo,
                                           MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                           8, 4);
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // glc
          .addImm(0) // dlc
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  }
}
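
// For orientation (an informal reading of the code above, not an
// authoritative hardware description): the scratch resource descriptor is a
// 128-bit SGPR tuple whose low two dwords (sub0_sub1) hold the base pointer
// and whose high two dwords come from getScratchRsrcWords23(); the AMDPAL
// path instead loads all four dwords from the GIT with S_LOAD_DWORDX4.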

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL;

  bool HasFP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;
  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
  // turn on all lanes before doing the spill to memory.
  unsigned ScratchExecCopy = AMDGPU::NoRegister;

  // Emit the copy if we need an FP, and are using a free SGPR to save it.
  if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
      .addReg(FramePtrReg)
      .setMIFlag(MachineInstr::FrameSetup);
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
         : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (ScratchExecCopy == AMDGPU::NoRegister) {
      if (LiveRegs.empty()) {
        LiveRegs.init(TRI);
        LiveRegs.addLiveIns(MBB);
        if (FuncInfo->SGPRForFPSaveRestoreCopy)
          LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      }

      ScratchExecCopy
        = findScratchNonCalleeSaveRegister(MRI, LiveRegs,
                                           *TRI.getWaveMaskRegClass());
      assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy);

      const unsigned OrSaveExec = ST.isWave32() ?
        AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
      BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
              ScratchExecCopy)
        .addImm(-1);
    }

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                     FuncInfo->getScratchRSrcReg(),
                     StackPtrReg,
                     Reg.FI.getValue());
  }

  if (ScratchExecCopy != AMDGPU::NoRegister) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
      .addReg(ScratchExecCopy, RegState::Kill);
    LiveRegs.addReg(ScratchExecCopy);
  }


  if (FuncInfo->FramePointerSaveIndex) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI) &&
           MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
      = FuncInfo->getSGPRToVGPRSpills(FI);
    assert(Spill.size() == 1);

    // Save FP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR;
    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
            Spill[0].VGPR)
      .addReg(FramePtrReg)
      .addImm(Spill[0].Lane)
      .addReg(Spill[0].VGPR, RegState::Undef);
  }

  if (TRI.needsStackRealignment(MF)) {
    HasFP = true;
    const unsigned Alignment = MFI.getMaxAlignment();

    RoundedSize += Alignment;
    if (LiveRegs.empty()) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
    }

    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
    assert(ScratchSPReg != AMDGPU::NoRegister &&
           ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);

    // s_add_u32 tmp_reg, s32, NumBytes
    // s_and_b32 s32, tmp_reg, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
      .addReg(StackPtrReg)
      .addImm((Alignment - 1) * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
      .addReg(ScratchSPReg, RegState::Kill)
      .addImm(-Alignment * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    // If we need a base pointer, set it up here. It's whatever the value of
    // the stack pointer is at this point. Any variable size objects will be
    // allocated after this, so we can still use the base pointer to reference
    // locals.
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
      .addReg(StackPtrReg)
      .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
      .addReg(StackPtrReg)
      .addImm(RoundedSize * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
  }

  assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister ||
                     FuncInfo->FramePointerSaveIndex)) &&
         "Needed to save FP but didn't save it anywhere");

  assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister &&
                    !FuncInfo->FramePointerSaveIndex)) &&
         "Saved FP but didn't need it");
}
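
// Illustrative realignment arithmetic for the block above (assumed values):
// with MFI.getMaxAlignment() == 16 bytes per lane on a 64-lane wave, the
// prologue emits s_add_u32 tmp, s32, (16 - 1) * 64 == 960 followed by
// s_and_b32 fp, tmp, -16 * 64 == -1024, rounding the wave-level address up
// to the next 1024-byte boundary.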

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  LivePhysRegs LiveRegs;
  DebugLoc DL;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
    NumBytes + MFI.getMaxAlignment() : NumBytes;

  if (RoundedSize != 0 && hasFP(MF)) {
    const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
      .addReg(StackPtrReg)
      .addImm(RoundedSize * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameDestroy);
  }

  if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg())
      .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
      .setMIFlag(MachineInstr::FrameSetup);
  }

  if (FuncInfo->FramePointerSaveIndex) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();

    assert(!MF.getFrameInfo().isDeadObjectIndex(FI) &&
           MF.getFrameInfo().getStackID(FI) == TargetStackID::SGPRSpill);

    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
      = FuncInfo->getSGPRToVGPRSpills(FI);
    assert(Spill.size() == 1);
    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
            FuncInfo->getFrameOffsetReg())
      .addReg(Spill[0].VGPR)
      .addImm(Spill[0].Lane);
  }

  unsigned ScratchExecCopy = AMDGPU::NoRegister;
  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
         : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    if (ScratchExecCopy == AMDGPU::NoRegister) {
      // See emitPrologue
      if (LiveRegs.empty()) {
        LiveRegs.init(*ST.getRegisterInfo());
        LiveRegs.addLiveOuts(MBB);
        LiveRegs.stepBackward(*MBBI);
      }

      ScratchExecCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, *TRI.getWaveMaskRegClass());
      LiveRegs.removeReg(ScratchExecCopy);

      const unsigned OrSaveExec =
          ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;

      BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
        .addImm(-1);
    }

    buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                      FuncInfo->getScratchRSrcReg(),
                      FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
  }

  if (ScratchExecCopy != AMDGPU::NoRegister) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
      .addReg(ScratchExecCopy, RegState::Kill);
  }
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
                                 Optional<int> FramePointerSaveIndex) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        FramePointerSaveIndex && I != FramePointerSaveIndex) {
      return false;
    }
  }

  return true;
}
#endif

int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                            unsigned &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return MF.getFrameInfo().getObjectOffset(FI);
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  FuncInfo->removeDeadFrameIndices(MFI);
  assert(allSGPRSpillsAreDead(MFI, None) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    if (FuncInfo->isEntryFunction()) {
      int ScavengeFI = MFI.CreateFixedObject(
        TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
      RS->addScavengingFrameIndex(ScavengeFI);
    } else {
      int ScavengeFI = MFI.CreateStackObject(
        TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
        TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
        false);
      RS->addScavengingFrameIndex(ScavengeFI);
    }
  }
}
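
// For reference: TRI->getSpillSize(AMDGPU::SGPR_32RegClass) is 4 bytes, so
// both branches above reserve a single 4-byte emergency scavenging slot;
// entry functions pin it as a fixed object at offset 0, while other
// functions let frame layout place an ordinary stack object.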

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
  // so don't allow the default insertion to handle them.
  for (auto SSpill : MFI->getSGPRSpillVGPRs())
    SavedVGPRs.reset(SSpill.VGPR);

  const bool HasFP = WillHaveFP || hasFP(MF);
  if (!HasFP)
    return;

  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
                                                    TargetStackID::SGPRSpill);

    // If there is already a VGPR with free lanes, use it. We may already have
    // to pay the penalty for spilling a CSR VGPR.
    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
      llvm_unreachable("allocate SGPR spill should have worked");

    MFI->FramePointerSaveIndex = NewFI;

    LLVM_DEBUG(
      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
      dbgs() << "Spilling FP to  " << printReg(Spill.VGPR, TRI)
             << ':' << Spill.Lane << '\n');
    return;
  }

  MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());

  if (!MFI->SGPRForFPSaveRestoreCopy) {
    // There's no free lane to spill, and no free register to save FP, so we're
    // forced to spill another VGPR to use for the spill.
    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
                                                    TargetStackID::SGPRSpill);
    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
      llvm_unreachable("allocate SGPR spill should have worked");
    MFI->FramePointerSaveIndex = NewFI;

    LLVM_DEBUG(
      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
      dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
             << ':' << Spill.Lane << '\n';);
  } else {
    LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
               printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
  }
}
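
// Informal summary of the FP save strategy chosen above, in order of
// preference: (1) spill FP to a free lane of a VGPR already used for SGPR
// spills, (2) copy FP into an unused non-callee-saved SGPR, (3) as a last
// resort, create a new SGPRSpill stack object and force another VGPR spill.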

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());
  SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (!FuncInfo->SGPRForFPSaveRestoreCopy)
    return false;

  for (auto &CS : CSI) {
    if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
      if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
        CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      break;
    }
  }

  return false;
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
  MachineFunction &MF,
  MachineBasicBlock &MBB,
  MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    unsigned Align = getStackAlignment();

    Amount = alignTo(Amount, Align);
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    unsigned SPReg = MFI->getStackPtrOffsetReg();

    unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
    BuildMI(MBB, I, DL, TII->get(Op), SPReg)
      .addReg(SPReg)
      .addImm(Amount * ST.getWavefrontSize());
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  if (MFI.hasCalls()) {
    // All offsets are unsigned, so need to be addressed in the same direction
    // as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    if (MFI.getStackSize() != 0)
      return true;

    // For the entry point, the input wave scratch offset must be copied to the
    // API SP if there are calls.
    if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
      return true;
  }

  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
    MFI.hasStackMap() || MFI.hasPatchPoint() ||
    MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
    MF.getTarget().Options.DisableFramePointerElim(MF);
}
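
// Informal summary of hasFP (derived from the checks above): a frame pointer
// is required for functions that make calls and have a non-empty frame (or
// are entry points), and otherwise for variable-sized objects, a taken frame
// address, stack maps or patch points, stack realignment, or when frame
// pointer elimination is disabled.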