/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Line | Count | Source
1 | | //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2 | | //
3 | | // The LLVM Compiler Infrastructure
4 | | //
5 | | // This file is distributed under the University of Illinois Open Source
6 | | // License. See LICENSE.TXT for details.
7 | | //
8 | | //===----------------------------------------------------------------------===//
9 | | //
10 | | /// \file
11 | | /// \brief SI implementation of the TargetRegisterInfo class.
12 | | //
13 | | //===----------------------------------------------------------------------===//
14 | |
15 | | #include "SIRegisterInfo.h"
16 | | #include "AMDGPUSubtarget.h"
17 | | #include "SIInstrInfo.h"
18 | | #include "SIMachineFunctionInfo.h"
19 | | #include "llvm/CodeGen/MachineFrameInfo.h"
20 | | #include "llvm/CodeGen/MachineInstrBuilder.h"
21 | | #include "llvm/CodeGen/RegisterScavenging.h"
22 | | #include "llvm/IR/Function.h"
23 | | #include "llvm/IR/LLVMContext.h"
24 | |
25 | | using namespace llvm;
26 | |
27 | 97.9k | static bool hasPressureSet(const int *PSets, unsigned PSetID) { |
28 | 322k | for (unsigned i = 0; PSets[i] != -1; ++i) {
29 | 234k | if (PSets[i] == (int)PSetID) |
30 | 9.07k | return true; |
31 | 234k | } |
32 | 88.8k | return false; |
33 | 97.9k | } |
34 | | |
35 | | void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg, |
36 | 97.9k | BitVector &PressureSets) const { |
37 | 186k | for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
38 | 97.9k | const int *PSets = getRegUnitPressureSets(*U); |
39 | 97.9k | if (hasPressureSet(PSets, PSetID)) {
40 | 9.07k | PressureSets.set(PSetID); |
41 | 9.07k | break; |
42 | 9.07k | } |
43 | 97.9k | } |
44 | 97.9k | } |
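
The two helpers above rely on getRegUnitPressureSets returning a sentinel-terminated list: the array of pressure-set IDs for a register unit ends at -1, and membership is a linear scan. A minimal standalone sketch of that scan (plain C++ with made-up set IDs, not the LLVM API):

#include <cassert>

static bool hasSet(const int *Sets, int ID) {
  for (unsigned i = 0; Sets[i] != -1; ++i)  // -1 terminates the list
    if (Sets[i] == ID)
      return true;
  return false;
}

int main() {
  const int Sets[] = {3, 7, 12, -1};        // pressure sets for one reg unit
  assert(hasSet(Sets, 7) && !hasSet(Sets, 5));
}
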
45 | | |
46 | | static cl::opt<bool> EnableSpillSGPRToSMEM( |
47 | | "amdgpu-spill-sgpr-to-smem", |
48 | | cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), |
49 | | cl::init(false)); |
50 | | |
51 | | static cl::opt<bool> EnableSpillSGPRToVGPR( |
52 | | "amdgpu-spill-sgpr-to-vgpr", |
53 | | cl::desc("Enable spilling VGPRs to SGPRs"), |
54 | | cl::ReallyHidden, |
55 | | cl::init(true)); |
56 | | |
57 | | SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) : |
58 | | AMDGPURegisterInfo(), |
59 | | SGPRPressureSets(getNumRegPressureSets()), |
60 | | VGPRPressureSets(getNumRegPressureSets()), |
61 | | SpillSGPRToVGPR(false), |
62 | 1.81k | SpillSGPRToSMEM(false) { |
63 | 1.81k | if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
64 | 5 | SpillSGPRToSMEM = true; |
65 | 1.80k | else if (EnableSpillSGPRToVGPR)
66 | 1.80k | SpillSGPRToVGPR = true; |
67 | 1.81k | |
68 | 1.81k | unsigned NumRegPressureSets = getNumRegPressureSets(); |
69 | 1.81k | |
70 | 1.81k | SGPRSetID = NumRegPressureSets; |
71 | 1.81k | VGPRSetID = NumRegPressureSets; |
72 | 1.81k | |
73 | 50.7k | for (unsigned i = 0; i < NumRegPressureSets; ++i) {
74 | 48.9k | classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets); |
75 | 48.9k | classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets); |
76 | 48.9k | } |
77 | 1.81k | |
78 | 1.81k | // Determine the number of reg units for each pressure set. |
79 | 1.81k | std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0); |
80 | 3.15M | for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
81 | 3.15M | const int *PSets = getRegUnitPressureSets(i); |
82 | 10.3M | for (unsigned j = 0; PSets[j] != -1; ++j) {
83 | 7.21M | ++PressureSetRegUnits[PSets[j]]; |
84 | 7.21M | } |
85 | 3.15M | } |
86 | 1.81k | |
87 | 1.81k | unsigned VGPRMax = 0, SGPRMax = 0; |
88 | 50.7k | for (unsigned i = 0; i < NumRegPressureSets; ++i) {
89 | 48.9k | if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
90 | 1.81k | VGPRSetID = i; |
91 | 1.81k | VGPRMax = PressureSetRegUnits[i]; |
92 | 1.81k | continue; |
93 | 1.81k | } |
94 | 47.1k | if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
95 | 7.25k | SGPRSetID = i; |
96 | 7.25k | SGPRMax = PressureSetRegUnits[i]; |
97 | 7.25k | } |
98 | 48.9k | } |
99 | 1.81k | |
100 | 1.81k | assert(SGPRSetID < NumRegPressureSets && |
101 | 1.81k | VGPRSetID < NumRegPressureSets); |
102 | 1.81k | } |
103 | | |
104 | 713k | void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { |
105 | 713k | MCRegAliasIterator R(Reg, this, true); |
106 | 713k | |
107 | 4.69M | for (; R.isValid(); ++R)
108 | 3.98M | Reserved.set(*R); |
109 | 713k | } |
110 | | |
111 | | unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( |
112 | 14.4k | const MachineFunction &MF) const { |
113 | 14.4k | |
114 | 14.4k | const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); |
115 | 14.4k | unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; |
116 | 14.4k | unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); |
117 | 14.4k | return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); |
118 | 14.4k | } |
119 | | |
120 | 14.4k | static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { |
121 | 14.4k | unsigned Reg; |
122 | 14.4k | |
123 | 14.4k | // Try to place it in a hole after PrivateSegmentBufferReg. |
124 | 14.4k | if (RegCount & 3) {
125 | 14.2k | // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
126 | 14.2k | // alignment constraints, so we have a hole where we can put the wave offset.
127 | 14.2k | Reg = RegCount - 1; |
128 | 14.4k | } else { |
129 | 173 | // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the |
130 | 173 | // wave offset before it. |
131 | 173 | Reg = RegCount - 5; |
132 | 173 | } |
133 | 14.4k | |
134 | 14.4k | return Reg; |
135 | 14.4k | } |
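
A worked example of the index choice above, written as a self-contained check (the SGPR counts are hypothetical): with 102 SGPRs, alignDown(102, 4) - 4 places the 4-register resource descriptor at indices 96-99, leaving index 101 (RegCount - 1) as the hole; with 104 SGPRs the descriptor occupies 100-103, so the wave offset goes below it at index 99 (RegCount - 5).

#include <cassert>

static unsigned waveByteOffsetIndex(unsigned RegCount) {
  return (RegCount & 3) ? RegCount - 1 : RegCount - 5;
}

int main() {
  assert(waveByteOffsetIndex(102) == 101); // hole above the aligned tuple
  assert(waveByteOffsetIndex(104) == 99);  // tuple is flush; go below it
}
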
136 | | |
137 | | unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( |
138 | 14.4k | const MachineFunction &MF) const { |
139 | 14.4k | const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); |
140 | 14.4k | unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); |
141 | 14.4k | return AMDGPU::SGPR_32RegClass.getRegister(Reg); |
142 | 14.4k | } |
143 | | |
144 | | unsigned SIRegisterInfo::reservedStackPtrOffsetReg( |
145 | 1.19k | const MachineFunction &MF) const { |
146 | 1.19k | return AMDGPU::SGPR32; |
147 | 1.19k | } |
148 | | |
149 | 30.4k | BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { |
150 | 30.4k | BitVector Reserved(getNumRegs()); |
151 | 30.4k | |
152 | 30.4k | // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
153 | 30.4k | // this seems likely to result in bugs, so I'm marking them as reserved. |
154 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::EXEC); |
155 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); |
156 | 30.4k | |
157 | 30.4k | // M0 has to be reserved so that llvm accepts it as a live-in into a block. |
158 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::M0); |
159 | 30.4k | |
160 | 30.4k | // Reserve the memory aperture registers. |
161 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); |
162 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); |
163 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); |
164 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); |
165 | 30.4k | |
166 | 30.4k | // Reserve Trap Handler registers - support is not implemented in Codegen. |
167 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::TBA); |
168 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::TMA); |
169 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); |
170 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); |
171 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); |
172 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); |
173 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); |
174 | 30.4k | reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); |
175 | 30.4k | |
176 | 30.4k | const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); |
177 | 30.4k | |
178 | 30.4k | unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); |
179 | 30.4k | unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); |
180 | 160k | for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
181 | 130k | unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); |
182 | 130k | reserveRegisterTuples(Reserved, Reg); |
183 | 130k | } |
184 | 30.4k | |
185 | 30.4k | unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); |
186 | 30.4k | unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); |
187 | 35.4k | for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
188 | 5.04k | unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); |
189 | 5.04k | reserveRegisterTuples(Reserved, Reg); |
190 | 5.04k | } |
191 | 30.4k | |
192 | 30.4k | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
193 | 30.4k | |
194 | 30.4k | unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); |
195 | 30.4k | if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
196 | 30.4k | // Reserve 1 SGPR for scratch wave offset in case we need to spill. |
197 | 30.4k | reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); |
198 | 30.4k | } |
199 | 30.4k | |
200 | 30.4k | unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); |
201 | 30.4k | if (ScratchRSrcReg != AMDGPU::NoRegister) {
202 | 30.4k | // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need |
203 | 30.4k | // to spill. |
204 | 30.4k | // TODO: May need to reserve a VGPR if doing LDS spilling. |
205 | 30.4k | reserveRegisterTuples(Reserved, ScratchRSrcReg); |
206 | 30.4k | assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); |
207 | 30.4k | } |
208 | 30.4k | |
209 | 30.4k | // We have to assume the SP is needed in case there are calls in the function, |
210 | 30.4k | // which is detected after the function is lowered. If we aren't really going |
211 | 30.4k | // to need SP, don't bother reserving it. |
212 | 30.4k | unsigned StackPtrReg = MFI->getStackPtrOffsetReg(); |
213 | 30.4k | |
214 | 30.4k | if (StackPtrReg != AMDGPU::NoRegister) {
215 | 30.4k | reserveRegisterTuples(Reserved, StackPtrReg); |
216 | 30.4k | assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); |
217 | 30.4k | } |
218 | 30.4k | |
219 | 30.4k | unsigned FrameReg = MFI->getFrameOffsetReg(); |
220 | 30.4k | if (FrameReg != AMDGPU::NoRegister) {
221 | 30.4k | reserveRegisterTuples(Reserved, FrameReg); |
222 | 30.4k | assert(!isSubRegister(ScratchRSrcReg, FrameReg)); |
223 | 30.4k | } |
224 | 30.4k | |
225 | 30.4k | return Reserved; |
226 | 30.4k | } |
227 | | |
228 | 30.1k | bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { |
229 | 30.1k | const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); |
230 | 30.1k | if (Info->isEntryFunction()) {
231 | 28.3k | const MachineFrameInfo &MFI = Fn.getFrameInfo(); |
232 | 27.5k | return MFI.hasStackObjects() || MFI.hasCalls(); |
233 | 28.3k | } |
234 | 1.73k | |
235 | 1.73k | // May need scavenger for dealing with callee saved registers. |
236 | 1.73k | return true; |
237 | 1.73k | } |
238 | | |
239 | | bool SIRegisterInfo::requiresFrameIndexScavenging( |
240 | 15.0k | const MachineFunction &MF) const { |
241 | 15.0k | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
242 | 15.0k | if (MFI.hasStackObjects()) |
243 | 544 | return true; |
244 | 14.5k | |
245 | 14.5k | // May need to deal with callee saved registers. |
246 | 14.5k | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
247 | 14.5k | return !Info->isEntryFunction(); |
248 | 14.5k | } |
249 | | |
250 | | bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( |
251 | 14.7k | const MachineFunction &MF) const { |
252 | 14.7k | // m0 is needed for the scalar store offset. m0 is unallocatable, so we can't |
253 | 14.7k | // create a virtual register for it during frame index elimination, so the |
254 | 14.7k | // scavenger is directly needed. |
255 | 14.7k | return MF.getFrameInfo().hasStackObjects() && |
256 | 544 | MF.getSubtarget<SISubtarget>().hasScalarStores() && |
257 | 259 | MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs(); |
258 | 14.7k | } |
259 | | |
260 | | bool SIRegisterInfo::requiresVirtualBaseRegisters( |
261 | 15.0k | const MachineFunction &) const { |
262 | 15.0k | // There are no special dedicated stack or frame pointers. |
263 | 15.0k | return true; |
264 | 15.0k | } |
265 | | |
266 | 30.1k | bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { |
267 | 30.1k | // This helps catch bugs as verifier errors. |
268 | 30.1k | return true; |
269 | 30.1k | } |
270 | | |
271 | 4.40k | int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const { |
272 | 4.40k | assert(SIInstrInfo::isMUBUF(*MI)); |
273 | 4.40k | |
274 | 4.40k | int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), |
275 | 4.40k | AMDGPU::OpName::offset); |
276 | 4.40k | return MI->getOperand(OffIdx).getImm(); |
277 | 4.40k | } |
278 | | |
279 | | int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, |
280 | 4 | int Idx) const { |
281 | 4 | if (!SIInstrInfo::isMUBUF(*MI)) |
282 | 0 | return 0; |
283 | 4 | |
284 | 4 | assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), |
285 | 4 | AMDGPU::OpName::vaddr) && |
286 | 4 | "Should never see frame index on non-address operand"); |
287 | 4 | |
288 | 4 | return getMUBUFInstrOffset(MI); |
289 | 4 | } |
290 | | |
291 | 4.74k | bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { |
292 | 4.74k | if (!MI->mayLoadOrStore()) |
293 | 342 | return false; |
294 | 4.40k | |
295 | 4.40k | int64_t FullOffset = Offset + getMUBUFInstrOffset(MI); |
296 | 4.40k | |
297 | 4.40k | return !isUInt<12>(FullOffset); |
298 | 4.40k | } |
299 | | |
300 | | void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, |
301 | | unsigned BaseReg, |
302 | | int FrameIdx, |
303 | 0 | int64_t Offset) const { |
304 | 0 | MachineBasicBlock::iterator Ins = MBB->begin(); |
305 | 0 | DebugLoc DL; // Defaults to "unknown" |
306 | 0 |
307 | 0 | if (Ins != MBB->end())
308 | 0 | DL = Ins->getDebugLoc();
309 | 0 |
310 | 0 | MachineFunction *MF = MBB->getParent();
311 | 0 | const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
312 | 0 | const SIInstrInfo *TII = Subtarget.getInstrInfo();
313 | 0 |
314 | 0 | if (Offset == 0) {
315 | 0 | BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
316 | 0 | .addFrameIndex(FrameIdx);
317 | 0 | return;
318 | 0 | }
319 | 0 |
320 | 0 | MachineRegisterInfo &MRI = MF->getRegInfo();
321 | 0 | unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
322 | 0 |
323 | 0 | unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
324 | 0 |
325 | 0 | BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
326 | 0 | .addImm(Offset);
327 | 0 | BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
328 | 0 | .addFrameIndex(FrameIdx);
329 | 0 |
330 | 0 | TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
331 | 0 | .addReg(OffsetReg, RegState::Kill)
332 | 0 | .addReg(FIReg);
333 | 0 | } |
334 | | |
335 | | void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, |
336 | 0 | int64_t Offset) const { |
337 | 0 |
338 | 0 | MachineBasicBlock *MBB = MI.getParent();
339 | 0 | MachineFunction *MF = MBB->getParent();
340 | 0 | const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
341 | 0 | const SIInstrInfo *TII = Subtarget.getInstrInfo();
342 | 0 |
343 | | #ifndef NDEBUG
344 | | // FIXME: Is it possible to be storing a frame index to itself?
345 | | bool SeenFI = false;
346 | | for (const MachineOperand &MO: MI.operands()) {
347 | | if (MO.isFI()) {
348 | | if (SeenFI)
349 | | llvm_unreachable("should not see multiple frame indices");
350 | |
351 | | SeenFI = true;
352 | | }
353 | | }
354 | | #endif
355 | |
356 | 0 | MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
357 | 0 | assert(FIOp && FIOp->isFI() && "frame index must be address operand");
358 | 0 | assert(TII->isMUBUF(MI));
359 | 0 | assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
360 | 0 | MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
361 | 0 | "should only be seeing frame offset relative FrameIndex");
362 | 0 |
363 | 0 |
364 | 0 | MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
365 | 0 | int64_t NewOffset = OffsetOp->getImm() + Offset;
366 | 0 | assert(isUInt<12>(NewOffset) && "offset should be legal");
367 | 0 |
368 | 0 | FIOp->ChangeToRegister(BaseReg, false); |
369 | 0 | OffsetOp->setImm(NewOffset); |
370 | 0 | } |
371 | | |
372 | | bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, |
373 | | unsigned BaseReg, |
374 | 0 | int64_t Offset) const { |
375 | 0 | if (!SIInstrInfo::isMUBUF(*MI)) |
376 | 0 | return false; |
377 | 0 |
378 | 0 | int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);
379 | 0 |
380 | 0 | return isUInt<12>(NewOffset); |
381 | 0 | } |
382 | | |
383 | | const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( |
384 | 0 | const MachineFunction &MF, unsigned Kind) const { |
385 | 0 | // This is inaccurate. It depends on the instruction and address space. The |
386 | 0 | // only place where we should hit this is for dealing with frame indexes / |
387 | 0 | // private accesses, so this is correct in that case. |
388 | 0 | return &AMDGPU::VGPR_32RegClass; |
389 | 0 | } |
390 | | |
391 | 1.16k | static unsigned getNumSubRegsForSpillOp(unsigned Op) { |
392 | 1.16k | |
393 | 1.16k | switch (Op) { |
394 | 0 | case AMDGPU::SI_SPILL_S512_SAVE: |
395 | 0 | case AMDGPU::SI_SPILL_S512_RESTORE: |
396 | 0 | case AMDGPU::SI_SPILL_V512_SAVE: |
397 | 0 | case AMDGPU::SI_SPILL_V512_RESTORE: |
398 | 0 | return 16; |
399 | 0 | case AMDGPU::SI_SPILL_S256_SAVE: |
400 | 0 | case AMDGPU::SI_SPILL_S256_RESTORE: |
401 | 0 | case AMDGPU::SI_SPILL_V256_SAVE: |
402 | 0 | case AMDGPU::SI_SPILL_V256_RESTORE: |
403 | 0 | return 8; |
404 | 657 | case AMDGPU::SI_SPILL_S128_SAVE: |
405 | 657 | case AMDGPU::SI_SPILL_S128_RESTORE: |
406 | 657 | case AMDGPU::SI_SPILL_V128_SAVE: |
407 | 657 | case AMDGPU::SI_SPILL_V128_RESTORE: |
408 | 657 | return 4; |
409 | 0 | case AMDGPU::SI_SPILL_V96_SAVE: |
410 | 0 | case AMDGPU::SI_SPILL_V96_RESTORE: |
411 | 0 | return 3; |
412 | 12 | case AMDGPU::SI_SPILL_S64_SAVE: |
413 | 12 | case AMDGPU::SI_SPILL_S64_RESTORE: |
414 | 12 | case AMDGPU::SI_SPILL_V64_SAVE: |
415 | 12 | case AMDGPU::SI_SPILL_V64_RESTORE: |
416 | 12 | return 2; |
417 | 500 | case AMDGPU::SI_SPILL_S32_SAVE: |
418 | 500 | case AMDGPU::SI_SPILL_S32_RESTORE: |
419 | 500 | case AMDGPU::SI_SPILL_V32_SAVE: |
420 | 500 | case AMDGPU::SI_SPILL_V32_RESTORE: |
421 | 500 | return 1; |
422 | 0 | default: llvm_unreachable("Invalid spill opcode");
423 | 0 | } |
424 | 0 | } |
425 | | |
426 | 3.05k | static int getOffsetMUBUFStore(unsigned Opc) { |
427 | 3.05k | switch (Opc) { |
428 | 2.80k | case AMDGPU::BUFFER_STORE_DWORD_OFFEN: |
429 | 2.80k | return AMDGPU::BUFFER_STORE_DWORD_OFFSET; |
430 | 147 | case AMDGPU::BUFFER_STORE_BYTE_OFFEN: |
431 | 147 | return AMDGPU::BUFFER_STORE_BYTE_OFFSET; |
432 | 52 | case AMDGPU::BUFFER_STORE_SHORT_OFFEN: |
433 | 52 | return AMDGPU::BUFFER_STORE_SHORT_OFFSET; |
434 | 20 | case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: |
435 | 20 | return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; |
436 | 26 | case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: |
437 | 26 | return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; |
438 | 0 | default: |
439 | 0 | return -1; |
440 | 0 | } |
441 | 0 | } |
442 | | |
443 | 1.73k | static int getOffsetMUBUFLoad(unsigned Opc) { |
444 | 1.73k | switch (Opc) { |
445 | 1.60k | case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: |
446 | 1.60k | return AMDGPU::BUFFER_LOAD_DWORD_OFFSET; |
447 | 93 | case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN: |
448 | 93 | return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET; |
449 | 2 | case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN: |
450 | 2 | return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET; |
451 | 15 | case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: |
452 | 15 | return AMDGPU::BUFFER_LOAD_USHORT_OFFSET; |
453 | 2 | case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN: |
454 | 2 | return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; |
455 | 5 | case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: |
456 | 5 | return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; |
457 | 16 | case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: |
458 | 16 | return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; |
459 | 0 | default: |
460 | 0 | return -1; |
461 | 0 | } |
462 | 0 | } |
463 | | |
464 | | // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not |
465 | | // need to handle the case where an SGPR may need to be spilled while spilling. |
466 | | static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, |
467 | | MachineFrameInfo &MFI, |
468 | | MachineBasicBlock::iterator MI, |
469 | | int Index, |
470 | 4.78k | int64_t Offset) { |
471 | 4.78k | MachineBasicBlock *MBB = MI->getParent(); |
472 | 4.78k | const DebugLoc &DL = MI->getDebugLoc(); |
473 | 4.78k | bool IsStore = MI->mayStore(); |
474 | 4.78k | |
475 | 4.78k | unsigned Opc = MI->getOpcode(); |
476 | 4.78k | int LoadStoreOp = IsStore ? |
477 | 4.78k | getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
478 | 4.78k | if (LoadStoreOp == -1) |
479 | 0 | return false; |
480 | 4.78k | |
481 | 4.78k | const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); |
482 | 4.78k | BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) |
483 | 4.78k | .add(*Reg) |
484 | 4.78k | .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) |
485 | 4.78k | .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) |
486 | 4.78k | .addImm(Offset) |
487 | 4.78k | .addImm(0) // glc |
488 | 4.78k | .addImm(0) // slc |
489 | 4.78k | .addImm(0) // tfe |
490 | 4.78k | .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); |
491 | 4.78k | return true; |
492 | 4.78k | } |
493 | | |
494 | | void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, |
495 | | unsigned LoadStoreOp, |
496 | | int Index, |
497 | | unsigned ValueReg, |
498 | | bool IsKill, |
499 | | unsigned ScratchRsrcReg, |
500 | | unsigned ScratchOffsetReg, |
501 | | int64_t InstOffset, |
502 | | MachineMemOperand *MMO, |
503 | 2.25k | RegScavenger *RS) const { |
504 | 2.25k | MachineBasicBlock *MBB = MI->getParent(); |
505 | 2.25k | MachineFunction *MF = MI->getParent()->getParent(); |
506 | 2.25k | const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); |
507 | 2.25k | const SIInstrInfo *TII = ST.getInstrInfo(); |
508 | 2.25k | const MachineFrameInfo &MFI = MF->getFrameInfo(); |
509 | 2.25k | |
510 | 2.25k | const MCInstrDesc &Desc = TII->get(LoadStoreOp); |
511 | 2.25k | const DebugLoc &DL = MI->getDebugLoc(); |
512 | 2.25k | bool IsStore = Desc.mayStore(); |
513 | 2.25k | |
514 | 2.25k | bool RanOutOfSGPRs = false; |
515 | 2.25k | bool Scavenged = false; |
516 | 2.25k | unsigned SOffset = ScratchOffsetReg; |
517 | 2.25k | |
518 | 2.25k | const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); |
519 | 2.25k | unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32; |
520 | 2.25k | unsigned Size = NumSubRegs * 4; |
521 | 2.25k | int64_t Offset = InstOffset + MFI.getObjectOffset(Index); |
522 | 2.25k | const int64_t OriginalImmOffset = Offset; |
523 | 2.25k | |
524 | 2.25k | unsigned Align = MFI.getObjectAlignment(Index); |
525 | 2.25k | const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); |
526 | 2.25k | |
527 | 2.25k | if (!isUInt<12>(Offset + Size)) {
528 | 232 | SOffset = AMDGPU::NoRegister; |
529 | 232 | |
530 | 232 | // We don't have access to the register scavenger if this function is called |
531 | 232 | // during PEI::scavengeFrameVirtualRegs(). |
532 | 232 | if (RS) |
533 | 0 | SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass); |
534 | 232 | |
535 | 232 | if (SOffset == AMDGPU::NoRegister232 ) { |
536 | 232 | // There are no free SGPRs, and we are in the process of spilling
537 | 232 | // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
538 | 232 | // on SI/CI, and on VI it is true until we implement spilling using scalar
539 | 232 | // stores), we have no way to free up an SGPR. Our solution here is to
540 | 232 | // add the offset directly to the ScratchOffset register, and then
541 | 232 | // subtract the offset after the spill to return ScratchOffset to its
542 | 232 | // original value.
543 | 232 | RanOutOfSGPRs = true; |
544 | 232 | SOffset = ScratchOffsetReg; |
545 | 232 | } else { |
546 | 0 | Scavenged = true; |
547 | 0 | } |
548 | 232 | |
549 | 232 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) |
550 | 232 | .addReg(ScratchOffsetReg) |
551 | 232 | .addImm(Offset); |
552 | 232 | |
553 | 232 | Offset = 0; |
554 | 232 | } |
555 | 2.25k | |
556 | 2.25k | const unsigned EltSize = 4; |
557 | 2.25k | |
558 | 8.47k | for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
559 | 6.22k | unsigned SubReg = NumSubRegs == 1 ? |
560 | 6.22k | ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
561 | 6.22k | |
562 | 6.22k | unsigned SOffsetRegState = 0; |
563 | 6.22k | unsigned SrcDstRegState = getDefRegState(!IsStore); |
564 | 6.22k | if (i + 1 == e) {
565 | 2.25k | SOffsetRegState |= getKillRegState(Scavenged); |
566 | 2.25k | // The last implicit use carries the "Kill" flag. |
567 | 2.25k | SrcDstRegState |= getKillRegState(IsKill); |
568 | 2.25k | } |
569 | 6.22k | |
570 | 6.22k | MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i); |
571 | 6.22k | MachineMemOperand *NewMMO |
572 | 6.22k | = MF->getMachineMemOperand(PInfo, MMO->getFlags(), |
573 | 6.22k | EltSize, MinAlign(Align, EltSize * i)); |
574 | 6.22k | |
575 | 6.22k | auto MIB = BuildMI(*MBB, MI, DL, Desc) |
576 | 6.22k | .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) |
577 | 6.22k | .addReg(ScratchRsrcReg) |
578 | 6.22k | .addReg(SOffset, SOffsetRegState) |
579 | 6.22k | .addImm(Offset) |
580 | 6.22k | .addImm(0) // glc |
581 | 6.22k | .addImm(0) // slc |
582 | 6.22k | .addImm(0) // tfe |
583 | 6.22k | .addMemOperand(NewMMO); |
584 | 6.22k | |
585 | 6.22k | if (NumSubRegs > 1) |
586 | 5.31k | MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); |
587 | 6.22k | } |
588 | 2.25k | |
589 | 2.25k | if (RanOutOfSGPRs) {
590 | 232 | // Subtract the offset we added to the ScratchOffset register. |
591 | 232 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg) |
592 | 232 | .addReg(ScratchOffsetReg) |
593 | 232 | .addImm(OriginalImmOffset); |
594 | 232 | } |
595 | 2.25k | } |
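
The isUInt<12> guard above exists because MUBUF immediate offsets are unsigned 12-bit fields: once Offset + Size passes 4095, the code falls back to the S_ADD_U32 / S_SUB_U32 wrapping of ScratchOffsetReg shown in the function. A small standalone illustration of the boundary (plain C++ stand-in for llvm::isUInt<12>):

#include <cassert>
#include <cstdint>

static bool fitsMUBUFImm(int64_t Offset) {
  return Offset >= 0 && Offset < (int64_t(1) << 12); // isUInt<12>
}

int main() {
  assert(fitsMUBUFImm(4095));  // largest encodable immediate
  assert(!fitsMUBUFImm(4096)); // forces the add/sub-offset fallback
}
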
596 | | |
597 | | static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize, |
598 | 28 | bool Store) { |
599 | 28 | if (SuperRegSize % 16 == 0) {
600 | 3 | return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR : |
601 | 3 | AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR }; |
602 | 6 | } |
603 | 22 | |
604 | 22 | if (SuperRegSize % 8 == 0) {
605 | 8 | return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR : |
606 | 8 | AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR }; |
607 | 16 | } |
608 | 6 | |
609 | 6 | return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
610 | 3 | AMDGPU::S_BUFFER_LOAD_DWORD_SGPR}; |
611 | 28 | } |
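
The mapping above picks the widest usable scalar buffer op from the byte size of the register being spilled: a size divisible by 16 spills in DWORDX4 chunks, a size divisible by 8 in DWORDX2, anything else one DWORD at a time. A quick standalone check of that selection (sizes in bytes, mirroring the function's arithmetic):

#include <cassert>

static unsigned spillEltSize(unsigned SuperRegSize) {
  if (SuperRegSize % 16 == 0) return 16; // S_BUFFER_{STORE,LOAD}_DWORDX4_SGPR
  if (SuperRegSize % 8 == 0) return 8;   // S_BUFFER_{STORE,LOAD}_DWORDX2_SGPR
  return 4;                              // S_BUFFER_{STORE,LOAD}_DWORD_SGPR
}

int main() {
  assert(spillEltSize(64) == 16); // s512 tuple: four DWORDX4 ops
  assert(spillEltSize(8) == 8);   // s64 pair: one DWORDX2 op
  assert(spillEltSize(4) == 4);   // single s32: one DWORD op
}
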
612 | | |
613 | | bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, |
614 | | int Index, |
615 | | RegScavenger *RS, |
616 | 594 | bool OnlyToVGPR) const { |
617 | 594 | MachineBasicBlock *MBB = MI->getParent(); |
618 | 594 | MachineFunction *MF = MBB->getParent(); |
619 | 594 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
620 | 594 | |
621 | 594 | ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills |
622 | 594 | = MFI->getSGPRToVGPRSpills(Index); |
623 | 594 | bool SpillToVGPR = !VGPRSpills.empty(); |
624 | 594 | if (OnlyToVGPR && !SpillToVGPR)
625 | 0 | return false; |
626 | 594 | |
627 | 594 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
628 | 594 | const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); |
629 | 594 | const SIInstrInfo *TII = ST.getInstrInfo(); |
630 | 594 | |
631 | 594 | unsigned SuperReg = MI->getOperand(0).getReg(); |
632 | 594 | bool IsKill = MI->getOperand(0).isKill(); |
633 | 594 | const DebugLoc &DL = MI->getDebugLoc(); |
634 | 594 | |
635 | 594 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); |
636 | 594 | |
637 | 594 | bool SpillToSMEM = spillSGPRToSMEM(); |
638 | 594 | if (SpillToSMEM && OnlyToVGPR)
639 | 0 | return false; |
640 | 594 | |
641 | 594 | assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); |
642 | 594 | |
643 | 594 | unsigned OffsetReg = AMDGPU::M0; |
644 | 594 | unsigned M0CopyReg = AMDGPU::NoRegister; |
645 | 594 | |
646 | 594 | if (SpillToSMEM) {
647 | 14 | if (RS->isRegUsed(AMDGPU::M0)) {
648 | 14 | M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
649 | 14 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) |
650 | 14 | .addReg(AMDGPU::M0); |
651 | 14 | } |
652 | 14 | } |
653 | 594 | |
654 | 594 | unsigned ScalarStoreOp; |
655 | 594 | unsigned EltSize = 4; |
656 | 594 | const TargetRegisterClass *RC = getPhysRegClass(SuperReg); |
657 | 594 | if (SpillToSMEM && isSGPRClass(RC)) {
658 | 14 | // XXX - if private_element_size is larger than 4 it might be useful to be |
659 | 14 | // able to spill wider vmem spills. |
660 | 14 | std::tie(EltSize, ScalarStoreOp) = |
661 | 14 | getSpillEltSize(getRegSizeInBits(*RC) / 8, true); |
662 | 14 | } |
663 | 594 | |
664 | 594 | ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); |
665 | 594 | unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
666 | 594 | |
667 | 594 | // SubReg carries the "Kill" flag when SubReg == SuperReg. |
668 | 466 | unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); |
669 | 1.61k | for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
670 | 1.02k | unsigned SubReg = NumSubRegs == 1 ? |
671 | 1.02k | SuperReg : getSubReg(SuperReg, SplitParts[i]);
672 | 1.02k | |
673 | 1.02k | if (SpillToSMEM) {
674 | 15 | int64_t FrOffset = FrameInfo.getObjectOffset(Index); |
675 | 15 | |
676 | 15 | // The allocated memory size is really the wavefront size * the frame |
677 | 15 | // index size. The widest register class is 64 bytes, so a 4-byte scratch |
678 | 15 | // allocation is enough to spill this in a single stack object. |
679 | 15 | // |
680 | 15 | // FIXME: Frame size/offsets are computed earlier than this, so the extra |
681 | 15 | // space is still unnecessarily allocated. |
682 | 15 | |
683 | 15 | unsigned Align = FrameInfo.getObjectAlignment(Index); |
684 | 15 | MachinePointerInfo PtrInfo |
685 | 15 | = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); |
686 | 15 | MachineMemOperand *MMO |
687 | 15 | = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, |
688 | 15 | EltSize, MinAlign(Align, EltSize * i)); |
689 | 15 | |
690 | 15 | // SMEM instructions only support a single offset, so increment the wave |
691 | 15 | // offset. |
692 | 15 | |
693 | 15 | int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); |
694 | 15 | if (Offset != 0) {
695 | 15 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) |
696 | 15 | .addReg(MFI->getFrameOffsetReg()) |
697 | 15 | .addImm(Offset); |
698 | 15 | } else { |
699 | 0 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) |
700 | 0 | .addReg(MFI->getFrameOffsetReg()); |
701 | 0 | } |
702 | 15 | |
703 | 15 | BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp)) |
704 | 15 | .addReg(SubReg, getKillRegState(IsKill)) // sdata |
705 | 15 | .addReg(MFI->getScratchRSrcReg()) // sbase |
706 | 15 | .addReg(OffsetReg, RegState::Kill) // soff |
707 | 15 | .addImm(0) // glc |
708 | 15 | .addMemOperand(MMO); |
709 | 15 | |
710 | 15 | continue; |
711 | 15 | } |
712 | 1.00k | |
713 | 1.00k | if (SpillToVGPR) {
714 | 925 | SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; |
715 | 925 | |
716 | 925 | BuildMI(*MBB, MI, DL, |
717 | 925 | TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), |
718 | 925 | Spill.VGPR) |
719 | 925 | .addReg(SubReg, getKillRegState(IsKill)) |
720 | 925 | .addImm(Spill.Lane); |
721 | 925 | |
722 | 925 | // FIXME: Since this spills to another register instead of an actual |
723 | 925 | // frame index, we should delete the frame index when all references to |
724 | 925 | // it are fixed. |
725 | 1.00k | } else { |
726 | 82 | // XXX - Can the spill to VGPR fail for some subregisters but not others?
727 | 82 | if (OnlyToVGPR) |
728 | 0 | return false; |
729 | 82 | |
730 | 82 | // Spill SGPR to a frame index. |
731 | 82 | // TODO: Should VI try to spill to VGPR and then spill to SMEM? |
732 | 82 | unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
733 | 82 |
734 | 82 | |
735 | 82 | MachineInstrBuilder Mov |
736 | 82 | = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) |
737 | 82 | .addReg(SubReg, SubKillState); |
738 | 82 | |
739 | 82 | |
740 | 82 | // There could be undef components of a spilled super register. |
741 | 82 | // TODO: Can we detect this and skip the spill? |
742 | 82 | if (NumSubRegs > 1) {
743 | 76 | // The last implicit use of the SuperReg carries the "Kill" flag. |
744 | 76 | unsigned SuperKillState = 0; |
745 | 76 | if (i + 1 == e) |
746 | 26 | SuperKillState |= getKillRegState(IsKill); |
747 | 76 | Mov.addReg(SuperReg, RegState::Implicit | SuperKillState); |
748 | 76 | } |
749 | 82 | |
750 | 82 | unsigned Align = FrameInfo.getObjectAlignment(Index); |
751 | 82 | MachinePointerInfo PtrInfo |
752 | 82 | = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); |
753 | 82 | MachineMemOperand *MMO |
754 | 82 | = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, |
755 | 82 | EltSize, MinAlign(Align, EltSize * i)); |
756 | 82 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) |
757 | 82 | .addReg(TmpReg, RegState::Kill) // src |
758 | 82 | .addFrameIndex(Index) // vaddr |
759 | 82 | .addReg(MFI->getScratchRSrcReg()) // srsrc
760 | 82 | .addReg(MFI->getFrameOffsetReg()) // soffset |
761 | 82 | .addImm(i * 4) // offset |
762 | 82 | .addMemOperand(MMO); |
763 | 82 | } |
764 | 1.02k | } |
765 | 594 | |
766 | 594 | if (M0CopyReg != AMDGPU::NoRegister) {
767 | 14 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0) |
768 | 14 | .addReg(M0CopyReg, RegState::Kill); |
769 | 14 | } |
770 | 594 | |
771 | 594 | MI->eraseFromParent(); |
772 | 594 | MFI->addToSpilledSGPRs(NumSubRegs); |
773 | 594 | return true; |
774 | 594 | } |
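
In the SpillToVGPR path above, each 32-bit sub-register is written into one lane of a carrier VGPR with V_WRITELANE_B32 and later recovered with V_READLANE_B32 in restoreSGPR below. A toy model of that lane bookkeeping (plain C++, a 64-lane wavefront assumed; not LLVM code):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint32_t, 64> Lanes{};      // one VGPR viewed across a 64-wide wave
  uint32_t SGPRQuad[4] = {0xA, 0xB, 0xC, 0xD};
  for (unsigned i = 0; i < 4; ++i)
    Lanes[i] = SGPRQuad[i];              // v_writelane_b32 %vgpr, %sgpr[i], i
  assert(Lanes[2] == 0xC);               // v_readlane_b32 restores sub2
}
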
775 | | |
776 | | bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, |
777 | | int Index, |
778 | | RegScavenger *RS, |
779 | 582 | bool OnlyToVGPR) const { |
780 | 582 | MachineFunction *MF = MI->getParent()->getParent(); |
781 | 582 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
782 | 582 | MachineBasicBlock *MBB = MI->getParent(); |
783 | 582 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
784 | 582 | |
785 | 582 | ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills |
786 | 582 | = MFI->getSGPRToVGPRSpills(Index); |
787 | 582 | bool SpillToVGPR = !VGPRSpills.empty(); |
788 | 582 | if (OnlyToVGPR && !SpillToVGPR)
789 | 0 | return false; |
790 | 582 | |
791 | 582 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); |
792 | 582 | const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); |
793 | 582 | const SIInstrInfo *TII = ST.getInstrInfo(); |
794 | 582 | const DebugLoc &DL = MI->getDebugLoc(); |
795 | 582 | |
796 | 582 | unsigned SuperReg = MI->getOperand(0).getReg(); |
797 | 582 | bool SpillToSMEM = spillSGPRToSMEM(); |
798 | 582 | if (SpillToSMEM && OnlyToVGPR)
799 | 0 | return false; |
800 | 582 | |
801 | 582 | assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); |
802 | 582 | |
803 | 582 | unsigned OffsetReg = AMDGPU::M0; |
804 | 582 | unsigned M0CopyReg = AMDGPU::NoRegister; |
805 | 582 | |
806 | 582 | if (SpillToSMEM) {
807 | 14 | if (RS->isRegUsed(AMDGPU::M0)) {
808 | 14 | M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
809 | 14 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) |
810 | 14 | .addReg(AMDGPU::M0); |
811 | 14 | } |
812 | 14 | } |
813 | 582 | |
814 | 582 | unsigned EltSize = 4; |
815 | 582 | unsigned ScalarLoadOp; |
816 | 582 | |
817 | 582 | const TargetRegisterClass *RC = getPhysRegClass(SuperReg); |
818 | 582 | if (SpillToSMEM && isSGPRClass(RC)) {
819 | 14 | // XXX - if private_element_size is larger than 4 it might be useful to be |
820 | 14 | // able to spill wider vmem spills. |
821 | 14 | std::tie(EltSize, ScalarLoadOp) = |
822 | 14 | getSpillEltSize(getRegSizeInBits(*RC) / 8, false); |
823 | 14 | } |
824 | 582 | |
825 | 582 | ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); |
826 | 582 | unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
827 | 582 | |
828 | 582 | // SubReg carries the "Kill" flag when SubReg == SuperReg. |
829 | 582 | int64_t FrOffset = FrameInfo.getObjectOffset(Index); |
830 | 582 | |
831 | 1.58k | for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
832 | 1.00k | unsigned SubReg = NumSubRegs == 1 ? |
833 | 1.00k | SuperReg : getSubReg(SuperReg, SplitParts[i]);
834 | 1.00k | |
835 | 1.00k | if (SpillToSMEM) {
836 | 15 | // FIXME: Size may be > 4 but extra bytes wasted. |
837 | 15 | unsigned Align = FrameInfo.getObjectAlignment(Index); |
838 | 15 | MachinePointerInfo PtrInfo |
839 | 15 | = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); |
840 | 15 | MachineMemOperand *MMO |
841 | 15 | = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, |
842 | 15 | EltSize, MinAlign(Align, EltSize * i)); |
843 | 15 | |
844 | 15 | // Add i * 4 offset |
845 | 15 | int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); |
846 | 15 | if (Offset != 0) {
847 | 15 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) |
848 | 15 | .addReg(MFI->getFrameOffsetReg()) |
849 | 15 | .addImm(Offset); |
850 | 15 | } else { |
851 | 0 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) |
852 | 0 | .addReg(MFI->getFrameOffsetReg()); |
853 | 0 | } |
854 | 15 | |
855 | 15 | auto MIB = |
856 | 15 | BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg) |
857 | 15 | .addReg(MFI->getScratchRSrcReg()) // sbase |
858 | 15 | .addReg(OffsetReg, RegState::Kill) // soff |
859 | 15 | .addImm(0) // glc |
860 | 15 | .addMemOperand(MMO); |
861 | 15 | |
862 | 15 | if (NumSubRegs > 1) |
863 | 2 | MIB.addReg(SuperReg, RegState::ImplicitDefine); |
864 | 15 | |
865 | 15 | continue; |
866 | 15 | } |
867 | 989 | |
868 | 989 | if (SpillToVGPR) {
869 | 907 | SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; |
870 | 907 | auto MIB = |
871 | 907 | BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), |
872 | 907 | SubReg) |
873 | 907 | .addReg(Spill.VGPR) |
874 | 907 | .addImm(Spill.Lane); |
875 | 907 | |
876 | 907 | if (NumSubRegs > 1) |
877 | 466 | MIB.addReg(SuperReg, RegState::ImplicitDefine); |
878 | 989 | } else { |
879 | 82 | if (OnlyToVGPR) |
880 | 0 | return false; |
881 | 82 | |
882 | 82 | // Restore SGPR from a stack slot. |
883 | 82 | // FIXME: We should use S_LOAD_DWORD here for VI. |
884 | 82 | unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
885 | 82 | unsigned Align = FrameInfo.getObjectAlignment(Index); |
886 | 82 | |
887 | 82 | MachinePointerInfo PtrInfo |
888 | 82 | = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); |
889 | 82 | |
890 | 82 | MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo, |
891 | 82 | MachineMemOperand::MOLoad, EltSize, |
892 | 82 | MinAlign(Align, EltSize * i)); |
893 | 82 | |
894 | 82 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) |
895 | 82 | .addFrameIndex(Index) // vaddr |
896 | 82 | .addReg(MFI->getScratchRSrcReg()) // srsrc |
897 | 82 | .addReg(MFI->getFrameOffsetReg()) // soffset |
898 | 82 | .addImm(i * 4) // offset |
899 | 82 | .addMemOperand(MMO); |
900 | 82 | |
901 | 82 | auto MIB = |
902 | 82 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) |
903 | 82 | .addReg(TmpReg, RegState::Kill); |
904 | 82 | |
905 | 82 | if (NumSubRegs > 1) |
906 | 76 | MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); |
907 | 82 | } |
908 | 1.00k | } |
909 | 582 | |
910 | 582 | if (M0CopyReg != AMDGPU::NoRegister) {
911 | 14 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0) |
912 | 14 | .addReg(M0CopyReg, RegState::Kill); |
913 | 14 | } |
914 | 582 | |
915 | 582 | MI->eraseFromParent(); |
916 | 582 | return true; |
917 | 582 | } |
918 | | |
919 | | /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to |
920 | | /// a VGPR and the stack slot can be safely eliminated when all other users are |
921 | | /// handled. |
922 | | bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( |
923 | | MachineBasicBlock::iterator MI, |
924 | | int FI, |
925 | 1.08k | RegScavenger *RS) const { |
926 | 1.08k | switch (MI->getOpcode()) { |
927 | 548 | case AMDGPU::SI_SPILL_S512_SAVE: |
928 | 548 | case AMDGPU::SI_SPILL_S256_SAVE: |
929 | 548 | case AMDGPU::SI_SPILL_S128_SAVE: |
930 | 548 | case AMDGPU::SI_SPILL_S64_SAVE: |
931 | 548 | case AMDGPU::SI_SPILL_S32_SAVE: |
932 | 548 | return spillSGPR(MI, FI, RS, true); |
933 | 536 | case AMDGPU::SI_SPILL_S512_RESTORE: |
934 | 536 | case AMDGPU::SI_SPILL_S256_RESTORE: |
935 | 536 | case AMDGPU::SI_SPILL_S128_RESTORE: |
936 | 536 | case AMDGPU::SI_SPILL_S64_RESTORE: |
937 | 536 | case AMDGPU::SI_SPILL_S32_RESTORE: |
938 | 536 | return restoreSGPR(MI, FI, RS, true); |
939 | 0 | default: |
940 | 0 | llvm_unreachable("not an SGPR spill instruction"); |
941 | 0 | } |
942 | 0 | } |
943 | | |
944 | | void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, |
945 | | int SPAdj, unsigned FIOperandNum, |
946 | 7.48k | RegScavenger *RS) const { |
947 | 7.48k | MachineFunction *MF = MI->getParent()->getParent(); |
948 | 7.48k | MachineRegisterInfo &MRI = MF->getRegInfo(); |
949 | 7.48k | MachineBasicBlock *MBB = MI->getParent(); |
950 | 7.48k | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
951 | 7.48k | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); |
952 | 7.48k | const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); |
953 | 7.48k | const SIInstrInfo *TII = ST.getInstrInfo(); |
954 | 7.48k | DebugLoc DL = MI->getDebugLoc(); |
955 | 7.48k | |
956 | 7.48k | MachineOperand &FIOp = MI->getOperand(FIOperandNum); |
957 | 7.48k | int Index = MI->getOperand(FIOperandNum).getIndex(); |
958 | 7.48k | |
959 | 7.48k | switch (MI->getOpcode()) { |
960 | 7.48k | // SGPR register spill |
961 | 46 | case AMDGPU::SI_SPILL_S512_SAVE: |
962 | 46 | case AMDGPU::SI_SPILL_S256_SAVE: |
963 | 46 | case AMDGPU::SI_SPILL_S128_SAVE: |
964 | 46 | case AMDGPU::SI_SPILL_S64_SAVE: |
965 | 46 | case AMDGPU::SI_SPILL_S32_SAVE: { |
966 | 46 | spillSGPR(MI, Index, RS); |
967 | 46 | break; |
968 | 46 | } |
969 | 46 | |
970 | 46 | // SGPR register restore |
971 | 46 | case AMDGPU::SI_SPILL_S512_RESTORE: |
972 | 46 | case AMDGPU::SI_SPILL_S256_RESTORE: |
973 | 46 | case AMDGPU::SI_SPILL_S128_RESTORE: |
974 | 46 | case AMDGPU::SI_SPILL_S64_RESTORE: |
975 | 46 | case AMDGPU::SI_SPILL_S32_RESTORE: { |
976 | 46 | restoreSGPR(MI, Index, RS); |
977 | 46 | break; |
978 | 46 | } |
979 | 46 | |
980 | 46 | // VGPR register spill |
981 | 1.16k | case AMDGPU::SI_SPILL_V512_SAVE: |
982 | 1.16k | case AMDGPU::SI_SPILL_V256_SAVE: |
983 | 1.16k | case AMDGPU::SI_SPILL_V128_SAVE: |
984 | 1.16k | case AMDGPU::SI_SPILL_V96_SAVE: |
985 | 1.16k | case AMDGPU::SI_SPILL_V64_SAVE: |
986 | 1.16k | case AMDGPU::SI_SPILL_V32_SAVE: { |
987 | 1.16k | const MachineOperand *VData = TII->getNamedOperand(*MI, |
988 | 1.16k | AMDGPU::OpName::vdata); |
989 | 1.16k | buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, |
990 | 1.16k | Index, |
991 | 1.16k | VData->getReg(), VData->isKill(), |
992 | 1.16k | TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), |
993 | 1.16k | TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(), |
994 | 1.16k | TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), |
995 | 1.16k | *MI->memoperands_begin(), |
996 | 1.16k | RS); |
997 | 1.16k | MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); |
998 | 1.16k | MI->eraseFromParent(); |
999 | 1.16k | break; |
1000 | 1.16k | } |
1001 | 1.08k | case AMDGPU::SI_SPILL_V32_RESTORE: |
1002 | 1.08k | case AMDGPU::SI_SPILL_V64_RESTORE: |
1003 | 1.08k | case AMDGPU::SI_SPILL_V96_RESTORE: |
1004 | 1.08k | case AMDGPU::SI_SPILL_V128_RESTORE: |
1005 | 1.08k | case AMDGPU::SI_SPILL_V256_RESTORE: |
1006 | 1.08k | case AMDGPU::SI_SPILL_V512_RESTORE: { |
1007 | 1.08k | const MachineOperand *VData = TII->getNamedOperand(*MI, |
1008 | 1.08k | AMDGPU::OpName::vdata); |
1009 | 1.08k | |
1010 | 1.08k | buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, |
1011 | 1.08k | Index, |
1012 | 1.08k | VData->getReg(), VData->isKill(), |
1013 | 1.08k | TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), |
1014 | 1.08k | TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(), |
1015 | 1.08k | TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), |
1016 | 1.08k | *MI->memoperands_begin(), |
1017 | 1.08k | RS); |
1018 | 1.08k | MI->eraseFromParent(); |
1019 | 1.08k | break; |
1020 | 1.08k | } |
1021 | 1.08k | |
1022 | 5.14k | default: { |
1023 | 5.14k | const DebugLoc &DL = MI->getDebugLoc(); |
1024 | 5.14k | bool IsMUBUF = TII->isMUBUF(*MI); |
1025 | 5.14k | |
1026 | 5.14k | if (!IsMUBUF && |
1027 | 5.14k | MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) {
1028 | 21 | // Convert to an absolute stack address by finding the offset from the |
1029 | 21 | // scratch wave base and scaling by the wave size. |
1030 | 21 | // |
1031 | 21 | // In an entry function/kernel the stack address is already the absolute |
1032 | 21 | // address relative to the scratch wave offset.
1033 | 21 | |
1034 | 21 | unsigned DiffReg |
1035 | 21 | = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
1036 | 21 | |
1037 | 21 | bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; |
1038 | 21 | unsigned ResultReg = IsCopy ? |
1039 | 21 | MI->getOperand(0).getReg() : |
1040 | 0 | MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
1041 | 21 | |
1042 | 21 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) |
1043 | 21 | .addReg(MFI->getFrameOffsetReg()) |
1044 | 21 | .addReg(MFI->getScratchWaveOffsetReg()); |
1045 | 21 | |
1046 | 21 | int64_t Offset = FrameInfo.getObjectOffset(Index); |
1047 | 21 | if (Offset == 0) {
1048 | 0 | // XXX - This never happens because of emergency scavenging slot at 0? |
1049 | 0 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) |
1050 | 0 | .addImm(Log2_32(ST.getWavefrontSize())) |
1051 | 0 | .addReg(DiffReg); |
1052 | 21 | } else { |
1053 | 21 | unsigned CarryOut |
1054 | 21 | = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
1055 | 21 | unsigned ScaledReg |
1056 | 21 | = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
1057 | 21 | |
1058 | 21 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) |
1059 | 21 | .addImm(Log2_32(ST.getWavefrontSize())) |
1060 | 21 | .addReg(DiffReg, RegState::Kill); |
1061 | 21 | |
1062 | 21 | // TODO: Fold if use instruction is another add of a constant. |
1063 | 21 | if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
1064 | 19 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) |
1065 | 19 | .addReg(CarryOut, RegState::Define | RegState::Dead) |
1066 | 19 | .addImm(Offset) |
1067 | 19 | .addReg(ScaledReg, RegState::Kill); |
1068 | 21 | } else { |
1069 | 2 | unsigned ConstOffsetReg |
1070 | 2 | = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
1071 | 2 | |
1072 | 2 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) |
1073 | 2 | .addImm(Offset); |
1074 | 2 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) |
1075 | 2 | .addReg(CarryOut, RegState::Define | RegState::Dead) |
1076 | 2 | .addReg(ConstOffsetReg, RegState::Kill) |
1077 | 2 | .addReg(ScaledReg, RegState::Kill); |
1078 | 2 | } |
1079 | 21 | |
1080 | 21 | MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC); |
1081 | 21 | } |
1082 | 21 | |
1083 | 21 | // Don't introduce an extra copy if we're just materializing in a mov. |
1084 | 21 | if (IsCopy) |
1085 | 21 | MI->eraseFromParent(); |
1086 | 21 | else |
1087 | 0 | FIOp.ChangeToRegister(ResultReg, false, false, true); |
1088 | 21 | return; |
1089 | 21 | } |
1090 | 5.12k | |
1091 | 5.12k | if (IsMUBUF) {
1092 | 4.80k | // Disable offen so we don't need a 0 vgpr base. |
1093 | 4.80k | assert(static_cast<int>(FIOperandNum) == |
1094 | 4.80k | AMDGPU::getNamedOperandIdx(MI->getOpcode(), |
1095 | 4.80k | AMDGPU::OpName::vaddr)); |
1096 | 4.80k | |
1097 | 4.80k | assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() |
1098 | 4.80k | == MFI->getFrameOffsetReg()); |
1099 | 4.80k | |
1100 | 4.80k | int64_t Offset = FrameInfo.getObjectOffset(Index); |
1101 | 4.80k | int64_t OldImm |
1102 | 4.80k | = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); |
1103 | 4.80k | int64_t NewOffset = OldImm + Offset; |
1104 | 4.80k | |
1105 | 4.80k | if (isUInt<12>(NewOffset) && |
1106 | 4.80k | buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
1107 | 4.78k | MI->eraseFromParent(); |
1108 | 4.78k | return; |
1109 | 4.78k | } |
1110 | 335 | } |
1111 | 335 | |
1112 | 335 | // If the offset is simply too big, don't convert to a scratch wave offset |
1113 | 335 | // relative index. |
1114 | 335 | |
1115 | 335 | int64_t Offset = FrameInfo.getObjectOffset(Index); |
1116 | 335 | FIOp.ChangeToImmediate(Offset); |
1117 | 335 | if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
1118 | 16 | unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
1119 | 16 | BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) |
1120 | 16 | .addImm(Offset); |
1121 | 16 | FIOp.ChangeToRegister(TmpReg, false, false, true); |
1122 | 16 | } |
1123 | 46 | } |
1124 | 7.48k | } |
1125 | 7.48k | } |
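
For the non-entry-function path above, the absolute address is formed by subtracting the scratch wave base from the frame offset register, scaling down by the wave size (a right shift by Log2_32(WavefrontSize)), and adding the object's frame offset. A worked numeric example with hypothetical register values:

#include <cassert>

int main() {
  unsigned FrameOffsetReg = 0x1000, ScratchWaveBase = 0x800;
  unsigned WaveSizeLog2 = 6;  // wavefront size 64
  int ObjectOffset = 16;
  unsigned Result = ((FrameOffsetReg - ScratchWaveBase) >> WaveSizeLog2)
                    + ObjectOffset;      // s_sub_u32, v_lshrrev_b32, v_add_i32
  assert(Result == 48);                  // 2048 / 64 + 16
}
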
1126 | | |
1127 | 6.56M | StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const { |
1128 | 6.56M | #define AMDGPU_REG_ASM_NAMES |
1129 | 6.56M | #include "AMDGPURegAsmNames.inc.cpp" |
1130 | 6.56M | |
1131 | 6.56M | #define REG_RANGE(BeginReg, EndReg, RegTable) \ |
1132 | 49.3M | if (Reg >= BeginReg && Reg <= EndReg) { \
1133 | 3.04M | unsigned Index = Reg - BeginReg; \ |
1134 | 3.04M | assert(Index < array_lengthof(RegTable)); \ |
1135 | 3.04M | return RegTable[Index]; \ |
1136 | 3.04M | } |
1137 | 6.56M | |
1138 | 6.56M | REG_RANGE(AMDGPU::VGPR0, AMDGPU::VGPR255, VGPR32RegNames);
1139 | 5.73M | REG_RANGE(AMDGPU::SGPR0, AMDGPU::SGPR103, SGPR32RegNames);
1140 | 4.99M | REG_RANGE(AMDGPU::VGPR0_VGPR1, AMDGPU::VGPR254_VGPR255, VGPR64RegNames);
1141 | 4.50M | REG_RANGE(AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR102_SGPR103, SGPR64RegNames);
1142 | 4.30M | REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2, AMDGPU::VGPR253_VGPR254_VGPR255, |
1143 | 4.30M | VGPR96RegNames); |
1144 | 4.30M | |
1145 | 4.30M | REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3, |
1146 | 4.30M | AMDGPU::VGPR252_VGPR253_VGPR254_VGPR255, |
1147 | 4.06M | VGPR128RegNames); |
1148 | 4.06M | REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, |
1149 | 4.06M | AMDGPU::SGPR100_SGPR101_SGPR102_SGPR103, |
1150 | 4.02M | SGPR128RegNames); |
1151 | 4.02M | |
1152 | 4.02M | REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7, |
1153 | 4.02M | AMDGPU::VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255, |
1154 | 3.78M | VGPR256RegNames); |
1155 | 3.78M | |
1156 | 3.78M | REG_RANGE( |
1157 | 3.78M | AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15, |
1158 | 3.78M | AMDGPU::VGPR240_VGPR241_VGPR242_VGPR243_VGPR244_VGPR245_VGPR246_VGPR247_VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255, |
1159 | 3.56M | VGPR512RegNames); |
1160 | 3.56M | |
1161 | 3.56M | REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7, |
1162 | 3.56M | AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103, |
1163 | 3.54M | SGPR256RegNames); |
1164 | 3.54M | |
1165 | 3.54M | REG_RANGE( |
1166 | 3.54M | AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7_SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15, |
1167 | 3.54M | AMDGPU::SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95_SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103, |
1168 | 3.54M | SGPR512RegNames |
1169 | 3.51M | ); |
1170 | 3.51M | |
1171 | 3.51M | #undef REG_RANGE |
1172 | 3.51M | |
1173 | 3.51M | // FIXME: Rename flat_scr so we don't need to special case this. |
1174 | 3.51M | switch (Reg) { |
1175 | 2.85k | case AMDGPU::FLAT_SCR: |
1176 | 2.85k | return "flat_scratch"; |
1177 | 5.98k | case AMDGPU::FLAT_SCR_LO: |
1178 | 5.98k | return "flat_scratch_lo"; |
1179 | 5.98k | case AMDGPU::FLAT_SCR_HI: |
1180 | 5.98k | return "flat_scratch_hi"; |
1181 | 3.50M | default: |
1182 | 3.50M | // For the special named registers the default is fine. |
1183 | 3.50M | return TargetRegisterInfo::getRegAsmName(Reg); |
1184 | 0 | } |
1185 | 0 | } |
1186 | | |
1187 | | // FIXME: This is very slow. It might be worth creating a map from physreg to |
1188 | | // register class. |
1189 | 4.42M | const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { |
1190 | 4.42M | assert(!TargetRegisterInfo::isVirtualRegister(Reg)); |
1191 | 4.42M | |
1192 | 4.42M | static const TargetRegisterClass *const BaseClasses[] = { |
1193 | 4.42M | &AMDGPU::VGPR_32RegClass, |
1194 | 4.42M | &AMDGPU::SReg_32RegClass, |
1195 | 4.42M | &AMDGPU::VReg_64RegClass, |
1196 | 4.42M | &AMDGPU::SReg_64RegClass, |
1197 | 4.42M | &AMDGPU::VReg_96RegClass, |
1198 | 4.42M | &AMDGPU::VReg_128RegClass, |
1199 | 4.42M | &AMDGPU::SReg_128RegClass, |
1200 | 4.42M | &AMDGPU::VReg_256RegClass, |
1201 | 4.42M | &AMDGPU::SReg_256RegClass, |
1202 | 4.42M | &AMDGPU::VReg_512RegClass, |
1203 | 4.42M | &AMDGPU::SReg_512RegClass, |
1204 | 4.42M | &AMDGPU::SCC_CLASSRegClass, |
1205 | 4.42M | }; |
1206 | 4.42M | |
1207 | 14.0M | for (const TargetRegisterClass *BaseClass : BaseClasses) { |
1208 | 14.0M | if (BaseClass->contains(Reg)) {
1209 | 4.42M | return BaseClass; |
1210 | 4.42M | } |
1211 | 0 | } |
1212 | 0 | return nullptr; |
1213 | 0 | } |
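
One possible shape for the map the FIXME above suggests (a sketch under the assumption that base-class membership is fixed per target; this is not an existing LLVM facility): precompute a physreg-to-class table once, then answer each query with a single lookup instead of scanning every base class.

#include <cassert>
#include <unordered_map>

int main() {
  // Stand-in IDs; real code would key on MCPhysReg and store
  // const TargetRegisterClass * values populated at construction time.
  std::unordered_map<unsigned, int> RegToClass;
  const unsigned FirstVGPR = 1000, NumVGPRs = 256; // hypothetical numbering
  for (unsigned R = FirstVGPR; R != FirstVGPR + NumVGPRs; ++R)
    RegToClass[R] = 0;                             // slot 0 == VGPR_32
  assert(RegToClass.at(FirstVGPR + 7) == 0);       // O(1), no linear scan
}
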
1214 | | |
1215 | | // TODO: It might be helpful to have some target specific flags in |
1216 | | // TargetRegisterClass to mark which classes are VGPRs to make this trivial. |
1217 | 9.31M | bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { |
1218 | 9.31M | unsigned Size = getRegSizeInBits(*RC); |
1219 | 9.31M | if (Size < 32) |
1220 | 3.85k | return false; |
1221 | 9.31M | switch (Size) { |
1222 | 4.94M | case 32: |
1223 | 4.94M | return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; |
1224 | 3.18M | case 64: |
1225 | 3.18M | return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; |
1226 | 1.19k | case 96: |
1227 | 1.19k | return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; |
1228 | 1.07M | case 128: |
1229 | 1.07M | return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; |
1230 | 74.6k | case 256: |
1231 | 74.6k | return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; |
1232 | 37.2k | case 512: |
1233 | 37.2k | return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; |
1234 | 0 | default: |
1235 | 0 | llvm_unreachable("Invalid register class size"); |
1236 | 0 | } |
1237 | 0 | } |
1238 | | |
1239 | | const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( |
1240 | 154k | const TargetRegisterClass *SRC) const { |
1241 | 154k | switch (getRegSizeInBits(*SRC)) { |
1242 | 129k | case 32: |
1243 | 129k | return &AMDGPU::VGPR_32RegClass; |
1244 | 20.0k | case 64: |
1245 | 20.0k | return &AMDGPU::VReg_64RegClass; |
1246 | 0 | case 96: |
1247 | 0 | return &AMDGPU::VReg_96RegClass; |
1248 | 4.65k | case 128: |
1249 | 4.65k | return &AMDGPU::VReg_128RegClass; |
1250 | 53 | case 256: |
1251 | 53 | return &AMDGPU::VReg_256RegClass; |
1252 | 51 | case 512: |
1253 | 51 | return &AMDGPU::VReg_512RegClass; |
1254 | 0 | default: |
1255 | 0 | llvm_unreachable("Invalid register class size"); |
1256 | 0 | } |
1257 | 0 | } |
1258 | | |
1259 | | const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( |
1260 | 1.76k | const TargetRegisterClass *VRC) const { |
1261 | 1.76k | switch (getRegSizeInBits(*VRC)) { |
1262 | 1.27k | case 32: |
1263 | 1.27k | return &AMDGPU::SGPR_32RegClass; |
1264 | 473 | case 64: |
1265 | 473 | return &AMDGPU::SReg_64RegClass; |
1266 | 8 | case 128: |
1267 | 8 | return &AMDGPU::SReg_128RegClass; |
1268 | 2 | case 256: |
1269 | 2 | return &AMDGPU::SReg_256RegClass; |
1270 | 0 | case 512: |
1271 | 0 | return &AMDGPU::SReg_512RegClass; |
1272 | 0 | default: |
1273 | 0 | llvm_unreachable("Invalid register class size"); |
1274 | 0 | } |
1275 | 0 | } |
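
The two mappings above are size-preserving near-inverses; a short round-trip sketch (the wrapper function is illustrative, TRI is an SIRegisterInfo instance):

// Moving a 64-bit scalar class to vector registers and back (sketch).
void roundTripClasses(const SIRegisterInfo &TRI) {
  const TargetRegisterClass *VRC =
      TRI.getEquivalentVGPRClass(&AMDGPU::SReg_64RegClass); // &VReg_64RegClass
  const TargetRegisterClass *SRC =
      TRI.getEquivalentSGPRClass(VRC);                      // &SReg_64RegClass
  (void)SRC;
}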
1276 | | |
1277 | | const TargetRegisterClass *SIRegisterInfo::getSubRegClass( |
1278 | 335k | const TargetRegisterClass *RC, unsigned SubIdx) const { |
1279 | 335k | if (SubIdx == AMDGPU::NoSubRegister) |
1280 | 285k | return RC; |
1281 | 49.8k | |
1282 | 49.8k | // We can assume that each lane corresponds to one 32-bit register. |
1283 | 49.8k | unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes(); |
1284 | 49.8k | if (isSGPRClass(RC)) {
1285 | 22.8k | switch (Count) { |
1286 | 22.8k | case 1: |
1287 | 22.8k | return &AMDGPU::SGPR_32RegClass; |
1288 | 0 | case 2: |
1289 | 0 | return &AMDGPU::SReg_64RegClass; |
1290 | 0 | case 4: |
1291 | 0 | return &AMDGPU::SReg_128RegClass; |
1292 | 0 | case 8: |
1293 | 0 | return &AMDGPU::SReg_256RegClass; |
1294 | 0 | case 16: /* fall-through */ |
1295 | 0 | default: |
1296 | 0 | llvm_unreachable("Invalid sub-register class size"); |
1297 | 49.8k | } |
1298 | 26.9k | } else { |
1299 | 26.9k | switch (Count) { |
1300 | 26.9k | case 1: |
1301 | 26.9k | return &AMDGPU::VGPR_32RegClass; |
1302 | 46 | case 2: |
1303 | 46 | return &AMDGPU::VReg_64RegClass; |
1304 | 0 | case 3: |
1305 | 0 | return &AMDGPU::VReg_96RegClass; |
1306 | 0 | case 4: |
1307 | 0 | return &AMDGPU::VReg_128RegClass; |
1308 | 0 | case 8: |
1309 | 0 | return &AMDGPU::VReg_256RegClass; |
1310 | 0 | case 16: /* fall-through */ |
1311 | 0 | default: |
1312 | 0 | llvm_unreachable("Invalid sub-register class size"); |
1313 | 0 | } |
1314 | 0 | } |
1315 | 335k | } |
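
Because each lane of the sub-register lane mask stands for one 32-bit register, the lane count alone selects the class; for example (sketch; sub0 covers one lane, sub0_sub1 covers two):

void subRegClassExamples(const SIRegisterInfo &TRI) {
  const TargetRegisterClass *One =
      TRI.getSubRegClass(&AMDGPU::VReg_128RegClass, AMDGPU::sub0);      // VGPR_32
  const TargetRegisterClass *Two =
      TRI.getSubRegClass(&AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1); // VReg_64
  (void)One; (void)Two;
}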
1316 | | |
1317 | | bool SIRegisterInfo::shouldRewriteCopySrc( |
1318 | | const TargetRegisterClass *DefRC, |
1319 | | unsigned DefSubReg, |
1320 | | const TargetRegisterClass *SrcRC, |
1321 | 355k | unsigned SrcSubReg) const { |
1322 | 355k | // We want to prefer the smallest register class possible, so we don't want to |
1323 | 355k | // stop and rewrite on anything that looks like a subregister |
1324 | 355k | // extract. Operations mostly don't care about the super register class, so we |
1325 | 355k | // only want to stop on the most basic of copies between the same register |
1326 | 355k | // class. |
1327 | 355k | // |
1328 | 355k | // e.g. if we have something like |
1329 | 355k | // vreg0 = ... |
1330 | 355k | // vreg1 = ... |
1331 | 355k | // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1
1332 | 355k | // vreg3 = COPY vreg2:sub0
1333 | 355k | // |
1334 | 355k | // We want to look through the COPY to find: |
1335 | 355k | // => vreg3 = COPY vreg0 |
1336 | 355k | |
1337 | 355k | // Plain copy. |
1338 | 355k | return getCommonSubClass(DefRC, SrcRC) != nullptr; |
1339 | 355k | } |
1340 | | |
1341 | | /// \brief Returns a register that is not used at any point in the function. |
1342 | | /// If all registers are used, then this function will return |
1343 | | /// AMDGPU::NoRegister.
1344 | | unsigned |
1345 | | SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, |
1346 | | const TargetRegisterClass *RC, |
1347 | 133 | const MachineFunction &MF) const { |
1348 | 133 | |
1349 | 133 | for (unsigned Reg : *RC) |
1350 | 4.21k | if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
1351 | 129 | return Reg; |
1352 | 4 | return AMDGPU::NoRegister; |
1353 | 4 | } |
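
A typical use of this scan is picking a scratch register the function never touches; a sketch with the wrapper name as an illustrative assumption:

unsigned pickScratchVGPR(const SIRegisterInfo &TRI,
                         const MachineRegisterInfo &MRI,
                         const MachineFunction &MF) {
  unsigned Scratch =
      TRI.findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
  // AMDGPU::NoRegister means every allocatable VGPR is already used, so the
  // caller must spill or scavenge instead.
  return Scratch;
}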
1354 | | |
1355 | | ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, |
1356 | 5.44k | unsigned EltSize) const { |
1357 | 5.44k | if (EltSize == 4) {
1358 | 5.28k | static const int16_t Sub0_15[] = { |
1359 | 5.28k | AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, |
1360 | 5.28k | AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, |
1361 | 5.28k | AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, |
1362 | 5.28k | AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, |
1363 | 5.28k | }; |
1364 | 5.28k | |
1365 | 5.28k | static const int16_t Sub0_7[] = { |
1366 | 5.28k | AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, |
1367 | 5.28k | AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, |
1368 | 5.28k | }; |
1369 | 5.28k | |
1370 | 5.28k | static const int16_t Sub0_3[] = { |
1371 | 5.28k | AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, |
1372 | 5.28k | }; |
1373 | 5.28k | |
1374 | 5.28k | static const int16_t Sub0_2[] = { |
1375 | 5.28k | AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, |
1376 | 5.28k | }; |
1377 | 5.28k | |
1378 | 5.28k | static const int16_t Sub0_1[] = { |
1379 | 5.28k | AMDGPU::sub0, AMDGPU::sub1, |
1380 | 5.28k | }; |
1381 | 5.28k | |
1382 | 5.28k | switch (AMDGPU::getRegBitWidth(*RC->MC)) { |
1383 | 906 | case 32: |
1384 | 906 | return {}; |
1385 | 4.13k | case 64: |
1386 | 4.13k | return makeArrayRef(Sub0_1); |
1387 | 0 | case 96: |
1388 | 0 | return makeArrayRef(Sub0_2); |
1389 | 163 | case 128: |
1390 | 163 | return makeArrayRef(Sub0_3); |
1391 | 60 | case 256: |
1392 | 60 | return makeArrayRef(Sub0_7); |
1393 | 21 | case 512: |
1394 | 21 | return makeArrayRef(Sub0_15); |
1395 | 0 | default: |
1396 | 0 | llvm_unreachable("unhandled register size"); |
1397 | 158 | } |
1398 | 158 | } |
1399 | 158 | |
1400 | 158 | if (EltSize == 8) {
1401 | 152 | static const int16_t Sub0_15_64[] = { |
1402 | 152 | AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, |
1403 | 152 | AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, |
1404 | 152 | AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, |
1405 | 152 | AMDGPU::sub12_sub13, AMDGPU::sub14_sub15 |
1406 | 152 | }; |
1407 | 152 | |
1408 | 152 | static const int16_t Sub0_7_64[] = { |
1409 | 152 | AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, |
1410 | 152 | AMDGPU::sub4_sub5, AMDGPU::sub6_sub7 |
1411 | 152 | }; |
1412 | 152 | |
1413 | 152 | |
1414 | 152 | static const int16_t Sub0_3_64[] = { |
1415 | 152 | AMDGPU::sub0_sub1, AMDGPU::sub2_sub3 |
1416 | 152 | }; |
1417 | 152 | |
1418 | 152 | switch (AMDGPU::getRegBitWidth(*RC->MC)) { |
1419 | 16 | case 64: |
1420 | 16 | return {}; |
1421 | 136 | case 128: |
1422 | 136 | return makeArrayRef(Sub0_3_64); |
1423 | 0 | case 256: |
1424 | 0 | return makeArrayRef(Sub0_7_64); |
1425 | 0 | case 512: |
1426 | 0 | return makeArrayRef(Sub0_15_64); |
1427 | 0 | default: |
1428 | 0 | llvm_unreachable("unhandled register size"); |
1429 | 6 | } |
1430 | 6 | } |
1431 | 6 | |
1432 | 158 | assert(EltSize == 16 && "unhandled register spill split size"); |
1433 | 6 | |
1434 | 6 | static const int16_t Sub0_15_128[] = { |
1435 | 6 | AMDGPU::sub0_sub1_sub2_sub3, |
1436 | 6 | AMDGPU::sub4_sub5_sub6_sub7, |
1437 | 6 | AMDGPU::sub8_sub9_sub10_sub11, |
1438 | 6 | AMDGPU::sub12_sub13_sub14_sub15 |
1439 | 6 | }; |
1440 | 6 | |
1441 | 6 | static const int16_t Sub0_7_128[] = { |
1442 | 6 | AMDGPU::sub0_sub1_sub2_sub3, |
1443 | 6 | AMDGPU::sub4_sub5_sub6_sub7 |
1444 | 6 | }; |
1445 | 6 | |
1446 | 6 | switch (AMDGPU::getRegBitWidth(*RC->MC)) { |
1447 | 4 | case 128: |
1448 | 4 | return {}; |
1449 | 2 | case 256: |
1450 | 2 | return makeArrayRef(Sub0_7_128); |
1451 | 0 | case 512: |
1452 | 0 | return makeArrayRef(Sub0_15_128); |
1453 | 0 | default: |
1454 | 0 | llvm_unreachable("unhandled register size"); |
1455 | 0 | } |
1456 | 0 | } |
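
EltSize is in bytes, so the three branches above correspond to 32-bit, 64-bit, and 128-bit split elements; an empty result means the class already matches the element size and needs no split. A worked example (sketch; TRI is an SIRegisterInfo instance):

void splitExamples(const SIRegisterInfo &TRI) {
  ArrayRef<int16_t> Dwords =
      TRI.getRegSplitParts(&AMDGPU::VReg_128RegClass, 4);  // {sub0..sub3}
  ArrayRef<int16_t> Quads =
      TRI.getRegSplitParts(&AMDGPU::VReg_128RegClass, 16); // {}: no split needed
  (void)Dwords; (void)Quads;
}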
1457 | | |
1458 | | const TargetRegisterClass* |
1459 | | SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, |
1460 | 3.41M | unsigned Reg) const { |
1461 | 3.41M | if (TargetRegisterInfo::isVirtualRegister(Reg)) |
1462 | 112k | return MRI.getRegClass(Reg); |
1463 | 3.30M | |
1464 | 3.30M | return getPhysRegClass(Reg); |
1465 | 3.30M | } |
1466 | | |
1467 | | bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, |
1468 | 3.25M | unsigned Reg) const { |
1469 | 3.25M | return hasVGPRs(getRegClassForReg(MRI, Reg)); |
1470 | 3.25M | } |
1471 | | |
1472 | | bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, |
1473 | | const TargetRegisterClass *SrcRC, |
1474 | | unsigned SubReg, |
1475 | | const TargetRegisterClass *DstRC, |
1476 | | unsigned DstSubReg, |
1477 | 145k | const TargetRegisterClass *NewRC) const { |
1478 | 145k | unsigned SrcSize = getRegSizeInBits(*SrcRC); |
1479 | 145k | unsigned DstSize = getRegSizeInBits(*DstRC); |
1480 | 145k | unsigned NewSize = getRegSizeInBits(*NewRC); |
1481 | 145k | |
1482 | 145k | // Do not increase the size of registers beyond a dword; we would need to
1483 | 145k | // allocate adjacent registers and constrain regalloc more than needed.
1484 | 145k | |
1485 | 145k | // Always allow dword coalescing. |
1486 | 145k | if (SrcSize <= 32 || DstSize <= 32)
1487 | 104k | return true; |
1488 | 41.2k | |
1489 | 41.2k | return NewSize <= DstSize || NewSize <= SrcSize;
1490 | 145k | } |
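
The effect of the two checks above, restated as a pure size predicate (a sketch that mirrors the branches rather than calling shouldCoalesce, since building a real MachineInstr here would be out of scope):

#include <cassert>

static bool allowCoalesce(unsigned Src, unsigned Dst, unsigned New) {
  if (Src <= 32 || Dst <= 32)
    return true;                   // dword operands: always coalesce
  return New <= Dst || New <= Src; // reject only if the result outgrows both
}

void coalesceExamples() {
  assert(allowCoalesce(32, 128, 128));  // dword source
  assert(allowCoalesce(128, 64, 128));  // result no wider than the source
  assert(!allowCoalesce(64, 64, 128));  // would widen past both operands
}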
1491 | | |
1492 | | unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, |
1493 | 94.0k | MachineFunction &MF) const { |
1494 | 94.0k | |
1495 | 94.0k | const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); |
1496 | 94.0k | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1497 | 94.0k | |
1498 | 94.0k | unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), |
1499 | 94.0k | *MF.getFunction()); |
1500 | 94.0k | switch (RC->getID()) { |
1501 | 0 | default: |
1502 | 0 | return AMDGPURegisterInfo::getRegPressureLimit(RC, MF); |
1503 | 47.0k | case AMDGPU::VGPR_32RegClassID: |
1504 | 47.0k | return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); |
1505 | 47.0k | case AMDGPU::SGPR_32RegClassID: |
1506 | 47.0k | return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); |
1507 | 0 | } |
1508 | 0 | } |
1509 | | |
1510 | | unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, |
1511 | 871k | unsigned Idx) const { |
1512 | 871k | if (Idx == getVGPRPressureSet()) |
1513 | 47.0k | return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, |
1514 | 47.0k | const_cast<MachineFunction &>(MF)); |
1515 | 824k | |
1516 | 824k | if (Idx == getSGPRPressureSet())
1517 | 47.0k | return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, |
1518 | 47.0k | const_cast<MachineFunction &>(MF)); |
1519 | 777k | |
1520 | 777k | return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx); |
1521 | 777k | } |
1522 | | |
1523 | 3.76M | const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { |
1524 | 3.76M | static const int Empty[] = { -1 }; |
1525 | 3.76M | |
1526 | 3.76M | if (hasRegUnit(AMDGPU::M0, RegUnit)) |
1527 | 1.81k | return Empty; |
1528 | 3.76M | return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit); |
1529 | 3.76M | } |
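
Consumers of these lists rely on the -1 terminator visible in Empty above: they walk the array until the sentinel, so a register unit of M0 simply contributes to no pressure set. A sketch of such a walk (the counting helper is an illustrative assumption, not tree code):

unsigned countPressureSets(const SIRegisterInfo &TRI, unsigned RegUnit) {
  const int *PSets = TRI.getRegUnitPressureSets(RegUnit);
  unsigned N = 0;
  for (unsigned I = 0; PSets[I] != -1; ++I)
    ++N; // each entry is a pressure-set ID this unit belongs to
  return N; // 0 for any unit of AMDGPU::M0
}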