/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | /// \file |
11 | | /// \brief Implements the AMDGPU specific subclass of TargetSubtarget. |
12 | | // |
13 | | //===----------------------------------------------------------------------===// |
14 | | |
15 | | #include "AMDGPUSubtarget.h" |
16 | | #include "AMDGPU.h" |
17 | | #include "AMDGPUTargetMachine.h" |
18 | | #include "AMDGPUCallLowering.h" |
19 | | #include "AMDGPUInstructionSelector.h" |
20 | | #include "AMDGPULegalizerInfo.h" |
21 | | #include "AMDGPURegisterBankInfo.h" |
22 | | #include "SIMachineFunctionInfo.h" |
23 | | #include "llvm/ADT/SmallString.h" |
24 | | #include "llvm/CodeGen/MachineScheduler.h" |
25 | | #include "llvm/IR/MDBuilder.h" |
26 | | #include "llvm/Target/TargetFrameLowering.h" |
27 | | #include <algorithm> |
28 | | |
29 | | using namespace llvm; |
30 | | |
31 | | #define DEBUG_TYPE "amdgpu-subtarget" |
32 | | |
33 | | #define GET_SUBTARGETINFO_TARGET_DESC |
34 | | #define GET_SUBTARGETINFO_CTOR |
35 | | #include "AMDGPUGenSubtargetInfo.inc" |
36 | | |
37 | 2.05k | AMDGPUSubtarget::~AMDGPUSubtarget() = default; |
38 | | |
/// Parse the subtarget feature string and apply per-target fixups.
///
/// Builds a full feature string from hard-coded defaults, HSA-specific
/// additions, and the user-provided \p FS (appended last so explicit user
/// settings win), then patches up fields whose defaults depend on the
/// generation or triple. Returns *this so the ctor can chain on it.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // User features come last so they override the defaults above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}
92 | | |
/// Constructor. All feature flags are initialized to their "off"/zero
/// defaults; ParseSubtargetFeatures (called via
/// initializeSubtargetDependencies below) turns on the ones implied by the
/// CPU name and feature string.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples start at SOUTHERN_ISLANDS; anything else is R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    // Performance characteristics.
    FastFMAF32(false),
    HalfRate64Ops(false),

    // FP mode / memory access defaults.
    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    // Optimization toggles.
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    // ISA capability flags (set from the feature string).
    FP64(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),

    // R600-only flags.
    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}
169 | | |
170 | | unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, |
171 | 15.2k | const Function &F) const { |
172 | 15.2k | if (NWaves == 1) |
173 | 26 | return getLocalMemorySize(); |
174 | 15.2k | unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; |
175 | 15.2k | unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); |
176 | 15.2k | unsigned MaxWaves = getMaxWavesPerEU(); |
177 | 15.2k | return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; |
178 | 15.2k | } |
179 | | |
180 | | unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, |
181 | 125k | const Function &F) const { |
182 | 125k | unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; |
183 | 125k | unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); |
184 | 125k | unsigned MaxWaves = getMaxWavesPerEU(); |
185 | 125k | unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; |
186 | 125k | unsigned NumWaves = Limit / (Bytes ? Bytes2.15k : 1u123k ); |
187 | 125k | NumWaves = std::min(NumWaves, MaxWaves); |
188 | 125k | NumWaves = std::max(NumWaves, 1u); |
189 | 125k | return NumWaves; |
190 | 125k | } |
191 | | |
/// Return the {minimum, maximum} flat work group size for \p F.
///
/// Starts from calling-convention-based defaults, lets the legacy
/// "amdgpu-max-work-group-size" attribute narrow the maximum, then honors an
/// explicit "amdgpu-flat-work-group-size" pair — but falls back to the
/// defaults whenever the requested pair is inconsistent or outside the
/// subtarget's supported range.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  // Keep the default minimum consistent if the attribute lowered the maximum.
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
223 | | |
/// Return the {minimum, maximum} number of waves per execution unit for \p F.
///
/// The "amdgpu-waves-per-eu" attribute is honored only if it is internally
/// consistent, within subtarget limits, and compatible with any explicitly
/// requested flat work group size; otherwise the defaults are returned.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // (Requested.second == 0 means "no maximum requested".)
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
271 | | |
/// Attach !range metadata to \p I bounding a local-ID or local-size query.
///
/// \p I is expected to be a call to a workitem-id / local-size intrinsic.
/// The range starts from the kernel's maximum flat work group size and is
/// narrowed per-dimension if the kernel carries !reqd_work_group_size
/// metadata. Returns true if metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Determine which dimension (x/y/z) the intrinsic queries, and whether
      // it is an ID query (result in [0, size)) or a size query.
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
333 | | |
/// R600 subtarget constructor: delegates feature parsing to the common
/// AMDGPUSubtarget base and wires up R600 instruction/frame/lowering info.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
340 | | |
341 | | SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, |
342 | | const TargetMachine &TM) |
343 | | : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this), |
344 | | FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), |
345 | 1.81k | TLInfo(TM, *this) { |
346 | 1.81k | CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); |
347 | 1.81k | Legalizer.reset(new AMDGPULegalizerInfo()); |
348 | 1.81k | |
349 | 1.81k | RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); |
350 | 1.81k | InstSelector.reset(new AMDGPUInstructionSelector( |
351 | 1.81k | *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()))); |
352 | 1.81k | } |
353 | | |
354 | | void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, |
355 | 35.9k | unsigned NumRegionInstrs) const { |
356 | 35.9k | // Track register pressure so the scheduler can try to decrease |
357 | 35.9k | // pressure once register usage is above the threshold defined by |
358 | 35.9k | // SIRegisterInfo::getRegPressureSetLimit() |
359 | 35.9k | Policy.ShouldTrackPressure = true; |
360 | 35.9k | |
361 | 35.9k | // Enabling both top down and bottom up scheduling seems to give us less |
362 | 35.9k | // register spills than just using one of these approaches on its own. |
363 | 35.9k | Policy.OnlyTopDown = false; |
364 | 35.9k | Policy.OnlyBottomUp = false; |
365 | 35.9k | |
366 | 35.9k | // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. |
367 | 35.9k | if (!enableSIScheduler()) |
368 | 35.9k | Policy.ShouldTrackLaneMasks = true; |
369 | 35.9k | } |
370 | | |
371 | 17.9k | bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { |
372 | 17.5k | return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); |
373 | 17.9k | } |
374 | | |
375 | | unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF, |
376 | 1.74k | unsigned ExplicitArgBytes) const { |
377 | 1.74k | unsigned ImplicitBytes = getImplicitArgNumBytes(MF); |
378 | 1.74k | if (ImplicitBytes == 0) |
379 | 1.65k | return ExplicitArgBytes; |
380 | 95 | |
381 | 95 | unsigned Alignment = getAlignmentForImplicitArgPtr(); |
382 | 95 | return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; |
383 | 95 | } |
384 | | |
385 | 981 | unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { |
386 | 981 | if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS981 ) { |
387 | 495 | if (SGPRs <= 80) |
388 | 412 | return 10; |
389 | 83 | if (83 SGPRs <= 8883 ) |
390 | 0 | return 9; |
391 | 83 | if (83 SGPRs <= 10083 ) |
392 | 11 | return 8; |
393 | 72 | return 7; |
394 | 72 | } |
395 | 486 | if (486 SGPRs <= 48486 ) |
396 | 358 | return 10; |
397 | 128 | if (128 SGPRs <= 56128 ) |
398 | 10 | return 9; |
399 | 118 | if (118 SGPRs <= 64118 ) |
400 | 4 | return 8; |
401 | 114 | if (114 SGPRs <= 72114 ) |
402 | 29 | return 7; |
403 | 85 | if (85 SGPRs <= 8085 ) |
404 | 0 | return 6; |
405 | 85 | return 5; |
406 | 85 | } |
407 | | |
408 | 981 | unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { |
409 | 981 | if (VGPRs <= 24) |
410 | 118 | return 10; |
411 | 863 | if (863 VGPRs <= 28863 ) |
412 | 22 | return 9; |
413 | 841 | if (841 VGPRs <= 32841 ) |
414 | 217 | return 8; |
415 | 624 | if (624 VGPRs <= 36624 ) |
416 | 271 | return 7; |
417 | 353 | if (353 VGPRs <= 40353 ) |
418 | 35 | return 6; |
419 | 318 | if (318 VGPRs <= 48318 ) |
420 | 16 | return 5; |
421 | 302 | if (302 VGPRs <= 64302 ) |
422 | 37 | return 4; |
423 | 265 | if (265 VGPRs <= 84265 ) |
424 | 103 | return 3; |
425 | 162 | if (162 VGPRs <= 128162 ) |
426 | 26 | return 2; |
427 | 136 | return 1; |
428 | 136 | } |
429 | | |
430 | 107k | unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { |
431 | 107k | const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); |
432 | 107k | if (MFI.hasFlatScratchInit()107k ) { |
433 | 2.77k | if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) |
434 | 1.59k | return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). |
435 | 1.17k | if (1.17k getGeneration() == AMDGPUSubtarget::SEA_ISLANDS1.17k ) |
436 | 910 | return 4; // FLAT_SCRATCH, VCC (in that order). |
437 | 104k | } |
438 | 104k | |
439 | 104k | if (104k isXNACKEnabled()104k ) |
440 | 4.69k | return 4; // XNACK, VCC (in that order). |
441 | 100k | return 2; // VCC. |
442 | 100k | } |
443 | | |
/// Compute the maximum number of SGPRs this function may use, honoring the
/// waves-per-EU requirement, the optional "amdgpu-num-sgpr" attribute, the
/// SGPR-init hardware bug, and the registers counted by
/// getReservedNumSGPRs().
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 below means "ignore the attribute".)
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware bug workaround: the affected chips need a fixed SGPR count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
493 | | |
/// Compute the maximum number of VGPRs this function may use, honoring the
/// waves-per-EU requirement and the optional "amdgpu-num-vgpr" attribute,
/// minus the registers counted by getReservedNumVGPRs(). Mirrors the SGPR
/// logic in getMaxNumSGPRs() above.
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 below means "ignore the attribute".)
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}
527 | | |
/// DAG mutation that glues runs of adjacent same-kind memory operations
/// together so the scheduler cannot move them apart.
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII; // Used to classify instructions (VMEM/FLAT/SMRD/DS).

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    // SUa tracks the previous memory operation seen; nullptr after any
    // non-memory instruction breaks the run.
    SUnit *SUa = nullptr;
    // Search for two consequent memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      // Only cluster when both instructions belong to the same memory class.
      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        // Redirect SU's other predecessors to also precede SUa, so nothing
        // can be scheduled between the pair.
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        // Likewise make SUa's other successors follow SU.
        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
576 | | |
577 | | void SISubtarget::getPostRAMutations( |
578 | 11.4k | std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { |
579 | 11.4k | Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo)); |
580 | 11.4k | } |