/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Line | Count | Source (jump to first uncovered line) |
1 | | //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //==-----------------------------------------------------------------------===// |
9 | | // |
10 | | /// \file |
11 | | /// \brief AMDGPU specific subclass of TargetSubtarget. |
12 | | // |
13 | | //===----------------------------------------------------------------------===// |
14 | | |
15 | | #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H |
16 | | #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H |
17 | | |
18 | | #include "AMDGPU.h" |
19 | | #include "AMDGPUCallLowering.h" |
20 | | #include "R600FrameLowering.h" |
21 | | #include "R600ISelLowering.h" |
22 | | #include "R600InstrInfo.h" |
23 | | #include "SIFrameLowering.h" |
24 | | #include "SIISelLowering.h" |
25 | | #include "SIInstrInfo.h" |
26 | | #include "SIMachineFunctionInfo.h" |
27 | | #include "Utils/AMDGPUBaseInfo.h" |
28 | | #include "llvm/ADT/Triple.h" |
29 | | #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" |
30 | | #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" |
31 | | #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" |
32 | | #include "llvm/CodeGen/MachineFunction.h" |
33 | | #include "llvm/CodeGen/SelectionDAGTargetInfo.h" |
34 | | #include "llvm/MC/MCInstrItineraries.h" |
35 | | #include "llvm/Support/MathExtras.h" |
36 | | #include <cassert> |
37 | | #include <cstdint> |
38 | | #include <memory> |
39 | | #include <utility> |
40 | | |
41 | | #define GET_SUBTARGETINFO_HEADER |
42 | | #include "AMDGPUGenSubtargetInfo.inc" |
43 | | |
44 | | namespace llvm { |
45 | | |
46 | | class StringRef; |
47 | | |
48 | | class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { |
49 | | public: |
50 | | enum Generation { |
51 | | R600 = 0, |
52 | | R700, |
53 | | EVERGREEN, |
54 | | NORTHERN_ISLANDS, |
55 | | SOUTHERN_ISLANDS, |
56 | | SEA_ISLANDS, |
57 | | VOLCANIC_ISLANDS, |
58 | | GFX9, |
59 | | }; |
60 | | |
61 | | enum { |
62 | | ISAVersion0_0_0, |
63 | | ISAVersion6_0_0, |
64 | | ISAVersion6_0_1, |
65 | | ISAVersion7_0_0, |
66 | | ISAVersion7_0_1, |
67 | | ISAVersion7_0_2, |
68 | | ISAVersion7_0_3, |
69 | | ISAVersion8_0_0, |
70 | | ISAVersion8_0_1, |
71 | | ISAVersion8_0_2, |
72 | | ISAVersion8_0_3, |
73 | | ISAVersion8_0_4, |
74 | | ISAVersion8_1_0, |
75 | | ISAVersion9_0_0, |
76 | | ISAVersion9_0_1, |
77 | | ISAVersion9_0_2, |
78 | | ISAVersion9_0_3 |
79 | | }; |
80 | | |
81 | | enum TrapHandlerAbi { |
82 | | TrapHandlerAbiNone = 0, |
83 | | TrapHandlerAbiHsa = 1 |
84 | | }; |
85 | | |
86 | | enum TrapID { |
87 | | TrapIDHardwareReserved = 0, |
88 | | TrapIDHSADebugTrap = 1, |
89 | | TrapIDLLVMTrap = 2, |
90 | | TrapIDLLVMDebugTrap = 3, |
91 | | TrapIDDebugBreakpoint = 7, |
92 | | TrapIDDebugReserved8 = 8, |
93 | | TrapIDDebugReservedFE = 0xfe, |
94 | | TrapIDDebugReservedFF = 0xff |
95 | | }; |
96 | | |
97 | | enum TrapRegValues { |
98 | | LLVMTrapHandlerRegValue = 1 |
99 | | }; |
100 | | |
101 | | protected: |
102 | | // Basic subtarget description. |
103 | | Triple TargetTriple; |
104 | | Generation Gen; |
105 | | unsigned IsaVersion; |
106 | | unsigned WavefrontSize; |
107 | | int LocalMemorySize; |
108 | | int LDSBankCount; |
109 | | unsigned MaxPrivateElementSize; |
110 | | |
111 | | // Possibly statically set by tablegen, but may want to be overridden. |
112 | | bool FastFMAF32; |
113 | | bool HalfRate64Ops; |
114 | | |
115 | | // Dynamically set bits that enable features. |
116 | | bool FP32Denormals; |
117 | | bool FP64FP16Denormals; |
118 | | bool FPExceptions; |
119 | | bool DX10Clamp; |
120 | | bool FlatForGlobal; |
121 | | bool AutoWaitcntBeforeBarrier; |
122 | | bool UnalignedScratchAccess; |
123 | | bool UnalignedBufferAccess; |
124 | | bool HasApertureRegs; |
125 | | bool EnableXNACK; |
126 | | bool TrapHandler; |
127 | | bool DebuggerInsertNops; |
128 | | bool DebuggerReserveRegs; |
129 | | bool DebuggerEmitPrologue; |
130 | | |
131 | | // Used as options. |
132 | | bool EnableVGPRSpilling; |
133 | | bool EnablePromoteAlloca; |
134 | | bool EnableLoadStoreOpt; |
135 | | bool EnableUnsafeDSOffsetFolding; |
136 | | bool EnableSIScheduler; |
137 | | bool DumpCode; |
138 | | |
139 | | // Subtarget properties statically set by tablegen |
140 | | bool FP64; |
141 | | bool IsGCN; |
142 | | bool GCN3Encoding; |
143 | | bool CIInsts; |
144 | | bool GFX9Insts; |
145 | | bool SGPRInitBug; |
146 | | bool HasSMemRealTime; |
147 | | bool Has16BitInsts; |
148 | | bool HasIntClamp; |
149 | | bool HasVOP3PInsts; |
150 | | bool HasMovrel; |
151 | | bool HasVGPRIndexMode; |
152 | | bool HasScalarStores; |
153 | | bool HasInv2PiInlineImm; |
154 | | bool HasSDWA; |
155 | | bool HasSDWAOmod; |
156 | | bool HasSDWAScalar; |
157 | | bool HasSDWASdst; |
158 | | bool HasSDWAMac; |
159 | | bool HasSDWAOutModsVOPC; |
160 | | bool HasDPP; |
161 | | bool FlatAddressSpace; |
162 | | bool FlatInstOffsets; |
163 | | bool FlatGlobalInsts; |
164 | | bool FlatScratchInsts; |
165 | | bool AddNoCarryInsts; |
166 | | bool R600ALUInst; |
167 | | bool CaymanISA; |
168 | | bool CFALUBug; |
169 | | bool HasVertexCache; |
170 | | short TexVTXClauseSize; |
171 | | bool ScalarizeGlobal; |
172 | | |
173 | | // Dummy feature to use for assembler in tablegen. |
174 | | bool FeatureDisable; |
175 | | |
176 | | InstrItineraryData InstrItins; |
177 | | SelectionDAGTargetInfo TSInfo; |
178 | | AMDGPUAS AS; |
179 | | |
180 | | public: |
181 | | AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, |
182 | | const TargetMachine &TM); |
183 | | ~AMDGPUSubtarget() override; |
184 | | |
185 | | AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT, |
186 | | StringRef GPU, StringRef FS); |
187 | | |
188 | | const AMDGPUInstrInfo *getInstrInfo() const override = 0; |
189 | | const AMDGPUFrameLowering *getFrameLowering() const override = 0; |
190 | | const AMDGPUTargetLowering *getTargetLowering() const override = 0; |
191 | | const AMDGPURegisterInfo *getRegisterInfo() const override = 0; |
192 | | |
193 | 49.9k | const InstrItineraryData *getInstrItineraryData() const override { |
194 | 49.9k | return &InstrItins; |
195 | 49.9k | } |
196 | | |
197 | | // Nothing implemented, just prevent crashes on use. |
198 | 22.5k | const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { |
199 | 22.5k | return &TSInfo; |
200 | 22.5k | } |
201 | | |
202 | | void ParseSubtargetFeatures(StringRef CPU, StringRef FS); |
203 | | |
204 | 225k | bool isAmdHsaOS() const { |
205 | 225k | return TargetTriple.getOS() == Triple::AMDHSA; |
206 | 225k | } |
207 | | |
208 | 105k | bool isMesa3DOS() const { |
209 | 105k | return TargetTriple.getOS() == Triple::Mesa3D; |
210 | 105k | } |
211 | | |
212 | 1.66k | bool isOpenCLEnv() const { |
213 | 1.66k | return TargetTriple.getEnvironment() == Triple::OpenCL || |
214 | 1.65k | TargetTriple.getEnvironmentName() == "amdgizcl"; |
215 | 1.66k | } |
216 | | |
217 | 3.44M | Generation getGeneration() const { |
218 | 3.44M | return Gen; |
219 | 3.44M | } |
220 | | |
221 | 387k | unsigned getWavefrontSize() const { |
222 | 387k | return WavefrontSize; |
223 | 387k | } |
224 | | |
225 | 169k | int getLocalMemorySize() const { |
226 | 169k | return LocalMemorySize; |
227 | 169k | } |
228 | | |
229 | 238 | int getLDSBankCount() const { |
230 | 238 | return LDSBankCount; |
231 | 238 | } |
232 | | |
233 | 5.05k | unsigned getMaxPrivateElementSize() const { |
234 | 5.05k | return MaxPrivateElementSize; |
235 | 5.05k | } |
236 | | |
237 | 121k | AMDGPUAS getAMDGPUAS() const { |
238 | 121k | return AS; |
239 | 121k | } |
240 | | |
241 | 396k | bool has16BitInsts() const { |
242 | 396k | return Has16BitInsts; |
243 | 396k | } |
244 | | |
245 | 0 | bool hasIntClamp() const { |
246 | 0 | return HasIntClamp; |
247 | 0 | } |
248 | | |
249 | 4.74k | bool hasVOP3PInsts() const { |
250 | 4.74k | return HasVOP3PInsts; |
251 | 4.74k | } |
252 | | |
253 | 0 | bool hasHWFP64() const { |
254 | 0 | return FP64; |
255 | 0 | } |
256 | | |
257 | 427 | bool hasFastFMAF32() const { |
258 | 427 | return FastFMAF32; |
259 | 427 | } |
260 | | |
261 | 48 | bool hasHalfRate64Ops() const { |
262 | 48 | return HalfRate64Ops; |
263 | 48 | } |
264 | | |
265 | 56.6k | bool hasAddr64() const { |
266 | 56.6k | return (getGeneration() < VOLCANIC_ISLANDS); |
267 | 56.6k | } |
268 | | |
269 | 2.82k | bool hasBFE() const { |
270 | 2.82k | return (getGeneration() >= EVERGREEN); |
271 | 2.82k | } |
272 | | |
273 | 2.06k | bool hasBFI() const { |
274 | 2.06k | return (getGeneration() >= EVERGREEN); |
275 | 2.06k | } |
276 | | |
277 | 0 | bool hasBFM() const { |
278 | 0 | return hasBFE(); |
279 | 0 | } |
280 | | |
281 | 4.13k | bool hasBCNT(unsigned Size) const { |
282 | 4.13k | if (Size == 32) |
283 | 2.06k | return (getGeneration() >= EVERGREEN); |
284 | 4.13k | |
285 | 2.06k | if (Size == 64) |
286 | 2.06k | return (getGeneration() >= SOUTHERN_ISLANDS); |
287 | 2.06k | |
288 | 0 | return false; |
289 | 4.13k | } |
290 | | |
291 | 2.80k | bool hasMulU24() const { |
292 | 2.80k | return (getGeneration() >= EVERGREEN); |
293 | 2.80k | } |
294 | | |
295 | 1.50k | bool hasMulI24() const { |
296 | 1.50k | return (getGeneration() >= SOUTHERN_ISLANDS || |
297 | 412 | hasCaymanISA()); |
298 | 1.50k | } |
299 | | |
300 | 2.06k | bool hasFFBL() const { |
301 | 2.06k | return (getGeneration() >= EVERGREEN); |
302 | 2.06k | } |
303 | | |
304 | 2.06k | bool hasFFBH() const { |
305 | 2.06k | return (getGeneration() >= EVERGREEN); |
306 | 2.06k | } |
307 | | |
308 | 9 | bool hasMed3_16() const { |
309 | 9 | return getGeneration() >= GFX9; |
310 | 9 | } |
311 | | |
312 | 218 | bool hasMin3Max3_16() const { |
313 | 218 | return getGeneration() >= GFX9; |
314 | 218 | } |
315 | | |
316 | 1.30k | bool hasMadMixInsts() const { |
317 | 1.30k | return getGeneration() >= GFX9; |
318 | 1.30k | } |
319 | | |
320 | 253 | bool hasCARRY() const { |
321 | 253 | return (getGeneration() >= EVERGREEN); |
322 | 253 | } |
323 | | |
324 | 253 | bool hasBORROW() const { |
325 | 253 | return (getGeneration() >= EVERGREEN); |
326 | 253 | } |
327 | | |
328 | 226k | bool hasCaymanISA() const { |
329 | 226k | return CaymanISA; |
330 | 226k | } |
331 | | |
332 | 36 | TrapHandlerAbi getTrapHandlerAbi() const { |
333 | 36 | return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; |
334 | 36 | } |
335 | | |
336 | 16.8k | bool isPromoteAllocaEnabled() const { |
337 | 16.8k | return EnablePromoteAlloca; |
338 | 16.8k | } |
339 | | |
340 | 1.74k | bool unsafeDSOffsetFoldingEnabled() const { |
341 | 1.74k | return EnableUnsafeDSOffsetFolding; |
342 | 1.74k | } |
343 | | |
344 | 398k | bool dumpCode() const { |
345 | 398k | return DumpCode; |
346 | 398k | } |
347 | | |
348 | | /// Return the amount of LDS that can be used that will not restrict the |
349 | | /// occupancy lower than WaveCount. |
350 | | unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, |
351 | | const Function &) const; |
352 | | |
353 | | /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wavecount if |
354 | | /// the given LDS memory size is the only constraint. |
355 | | unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; |
356 | | |
357 | 6 | unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const { |
358 | 6 | const auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
359 | 6 | return getOccupancyWithLocalMemSize(MFI->getLDSSize(), *MF.getFunction()); |
360 | 6 | } |
361 | | |
362 | 2.37k | bool hasFP16Denormals() const { |
363 | 2.37k | return FP64FP16Denormals; |
364 | 2.37k | } |
365 | | |
366 | 23.9k | bool hasFP32Denormals() const { |
367 | 23.9k | return FP32Denormals; |
368 | 23.9k | } |
369 | | |
370 | 14.2k | bool hasFP64Denormals() const { |
371 | 14.2k | return FP64FP16Denormals; |
372 | 14.2k | } |
373 | | |
374 | 44 | bool supportsMinMaxDenormModes() const { |
375 | 44 | return getGeneration() >= AMDGPUSubtarget::GFX9; |
376 | 44 | } |
377 | | |
378 | 2.06k | bool hasFPExceptions() const { |
379 | 2.06k | return FPExceptions; |
380 | 2.06k | } |
381 | | |
382 | 14.5k | bool enableDX10Clamp() const { |
383 | 14.5k | return DX10Clamp; |
384 | 14.5k | } |
385 | | |
386 | 44.1k | bool enableIEEEBit(const MachineFunction &MF) const { |
387 | 44.1k | return AMDGPU::isCompute(MF.getFunction()->getCallingConv()); |
388 | 44.1k | } |
389 | | |
390 | 74.8k | bool useFlatForGlobal() const { |
391 | 74.8k | return FlatForGlobal; |
392 | 74.8k | } |
393 | | |
394 | 58 | bool hasAutoWaitcntBeforeBarrier() const { |
395 | 58 | return AutoWaitcntBeforeBarrier; |
396 | 58 | } |
397 | | |
398 | 16.3k | bool hasUnalignedBufferAccess() const { |
399 | 16.3k | return UnalignedBufferAccess; |
400 | 16.3k | } |
401 | | |
402 | 16.7k | bool hasUnalignedScratchAccess() const { |
403 | 16.7k | return UnalignedScratchAccess; |
404 | 16.7k | } |
405 | | |
406 | 15.0k | bool hasApertureRegs() const { |
407 | 15.0k | return HasApertureRegs; |
408 | 15.0k | } |
409 | | |
410 | 14.2k | bool isTrapHandlerEnabled() const { |
411 | 14.2k | return TrapHandler; |
412 | 14.2k | } |
413 | | |
414 | 127k | bool isXNACKEnabled() const { |
415 | 127k | return EnableXNACK; |
416 | 127k | } |
417 | | |
418 | 57.5k | bool hasFlatAddressSpace() const { |
419 | 57.5k | return FlatAddressSpace; |
420 | 57.5k | } |
421 | | |
422 | 391k | bool hasFlatInstOffsets() const { |
423 | 391k | return FlatInstOffsets; |
424 | 391k | } |
425 | | |
426 | 74.7k | bool hasFlatGlobalInsts() const { |
427 | 74.7k | return FlatGlobalInsts; |
428 | 74.7k | } |
429 | | |
430 | 0 | bool hasFlatScratchInsts() const { |
431 | 0 | return FlatScratchInsts; |
432 | 0 | } |
433 | | |
434 | 276 | bool hasD16LoadStore() const { |
435 | 276 | return getGeneration() >= GFX9; |
436 | 276 | } |
437 | | |
438 | 0 | bool hasAddNoCarry() const { |
439 | 0 | return AddNoCarryInsts; |
440 | 0 | } |
441 | | |
442 | 91.2k | bool isMesaKernel(const MachineFunction &MF) const { |
443 | 871 | return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv()); |
444 | 91.2k | } |
445 | | |
446 | | // Covers VS/PS/CS graphics shaders |
447 | 14.1k | bool isMesaGfxShader(const MachineFunction &MF) const { |
448 | 13 | return isMesa3DOS() && AMDGPU::isShader(MF.getFunction()->getCallingConv()); |
449 | 14.1k | } |
450 | | |
451 | 100k | bool isAmdCodeObjectV2(const MachineFunction &MF) const { |
452 | 89.5k | return isAmdHsaOS() || isMesaKernel(MF); |
453 | 100k | } |
454 | | |
455 | 572 | bool hasFminFmaxLegacy() const { |
456 | 572 | return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; |
457 | 572 | } |
458 | | |
459 | 68.9k | bool hasSDWA() const { |
460 | 68.9k | return HasSDWA; |
461 | 68.9k | } |
462 | | |
463 | 33.6k | bool hasSDWAOmod() const { |
464 | 33.6k | return HasSDWAOmod; |
465 | 33.6k | } |
466 | | |
467 | 92.8k | bool hasSDWAScalar() const { |
468 | 92.8k | return HasSDWAScalar; |
469 | 92.8k | } |
470 | | |
471 | 126 | bool hasSDWASdst() const { |
472 | 126 | return HasSDWASdst; |
473 | 126 | } |
474 | | |
475 | 1.78k | bool hasSDWAMac() const { |
476 | 1.78k | return HasSDWAMac; |
477 | 1.78k | } |
478 | | |
479 | 87 | bool hasSDWAOutModsVOPC() const { |
480 | 87 | return HasSDWAOutModsVOPC; |
481 | 87 | } |
482 | | |
483 | | /// \brief Returns the offset in bytes from the start of the input buffer |
484 | | /// of the first explicit kernel argument. |
485 | 37.3k | unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { |
486 | 37.3k | return isAmdCodeObjectV2(MF) ? 0 : 36; |
487 | 37.3k | } |
488 | | |
489 | 125 | unsigned getAlignmentForImplicitArgPtr() const { |
490 | 125 | return isAmdHsaOS() ? 8 : 4; |
491 | 125 | } |
492 | | |
493 | 1.74k | unsigned getImplicitArgNumBytes(const MachineFunction &MF) const { |
494 | 1.74k | if (isMesaKernel(MF)) |
495 | 81 | return 16; |
496 | 1.66k | if (isAmdHsaOS() && isOpenCLEnv()) |
497 | 14 | return 32; |
498 | 1.65k | return 0; |
499 | 1.74k | } |
500 | | |
501 | | // Scratch is allocated in 256 dword per wave blocks for the entire |
502 | | // wavefront. When viewed from the perspective of an arbitrary workitem, this |
503 | | // is 4-byte aligned. |
504 | 2.06k | unsigned getStackAlignment() const { |
505 | 2.06k | return 4; |
506 | 2.06k | } |
507 | | |
508 | 52.7k | bool enableMachineScheduler() const override { |
509 | 52.7k | return true; |
510 | 52.7k | } |
511 | | |
512 | 17.3k | bool enableSubRegLiveness() const override { |
513 | 17.3k | return true; |
514 | 17.3k | } |
515 | | |
516 | 368k | void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;} |
517 | 33.0k | bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;} |
518 | | |
519 | | /// \returns Number of execution units per compute unit supported by the |
520 | | /// subtarget. |
521 | 0 | unsigned getEUsPerCU() const { |
522 | 0 | return AMDGPU::IsaInfo::getEUsPerCU(getFeatureBits()); |
523 | 0 | } |
524 | | |
525 | | /// \returns Maximum number of work groups per compute unit supported by the |
526 | | /// subtarget and limited by given \p FlatWorkGroupSize. |
527 | 140k | unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { |
528 | 140k | return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBits(), |
529 | 140k | FlatWorkGroupSize); |
530 | 140k | } |
531 | | |
532 | | /// \returns Maximum number of waves per compute unit supported by the |
533 | | /// subtarget without any kind of limitation. |
534 | 0 | unsigned getMaxWavesPerCU() const { |
535 | 0 | return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits()); |
536 | 0 | } |
537 | | |
538 | | /// \returns Maximum number of waves per compute unit supported by the |
539 | | /// subtarget and limited by given \p FlatWorkGroupSize. |
540 | 0 | unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { |
541 | 0 | return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits(), |
542 | 0 | FlatWorkGroupSize); |
543 | 0 | } |
544 | | |
545 | | /// \returns Minimum number of waves per execution unit supported by the |
546 | | /// subtarget. |
547 | 30.6k | unsigned getMinWavesPerEU() const { |
548 | 30.6k | return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBits()); |
549 | 30.6k | } |
550 | | |
551 | | /// \returns Maximum number of waves per execution unit supported by the |
552 | | /// subtarget without any kind of limitation. |
553 | 247k | unsigned getMaxWavesPerEU() const { |
554 | 247k | return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits()); |
555 | 247k | } |
556 | | |
557 | | /// \returns Maximum number of waves per execution unit supported by the |
558 | | /// subtarget and limited by given \p FlatWorkGroupSize. |
559 | 30.6k | unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { |
560 | 30.6k | return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits(), |
561 | 30.6k | FlatWorkGroupSize); |
562 | 30.6k | } |
563 | | |
564 | | /// \returns Minimum flat work group size supported by the subtarget. |
565 | 190k | unsigned getMinFlatWorkGroupSize() const { |
566 | 190k | return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBits()); |
567 | 190k | } |
568 | | |
569 | | /// \returns Maximum flat work group size supported by the subtarget. |
570 | 190k | unsigned getMaxFlatWorkGroupSize() const { |
571 | 190k | return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBits()); |
572 | 190k | } |
573 | | |
574 | | /// \returns Number of waves per work group supported by the subtarget and |
575 | | /// limited by given \p FlatWorkGroupSize. |
576 | 0 | unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { |
577 | 0 | return AMDGPU::IsaInfo::getWavesPerWorkGroup(getFeatureBits(), |
578 | 0 | FlatWorkGroupSize); |
579 | 0 | } |
580 | | |
581 | | /// \returns Subtarget's default pair of minimum/maximum flat work group sizes |
582 | | /// for function \p F, or minimum/maximum flat work group sizes explicitly |
583 | | /// requested using "amdgpu-flat-work-group-size" attribute attached to |
584 | | /// function \p F. |
585 | | /// |
586 | | /// \returns Subtarget's default values if explicitly requested values cannot |
587 | | /// be converted to integer, or violate subtarget's specifications. |
588 | | std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; |
589 | | |
590 | | /// \returns Subtarget's default pair of minimum/maximum number of waves per |
591 | | /// execution unit for function \p F, or minimum/maximum number of waves per |
592 | | /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute |
593 | | /// attached to function \p F. |
594 | | /// |
595 | | /// \returns Subtarget's default values if explicitly requested values cannot |
596 | | /// be converted to integer, violate subtarget's specifications, or are not |
597 | | /// compatible with minimum/maximum number of waves limited by flat work group |
598 | | /// size, register usage, and/or lds usage. |
599 | | std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; |
600 | | |
601 | | /// Creates value range metadata on a workitemid.* intrinsic call or load. |
602 | | bool makeLIDRangeMetadata(Instruction *I) const; |
603 | | }; |
604 | | |
605 | | class R600Subtarget final : public AMDGPUSubtarget { |
606 | | private: |
607 | | R600InstrInfo InstrInfo; |
608 | | R600FrameLowering FrameLowering; |
609 | | R600TargetLowering TLInfo; |
610 | | |
611 | | public: |
612 | | R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, |
613 | | const TargetMachine &TM); |
614 | | |
615 | 937k | const R600InstrInfo *getInstrInfo() const override { |
616 | 937k | return &InstrInfo; |
617 | 937k | } |
618 | | |
619 | 31.5k | const R600FrameLowering *getFrameLowering() const override { |
620 | 31.5k | return &FrameLowering; |
621 | 31.5k | } |
622 | | |
623 | 105k | const R600TargetLowering *getTargetLowering() const override { |
624 | 105k | return &TLInfo; |
625 | 105k | } |
626 | | |
627 | 1.51M | const R600RegisterInfo *getRegisterInfo() const override { |
628 | 1.51M | return &InstrInfo.getRegisterInfo(); |
629 | 1.51M | } |
630 | | |
631 | 8.58k | bool hasCFAluBug() const { |
632 | 8.58k | return CFALUBug; |
633 | 8.58k | } |
634 | | |
635 | 124k | bool hasVertexCache() const { |
636 | 124k | return HasVertexCache; |
637 | 124k | } |
638 | | |
639 | 4.22k | short getTexVTXClauseSize() const { |
640 | 4.22k | return TexVTXClauseSize; |
641 | 4.22k | } |
642 | | }; |
643 | | |
644 | | class SISubtarget final : public AMDGPUSubtarget { |
645 | | private: |
646 | | SIInstrInfo InstrInfo; |
647 | | SIFrameLowering FrameLowering; |
648 | | SITargetLowering TLInfo; |
649 | | |
650 | | /// GlobalISel related APIs. |
651 | | std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; |
652 | | std::unique_ptr<InstructionSelector> InstSelector; |
653 | | std::unique_ptr<LegalizerInfo> Legalizer; |
654 | | std::unique_ptr<RegisterBankInfo> RegBankInfo; |
655 | | |
656 | | public: |
657 | | SISubtarget(const Triple &TT, StringRef CPU, StringRef FS, |
658 | | const TargetMachine &TM); |
659 | | |
660 | 4.15M | const SIInstrInfo *getInstrInfo() const override { |
661 | 4.15M | return &InstrInfo; |
662 | 4.15M | } |
663 | | |
664 | 183k | const SIFrameLowering *getFrameLowering() const override { |
665 | 183k | return &FrameLowering; |
666 | 183k | } |
667 | | |
668 | 928k | const SITargetLowering *getTargetLowering() const override { |
669 | 928k | return &TLInfo; |
670 | 928k | } |
671 | | |
672 | 20 | const CallLowering *getCallLowering() const override { |
673 | 20 | return CallLoweringInfo.get(); |
674 | 20 | } |
675 | | |
676 | 20 | const InstructionSelector *getInstructionSelector() const override { |
677 | 20 | return InstSelector.get(); |
678 | 20 | } |
679 | | |
680 | 23 | const LegalizerInfo *getLegalizerInfo() const override { |
681 | 23 | return Legalizer.get(); |
682 | 23 | } |
683 | | |
684 | 125 | const RegisterBankInfo *getRegBankInfo() const override { |
685 | 125 | return RegBankInfo.get(); |
686 | 125 | } |
687 | | |
688 | 25.3M | const SIRegisterInfo *getRegisterInfo() const override { |
689 | 25.3M | return &InstrInfo.getRegisterInfo(); |
690 | 25.3M | } |
691 | | |
692 | | // XXX - Why is this here if it isn't in the default pass set? |
693 | 22 | bool enableEarlyIfConversion() const override { |
694 | 22 | return true; |
695 | 22 | } |
696 | | |
697 | | void overrideSchedPolicy(MachineSchedPolicy &Policy, |
698 | | unsigned NumRegionInstrs) const override; |
699 | | |
700 | | bool isVGPRSpillingEnabled(const Function& F) const; |
701 | | |
702 | 14.1k | unsigned getMaxNumUserSGPRs() const { |
703 | 14.1k | return 16; |
704 | 14.1k | } |
705 | | |
706 | 0 | bool hasSMemRealTime() const { |
707 | 0 | return HasSMemRealTime; |
708 | 0 | } |
709 | | |
710 | 172 | bool hasMovrel() const { |
711 | 172 | return HasMovrel; |
712 | 172 | } |
713 | | |
714 | 29 | bool hasVGPRIndexMode() const { |
715 | 29 | return HasVGPRIndexMode; |
716 | 29 | } |
717 | | |
718 | 172 | bool useVGPRIndexMode(bool UserEnable) const { |
719 | 143 | return !hasMovrel() || (UserEnable && hasVGPRIndexMode()); |
720 | 172 | } |
721 | | |
722 | 37 | bool hasScalarCompareEq64() const { |
723 | 37 | return getGeneration() >= VOLCANIC_ISLANDS; |
724 | 37 | } |
725 | | |
726 | 1.73k | bool hasScalarStores() const { |
727 | 1.73k | return HasScalarStores; |
728 | 1.73k | } |
729 | | |
730 | 3.72M | bool hasInv2PiInlineImm() const { |
731 | 3.72M | return HasInv2PiInlineImm; |
732 | 3.72M | } |
733 | | |
734 | 0 | bool hasDPP() const { |
735 | 0 | return HasDPP; |
736 | 0 | } |
737 | | |
738 | 50.7k | bool enableSIScheduler() const { |
739 | 50.7k | return EnableSIScheduler; |
740 | 50.7k | } |
741 | | |
742 | 1.74k | bool debuggerSupported() const { |
743 | 5 | return debuggerInsertNops() && debuggerReserveRegs() && |
744 | 3 | debuggerEmitPrologue(); |
745 | 1.74k | } |
746 | | |
747 | 16.7k | bool debuggerInsertNops() const { |
748 | 16.7k | return DebuggerInsertNops; |
749 | 16.7k | } |
750 | | |
751 | 120k | bool debuggerReserveRegs() const { |
752 | 120k | return DebuggerReserveRegs; |
753 | 120k | } |
754 | | |
755 | 74.4k | bool debuggerEmitPrologue() const { |
756 | 74.4k | return DebuggerEmitPrologue; |
757 | 74.4k | } |
758 | | |
759 | 14.8k | bool loadStoreOptEnabled() const { |
760 | 14.8k | return EnableLoadStoreOpt; |
761 | 14.8k | } |
762 | | |
763 | 137k | bool hasSGPRInitBug() const { |
764 | 137k | return SGPRInitBug; |
765 | 137k | } |
766 | | |
767 | 278k | bool has12DWordStoreHazard() const { |
768 | 278k | return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; |
769 | 278k | } |
770 | | |
771 | 620k | bool hasSMovFedHazard() const { |
772 | 620k | return getGeneration() >= AMDGPUSubtarget::GFX9; |
773 | 620k | } |
774 | | |
775 | 1.17k | bool hasReadM0Hazard() const { |
776 | 1.17k | return getGeneration() >= AMDGPUSubtarget::GFX9; |
777 | 1.17k | } |
778 | | |
779 | | unsigned getKernArgSegmentSize(const MachineFunction &MF, |
780 | | unsigned ExplictArgBytes) const; |
781 | | |
782 | | /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs |
783 | | unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; |
784 | | |
785 | | /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs |
786 | | unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; |
787 | | |
788 | | /// \returns true if the flat_scratch register should be initialized with the |
789 | | /// pointer to the wave's scratch memory rather than a size and offset. |
790 | 332 | bool flatScratchIsPointer() const { |
791 | 332 | return getGeneration() >= GFX9; |
792 | 332 | } |
793 | | |
794 | | /// \returns SGPR allocation granularity supported by the subtarget. |
795 | 0 | unsigned getSGPRAllocGranule() const { |
796 | 0 | return AMDGPU::IsaInfo::getSGPRAllocGranule(getFeatureBits()); |
797 | 0 | } |
798 | | |
799 | | /// \returns SGPR encoding granularity supported by the subtarget. |
800 | 28.3k | unsigned getSGPREncodingGranule() const { |
801 | 28.3k | return AMDGPU::IsaInfo::getSGPREncodingGranule(getFeatureBits()); |
802 | 28.3k | } |
803 | | |
804 | | /// \returns Total number of SGPRs supported by the subtarget. |
805 | 0 | unsigned getTotalNumSGPRs() const { |
806 | 0 | return AMDGPU::IsaInfo::getTotalNumSGPRs(getFeatureBits()); |
807 | 0 | } |
808 | | |
809 | | /// \returns Addressable number of SGPRs supported by the subtarget. |
810 | 29.2k | unsigned getAddressableNumSGPRs() const { |
811 | 29.2k | return AMDGPU::IsaInfo::getAddressableNumSGPRs(getFeatureBits()); |
812 | 29.2k | } |
813 | | |
814 | | /// \returns Minimum number of SGPRs that meets the given number of waves per |
815 | | /// execution unit requirement supported by the subtarget. |
816 | 14.2k | unsigned getMinNumSGPRs(unsigned WavesPerEU) const { |
817 | 14.2k | return AMDGPU::IsaInfo::getMinNumSGPRs(getFeatureBits(), WavesPerEU); |
818 | 14.2k | } |
819 | | |
820 | | /// \returns Maximum number of SGPRs that meets the given number of waves per |
821 | | /// execution unit requirement supported by the subtarget. |
822 | 262k | unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { |
823 | 262k | return AMDGPU::IsaInfo::getMaxNumSGPRs(getFeatureBits(), WavesPerEU, |
824 | 262k | Addressable); |
825 | 262k | } |
826 | | |
827 | | /// \returns Reserved number of SGPRs for given function \p MF. |
828 | | unsigned getReservedNumSGPRs(const MachineFunction &MF) const; |
829 | | |
830 | | /// \returns Maximum number of SGPRs that meets number of waves per execution |
831 | | /// unit requirement for function \p MF, or number of SGPRs explicitly |
832 | | /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. |
833 | | /// |
834 | | /// \returns Value that meets number of waves per execution unit requirement |
835 | | /// if explicitly requested value cannot be converted to integer, violates |
836 | | /// subtarget's specifications, or does not meet number of waves per execution |
837 | | /// unit requirement. |
838 | | unsigned getMaxNumSGPRs(const MachineFunction &MF) const; |
839 | | |
840 | | /// \returns VGPR allocation granularity supported by the subtarget. |
841 | 0 | unsigned getVGPRAllocGranule() const { |
842 | 0 | return AMDGPU::IsaInfo::getVGPRAllocGranule(getFeatureBits()); |
843 | 0 | } |
844 | | |
845 | | /// \returns VGPR encoding granularity supported by the subtarget. |
846 | 28.3k | unsigned getVGPREncodingGranule() const { |
847 | 28.3k | return AMDGPU::IsaInfo::getVGPREncodingGranule(getFeatureBits()); |
848 | 28.3k | } |
849 | | |
850 | | /// \returns Total number of VGPRs supported by the subtarget. |
851 | 0 | unsigned getTotalNumVGPRs() const { |
852 | 0 | return AMDGPU::IsaInfo::getTotalNumVGPRs(getFeatureBits()); |
853 | 0 | } |
854 | | |
855 | | /// \returns Addressable number of VGPRs supported by the subtarget. |
856 | 15.0k | unsigned getAddressableNumVGPRs() const { |
857 | 15.0k | return AMDGPU::IsaInfo::getAddressableNumVGPRs(getFeatureBits()); |
858 | 15.0k | } |
859 | | |
860 | | /// \returns Minimum number of VGPRs that meets given number of waves per |
861 | | /// execution unit requirement supported by the subtarget. |
862 | 14.1k | unsigned getMinNumVGPRs(unsigned WavesPerEU) const { |
863 | 14.1k | return AMDGPU::IsaInfo::getMinNumVGPRs(getFeatureBits(), WavesPerEU); |
864 | 14.1k | } |
865 | | |
866 | | /// \returns Maximum number of VGPRs that meets given number of waves per |
867 | | /// execution unit requirement supported by the subtarget. |
868 | 124k | unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { |
869 | 124k | return AMDGPU::IsaInfo::getMaxNumVGPRs(getFeatureBits(), WavesPerEU); |
870 | 124k | } |
871 | | |
872 | | /// \returns Reserved number of VGPRs for given function \p MF. |
873 | 105k | unsigned getReservedNumVGPRs(const MachineFunction &MF) const { |
874 | 105k | return debuggerReserveRegs() ? 4 : 0; |
875 | 105k | } |
876 | | |
877 | | /// \returns Maximum number of VGPRs that meets number of waves per execution |
878 | | /// unit requirement for function \p MF, or number of VGPRs explicitly |
879 | | /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. |
880 | | /// |
881 | | /// \returns Value that meets number of waves per execution unit requirement |
882 | | /// if explicitly requested value cannot be converted to integer, violates |
883 | | /// subtarget's specifications, or does not meet number of waves per execution |
884 | | /// unit requirement. |
885 | | unsigned getMaxNumVGPRs(const MachineFunction &MF) const; |
886 | | |
887 | | void getPostRAMutations( |
888 | | std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) |
889 | | const override; |
890 | | }; |
891 | | |
892 | | } // end namespace llvm |
893 | | |
894 | | #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H |