/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Line | Count | Source (jump to first uncovered line) |
1 | | //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //==-----------------------------------------------------------------------===// |
8 | | // |
9 | | /// \file |
10 | | /// AMDGPU specific subclass of TargetSubtarget. |
11 | | // |
12 | | //===----------------------------------------------------------------------===// |
13 | | |
14 | | #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H |
15 | | #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H |
16 | | |
17 | | #include "AMDGPU.h" |
18 | | #include "AMDGPUCallLowering.h" |
19 | | #include "R600FrameLowering.h" |
20 | | #include "R600ISelLowering.h" |
21 | | #include "R600InstrInfo.h" |
22 | | #include "SIFrameLowering.h" |
23 | | #include "SIISelLowering.h" |
24 | | #include "SIInstrInfo.h" |
25 | | #include "Utils/AMDGPUBaseInfo.h" |
26 | | #include "llvm/ADT/Triple.h" |
27 | | #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" |
28 | | #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" |
29 | | #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" |
30 | | #include "llvm/CodeGen/MachineFunction.h" |
31 | | #include "llvm/CodeGen/SelectionDAGTargetInfo.h" |
32 | | #include "llvm/MC/MCInstrItineraries.h" |
33 | | #include "llvm/Support/MathExtras.h" |
34 | | #include <cassert> |
35 | | #include <cstdint> |
36 | | #include <memory> |
37 | | #include <utility> |
38 | | |
39 | | #define GET_SUBTARGETINFO_HEADER |
40 | | #include "AMDGPUGenSubtargetInfo.inc" |
41 | | #define GET_SUBTARGETINFO_HEADER |
42 | | #include "R600GenSubtargetInfo.inc" |
43 | | |
44 | | namespace llvm { |
45 | | |
46 | | class StringRef; |
47 | | |
48 | | class AMDGPUSubtarget { |
49 | | public: |
50 | | enum Generation { |
51 | | R600 = 0, |
52 | | R700 = 1, |
53 | | EVERGREEN = 2, |
54 | | NORTHERN_ISLANDS = 3, |
55 | | SOUTHERN_ISLANDS = 4, |
56 | | SEA_ISLANDS = 5, |
57 | | VOLCANIC_ISLANDS = 6, |
58 | | GFX9 = 7, |
59 | | GFX10 = 8 |
60 | | }; |
61 | | |
62 | | private: |
63 | | Triple TargetTriple; |
64 | | |
65 | | protected: |
66 | | bool Has16BitInsts; |
67 | | bool HasMadMixInsts; |
68 | | bool FP32Denormals; |
69 | | bool FPExceptions; |
70 | | bool HasSDWA; |
71 | | bool HasVOP3PInsts; |
72 | | bool HasMulI24; |
73 | | bool HasMulU24; |
74 | | bool HasInv2PiInlineImm; |
75 | | bool HasFminFmaxLegacy; |
76 | | bool EnablePromoteAlloca; |
77 | | bool HasTrigReducedRange; |
78 | | int LocalMemorySize; |
79 | | unsigned WavefrontSize; |
80 | | |
81 | | public: |
82 | | AMDGPUSubtarget(const Triple &TT); |
83 | | |
84 | | static const AMDGPUSubtarget &get(const MachineFunction &MF); |
85 | | static const AMDGPUSubtarget &get(const TargetMachine &TM, |
86 | | const Function &F); |
87 | | |
88 | | /// \returns Default range of flat work group sizes for a calling convention. |
89 | | std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; |
90 | | |
91 | | /// \returns Subtarget's default pair of minimum/maximum flat work group sizes |
92 | | /// for function \p F, or minimum/maximum flat work group sizes explicitly |
93 | | /// requested using "amdgpu-flat-work-group-size" attribute attached to |
94 | | /// function \p F. |
95 | | /// |
96 | | /// \returns Subtarget's default values if explicitly requested values cannot |
97 | | /// be converted to integer, or violate subtarget's specifications. |
98 | | std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; |
99 | | |
100 | | /// \returns Subtarget's default pair of minimum/maximum number of waves per |
101 | | /// execution unit for function \p F, or minimum/maximum number of waves per |
102 | | /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute |
103 | | /// attached to function \p F. |
104 | | /// |
105 | | /// \returns Subtarget's default values if explicitly requested values cannot |
106 | | /// be converted to integer, violate subtarget's specifications, or are not |
107 | | /// compatible with minimum/maximum number of waves limited by flat work group |
108 | | /// size, register usage, and/or lds usage. |
109 | | std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; |
110 | | |
111 | | /// Return the amount of LDS that can be used that will not restrict the |
112 | | /// occupancy lower than WaveCount. |
113 | | unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, |
114 | | const Function &) const; |
115 | | |
116 | | /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wavecount if |
117 | | /// the given LDS memory size is the only constraint. |
118 | | unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; |
119 | | |
120 | | unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; |
121 | | |
122 | 357k | bool isAmdHsaOS() const { |
123 | 357k | return TargetTriple.getOS() == Triple::AMDHSA; |
124 | 357k | } |
125 | | |
126 | 52.0k | bool isAmdPalOS() const { |
127 | 52.0k | return TargetTriple.getOS() == Triple::AMDPAL; |
128 | 52.0k | } |
129 | | |
130 | 183k | bool isMesa3DOS() const { |
131 | 183k | return TargetTriple.getOS() == Triple::Mesa3D; |
132 | 183k | } |
133 | | |
134 | 156k | bool isMesaKernel(const Function &F) const { |
135 | 156k | return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv())6.26k ; |
136 | 156k | } |
137 | | |
138 | 161k | bool isAmdHsaOrMesa(const Function &F) const { |
139 | 161k | return isAmdHsaOS() || isMesaKernel(F)133k ; |
140 | 161k | } |
141 | | |
142 | 660k | bool has16BitInsts() const { |
143 | 660k | return Has16BitInsts; |
144 | 660k | } |
145 | | |
146 | 6.01k | bool hasMadMixInsts() const { |
147 | 6.01k | return HasMadMixInsts; |
148 | 6.01k | } |
149 | | |
150 | 46.3k | bool hasFP32Denormals() const { |
151 | 46.3k | return FP32Denormals; |
152 | 46.3k | } |
153 | | |
154 | 0 | bool hasFPExceptions() const { |
155 | 0 | return FPExceptions; |
156 | 0 | } |
157 | | |
158 | 136k | bool hasSDWA() const { |
159 | 136k | return HasSDWA; |
160 | 136k | } |
161 | | |
162 | 25.0k | bool hasVOP3PInsts() const { |
163 | 25.0k | return HasVOP3PInsts; |
164 | 25.0k | } |
165 | | |
166 | 7.43k | bool hasMulI24() const { |
167 | 7.43k | return HasMulI24; |
168 | 7.43k | } |
169 | | |
170 | 13.3k | bool hasMulU24() const { |
171 | 13.3k | return HasMulU24; |
172 | 13.3k | } |
173 | | |
174 | 5.19M | bool hasInv2PiInlineImm() const { |
175 | 5.19M | return HasInv2PiInlineImm; |
176 | 5.19M | } |
177 | | |
178 | 990 | bool hasFminFmaxLegacy() const { |
179 | 990 | return HasFminFmaxLegacy; |
180 | 990 | } |
181 | | |
182 | 98 | bool hasTrigReducedRange() const { |
183 | 98 | return HasTrigReducedRange; |
184 | 98 | } |
185 | | |
186 | 27.3k | bool isPromoteAllocaEnabled() const { |
187 | 27.3k | return EnablePromoteAlloca; |
188 | 27.3k | } |
189 | | |
190 | 813k | unsigned getWavefrontSize() const { |
191 | 813k | return WavefrontSize; |
192 | 813k | } |
193 | | |
194 | 393k | int getLocalMemorySize() const { |
195 | 393k | return LocalMemorySize; |
196 | 393k | } |
197 | | |
198 | 1.96k | unsigned getAlignmentForImplicitArgPtr() const { |
199 | 1.96k | return isAmdHsaOS() ? 81.24k : 4718 ; |
200 | 1.96k | } |
201 | | |
202 | | /// Returns the offset in bytes from the start of the input buffer |
203 | | /// of the first explicit kernel argument. |
204 | 64.0k | unsigned getExplicitKernelArgOffset(const Function &F) const { |
205 | 64.0k | return isAmdHsaOrMesa(F) ? 017.0k : 3646.9k ; |
206 | 64.0k | } |
207 | | |
208 | | /// \returns Maximum number of work groups per compute unit supported by the |
209 | | /// subtarget and limited by given \p FlatWorkGroupSize. |
210 | | virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0; |
211 | | |
212 | | /// \returns Minimum flat work group size supported by the subtarget. |
213 | | virtual unsigned getMinFlatWorkGroupSize() const = 0; |
214 | | |
215 | | /// \returns Maximum flat work group size supported by the subtarget. |
216 | | virtual unsigned getMaxFlatWorkGroupSize() const = 0; |
217 | | |
218 | | /// \returns Maximum number of waves per execution unit supported by the |
219 | | /// subtarget and limited by given \p FlatWorkGroupSize. |
220 | | virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const = 0; |
221 | | |
222 | | /// \returns Minimum number of waves per execution unit supported by the |
223 | | /// subtarget. |
224 | | virtual unsigned getMinWavesPerEU() const = 0; |
225 | | |
226 | 537k | unsigned getMaxWavesPerEU() const { return 10; } |
227 | | |
228 | | /// Creates value range metadata on a workitemid.* intrinsic call or load. |
229 | | bool makeLIDRangeMetadata(Instruction *I) const; |
230 | | |
231 | | /// \returns Number of bytes of arguments that are passed to a shader or |
232 | | /// kernel in addition to the explicit ones declared for the function. |
233 | 23.2k | unsigned getImplicitArgNumBytes(const Function &F) const { |
234 | 23.2k | if (isMesaKernel(F)) |
235 | 652 | return 16; |
236 | 22.6k | return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); |
237 | 22.6k | } |
238 | | uint64_t getExplicitKernArgSize(const Function &F, |
239 | | unsigned &MaxAlign) const; |
240 | | unsigned getKernArgSegmentSize(const Function &F, |
241 | | unsigned &MaxAlign) const; |
242 | | |
243 | 3.90k | virtual ~AMDGPUSubtarget() {} |
244 | | }; |
245 | | |
246 | | class GCNSubtarget : public AMDGPUGenSubtargetInfo, |
247 | | public AMDGPUSubtarget { |
248 | | public: |
249 | | enum TrapHandlerAbi { |
250 | | TrapHandlerAbiNone = 0, |
251 | | TrapHandlerAbiHsa = 1 |
252 | | }; |
253 | | |
254 | | enum TrapID { |
255 | | TrapIDHardwareReserved = 0, |
256 | | TrapIDHSADebugTrap = 1, |
257 | | TrapIDLLVMTrap = 2, |
258 | | TrapIDLLVMDebugTrap = 3, |
259 | | TrapIDDebugBreakpoint = 7, |
260 | | TrapIDDebugReserved8 = 8, |
261 | | TrapIDDebugReservedFE = 0xfe, |
262 | | TrapIDDebugReservedFF = 0xff |
263 | | }; |
264 | | |
265 | | enum TrapRegValues { |
266 | | LLVMTrapHandlerRegValue = 1 |
267 | | }; |
268 | | |
269 | | private: |
270 | | /// GlobalISel related APIs. |
271 | | std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; |
272 | | std::unique_ptr<InstructionSelector> InstSelector; |
273 | | std::unique_ptr<LegalizerInfo> Legalizer; |
274 | | std::unique_ptr<RegisterBankInfo> RegBankInfo; |
275 | | |
276 | | protected: |
277 | | // Basic subtarget description. |
278 | | Triple TargetTriple; |
279 | | unsigned Gen; |
280 | | InstrItineraryData InstrItins; |
281 | | int LDSBankCount; |
282 | | unsigned MaxPrivateElementSize; |
283 | | |
284 | | // Possibly statically set by tablegen, but may want to be overridden. |
285 | | bool FastFMAF32; |
286 | | bool HalfRate64Ops; |
287 | | |
288 | | // Dynamically set bits that enable features. |
289 | | bool FP64FP16Denormals; |
290 | | bool FlatForGlobal; |
291 | | bool AutoWaitcntBeforeBarrier; |
292 | | bool CodeObjectV3; |
293 | | bool UnalignedScratchAccess; |
294 | | bool UnalignedBufferAccess; |
295 | | bool HasApertureRegs; |
296 | | bool EnableXNACK; |
297 | | bool DoesNotSupportXNACK; |
298 | | bool EnableCuMode; |
299 | | bool TrapHandler; |
300 | | |
301 | | // Used as options. |
302 | | bool EnableLoadStoreOpt; |
303 | | bool EnableUnsafeDSOffsetFolding; |
304 | | bool EnableSIScheduler; |
305 | | bool EnableDS128; |
306 | | bool EnablePRTStrictNull; |
307 | | bool DumpCode; |
308 | | |
309 | | // Static subtarget properties set by tablegen |
310 | | bool FP64; |
311 | | bool FMA; |
312 | | bool MIMG_R128; |
313 | | bool IsGCN; |
314 | | bool GCN3Encoding; |
315 | | bool CIInsts; |
316 | | bool GFX8Insts; |
317 | | bool GFX9Insts; |
318 | | bool GFX10Insts; |
319 | | bool GFX7GFX8GFX9Insts; |
320 | | bool SGPRInitBug; |
321 | | bool HasSMemRealTime; |
322 | | bool HasIntClamp; |
323 | | bool HasFmaMixInsts; |
324 | | bool HasMovrel; |
325 | | bool HasVGPRIndexMode; |
326 | | bool HasScalarStores; |
327 | | bool HasScalarAtomics; |
328 | | bool HasSDWAOmod; |
329 | | bool HasSDWAScalar; |
330 | | bool HasSDWASdst; |
331 | | bool HasSDWAMac; |
332 | | bool HasSDWAOutModsVOPC; |
333 | | bool HasDPP; |
334 | | bool HasDPP8; |
335 | | bool HasR128A16; |
336 | | bool HasNSAEncoding; |
337 | | bool HasDLInsts; |
338 | | bool HasDot1Insts; |
339 | | bool HasDot2Insts; |
340 | | bool HasDot3Insts; |
341 | | bool HasDot4Insts; |
342 | | bool HasDot5Insts; |
343 | | bool HasDot6Insts; |
344 | | bool HasMAIInsts; |
345 | | bool HasPkFmacF16Inst; |
346 | | bool HasAtomicFaddInsts; |
347 | | bool EnableSRAMECC; |
348 | | bool DoesNotSupportSRAMECC; |
349 | | bool HasNoSdstCMPX; |
350 | | bool HasVscnt; |
351 | | bool HasRegisterBanking; |
352 | | bool HasVOP3Literal; |
353 | | bool HasNoDataDepHazard; |
354 | | bool FlatAddressSpace; |
355 | | bool FlatInstOffsets; |
356 | | bool FlatGlobalInsts; |
357 | | bool FlatScratchInsts; |
358 | | bool ScalarFlatScratchInsts; |
359 | | bool AddNoCarryInsts; |
360 | | bool HasUnpackedD16VMem; |
361 | | bool R600ALUInst; |
362 | | bool CaymanISA; |
363 | | bool CFALUBug; |
364 | | bool LDSMisalignedBug; |
365 | | bool HasVertexCache; |
366 | | short TexVTXClauseSize; |
367 | | bool ScalarizeGlobal; |
368 | | |
369 | | bool HasVcmpxPermlaneHazard; |
370 | | bool HasVMEMtoScalarWriteHazard; |
371 | | bool HasSMEMtoVectorWriteHazard; |
372 | | bool HasInstFwdPrefetchBug; |
373 | | bool HasVcmpxExecWARHazard; |
374 | | bool HasLdsBranchVmemWARHazard; |
375 | | bool HasNSAtoVMEMBug; |
376 | | bool HasOffset3fBug; |
377 | | bool HasFlatSegmentOffsetBug; |
378 | | |
379 | | // Dummy feature to use for assembler in tablegen. |
380 | | bool FeatureDisable; |
381 | | |
382 | | SelectionDAGTargetInfo TSInfo; |
383 | | private: |
384 | | SIInstrInfo InstrInfo; |
385 | | SITargetLowering TLInfo; |
386 | | SIFrameLowering FrameLowering; |
387 | | |
388 | | // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. |
389 | | static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); |
390 | | |
391 | | public: |
392 | | GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, |
393 | | const GCNTargetMachine &TM); |
394 | | ~GCNSubtarget() override; |
395 | | |
396 | | GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, |
397 | | StringRef GPU, StringRef FS); |
398 | | |
399 | 7.03M | const SIInstrInfo *getInstrInfo() const override { |
400 | 7.03M | return &InstrInfo; |
401 | 7.03M | } |
402 | | |
403 | 673k | const SIFrameLowering *getFrameLowering() const override { |
404 | 673k | return &FrameLowering; |
405 | 673k | } |
406 | | |
407 | 1.67M | const SITargetLowering *getTargetLowering() const override { |
408 | 1.67M | return &TLInfo; |
409 | 1.67M | } |
410 | | |
411 | 49.5M | const SIRegisterInfo *getRegisterInfo() const override { |
412 | 49.5M | return &InstrInfo.getRegisterInfo(); |
413 | 49.5M | } |
414 | | |
415 | 459 | const CallLowering *getCallLowering() const override { |
416 | 459 | return CallLoweringInfo.get(); |
417 | 459 | } |
418 | | |
419 | 1.51k | const InstructionSelector *getInstructionSelector() const override { |
420 | 1.51k | return InstSelector.get(); |
421 | 1.51k | } |
422 | | |
423 | 2.33k | const LegalizerInfo *getLegalizerInfo() const override { |
424 | 2.33k | return Legalizer.get(); |
425 | 2.33k | } |
426 | | |
427 | 2.41k | const RegisterBankInfo *getRegBankInfo() const override { |
428 | 2.41k | return RegBankInfo.get(); |
429 | 2.41k | } |
430 | | |
431 | | // Nothing implemented, just prevent crashes on use. |
432 | 30.0k | const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { |
433 | 30.0k | return &TSInfo; |
434 | 30.0k | } |
435 | | |
436 | 75.3k | const InstrItineraryData *getInstrItineraryData() const override { |
437 | 75.3k | return &InstrItins; |
438 | 75.3k | } |
439 | | |
440 | | void ParseSubtargetFeatures(StringRef CPU, StringRef FS); |
441 | | |
442 | 12.9M | Generation getGeneration() const { |
443 | 12.9M | return (Generation)Gen; |
444 | 12.9M | } |
445 | | |
446 | 834k | unsigned getWavefrontSizeLog2() const { |
447 | 834k | return Log2_32(WavefrontSize); |
448 | 834k | } |
449 | | |
450 | | /// Return the number of high bits known to be zero for a frame index. |
451 | 828k | unsigned getKnownHighZeroBitsForFrameIndex() const { |
452 | 828k | return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); |
453 | 828k | } |
454 | | |
455 | 7.55k | int getLDSBankCount() const { |
456 | 7.55k | return LDSBankCount; |
457 | 7.55k | } |
458 | | |
459 | 7.59k | unsigned getMaxPrivateElementSize() const { |
460 | 7.59k | return MaxPrivateElementSize; |
461 | 7.59k | } |
462 | | |
463 | | unsigned getConstantBusLimit(unsigned Opcode) const; |
464 | | |
465 | 0 | bool hasIntClamp() const { |
466 | 0 | return HasIntClamp; |
467 | 0 | } |
468 | | |
469 | 0 | bool hasFP64() const { |
470 | 0 | return FP64; |
471 | 0 | } |
472 | | |
473 | 0 | bool hasMIMG_R128() const { |
474 | 0 | return MIMG_R128; |
475 | 0 | } |
476 | | |
477 | 0 | bool hasHWFP64() const { |
478 | 0 | return FP64; |
479 | 0 | } |
480 | | |
481 | 10.1k | bool hasFastFMAF32() const { |
482 | 10.1k | return FastFMAF32; |
483 | 10.1k | } |
484 | | |
485 | 48 | bool hasHalfRate64Ops() const { |
486 | 48 | return HalfRate64Ops; |
487 | 48 | } |
488 | | |
489 | 168k | bool hasAddr64() const { |
490 | 168k | return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); |
491 | 168k | } |
492 | | |
493 | | // Return true if the target only has the reverse operand versions of VALU |
494 | | // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). |
495 | 14.0k | bool hasOnlyRevVALUShifts() const { |
496 | 14.0k | return getGeneration() >= VOLCANIC_ISLANDS; |
497 | 14.0k | } |
498 | | |
499 | 3.64k | bool hasBFE() const { |
500 | 3.64k | return true; |
501 | 3.64k | } |
502 | | |
503 | 3.64k | bool hasBFI() const { |
504 | 3.64k | return true; |
505 | 3.64k | } |
506 | | |
507 | 0 | bool hasBFM() const { |
508 | 0 | return hasBFE(); |
509 | 0 | } |
510 | | |
511 | 7.28k | bool hasBCNT(unsigned Size) const { |
512 | 7.28k | return true; |
513 | 7.28k | } |
514 | | |
515 | 3.64k | bool hasFFBL() const { |
516 | 3.64k | return true; |
517 | 3.64k | } |
518 | | |
519 | 3.64k | bool hasFFBH() const { |
520 | 3.64k | return true; |
521 | 3.64k | } |
522 | | |
523 | 20 | bool hasMed3_16() const { |
524 | 20 | return getGeneration() >= AMDGPUSubtarget::GFX9; |
525 | 20 | } |
526 | | |
527 | 1.51k | bool hasMin3Max3_16() const { |
528 | 1.51k | return getGeneration() >= AMDGPUSubtarget::GFX9; |
529 | 1.51k | } |
530 | | |
531 | 5.73k | bool hasFmaMixInsts() const { |
532 | 5.73k | return HasFmaMixInsts; |
533 | 5.73k | } |
534 | | |
535 | 0 | bool hasCARRY() const { |
536 | 0 | return true; |
537 | 0 | } |
538 | | |
539 | 0 | bool hasFMA() const { |
540 | 0 | return FMA; |
541 | 0 | } |
542 | | |
543 | 953k | bool hasSwap() const { |
544 | 953k | return GFX9Insts; |
545 | 953k | } |
546 | | |
547 | 38 | TrapHandlerAbi getTrapHandlerAbi() const { |
548 | 38 | return isAmdHsaOS() ? TrapHandlerAbiHsa18 : TrapHandlerAbiNone20 ; |
549 | 38 | } |
550 | | |
551 | | /// True if the offset field of DS instructions works as expected. On SI, the |
552 | | /// offset uses a 16-bit adder and does not always wrap properly. |
553 | 45.9k | bool hasUsableDSOffset() const { |
554 | 45.9k | return getGeneration() >= SEA_ISLANDS; |
555 | 45.9k | } |
556 | | |
557 | 2.03k | bool unsafeDSOffsetFoldingEnabled() const { |
558 | 2.03k | return EnableUnsafeDSOffsetFolding; |
559 | 2.03k | } |
560 | | |
561 | | /// Condition output from div_scale is usable. |
562 | 85 | bool hasUsableDivScaleConditionOutput() const { |
563 | 85 | return getGeneration() != SOUTHERN_ISLANDS; |
564 | 85 | } |
565 | | |
566 | | /// Extra wait hazard is needed in some cases before |
567 | | /// s_cbranch_vccnz/s_cbranch_vccz. |
568 | 298 | bool hasReadVCCZBug() const { |
569 | 298 | return getGeneration() <= SEA_ISLANDS; |
570 | 298 | } |
571 | | |
572 | | /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR |
573 | | /// was written by a VALU instruction. |
574 | 86.8k | bool hasSMRDReadVALUDefHazard() const { |
575 | 86.8k | return getGeneration() == SOUTHERN_ISLANDS; |
576 | 86.8k | } |
577 | | |
578 | | /// A read of an SGPR by a VMEM instruction requires 5 wait states when the |
579 | | /// SGPR was written by a VALU Instruction. |
580 | 151k | bool hasVMEMReadSGPRVALUDefHazard() const { |
581 | 151k | return getGeneration() >= VOLCANIC_ISLANDS; |
582 | 151k | } |
583 | | |
584 | 8 | bool hasRFEHazards() const { |
585 | 8 | return getGeneration() >= VOLCANIC_ISLANDS; |
586 | 8 | } |
587 | | |
588 | | /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. |
589 | 255 | unsigned getSetRegWaitStates() const { |
590 | 255 | return getGeneration() <= SEA_ISLANDS ? 1150 : 2105 ; |
591 | 255 | } |
592 | | |
593 | 25.4k | bool dumpCode() const { |
594 | 25.4k | return DumpCode; |
595 | 25.4k | } |
596 | | |
597 | | /// Return the amount of LDS that can be used that will not restrict the |
598 | | /// occupancy lower than WaveCount. |
599 | | unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, |
600 | | const Function &) const; |
601 | | |
602 | 12.3k | bool hasFP16Denormals() const { |
603 | 12.3k | return FP64FP16Denormals; |
604 | 12.3k | } |
605 | | |
606 | 31.0k | bool hasFP64Denormals() const { |
607 | 31.0k | return FP64FP16Denormals; |
608 | 31.0k | } |
609 | | |
610 | 101 | bool supportsMinMaxDenormModes() const { |
611 | 101 | return getGeneration() >= AMDGPUSubtarget::GFX9; |
612 | 101 | } |
613 | | |
614 | 121k | bool useFlatForGlobal() const { |
615 | 121k | return FlatForGlobal; |
616 | 121k | } |
617 | | |
618 | | /// \returns If target supports ds_read/write_b128 and user enables generation |
619 | | /// of ds_read/write_b128. |
620 | 57.8k | bool useDS128() const { |
621 | 57.8k | return CIInsts && EnableDS12836.1k ; |
622 | 57.8k | } |
623 | | |
624 | | /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 |
625 | 3.64k | bool haveRoundOpsF64() const { |
626 | 3.64k | return CIInsts; |
627 | 3.64k | } |
628 | | |
629 | | /// \returns If MUBUF instructions always perform range checking, even for |
630 | | /// buffer resources used for private memory access. |
631 | 6.23k | bool privateMemoryResourceIsRangeChecked() const { |
632 | 6.23k | return getGeneration() < AMDGPUSubtarget::GFX9; |
633 | 6.23k | } |
634 | | |
635 | | /// \returns If target requires PRT Strict NULL support (zero result registers |
636 | | /// for sparse texture support). |
637 | 300 | bool usePRTStrictNull() const { |
638 | 300 | return EnablePRTStrictNull; |
639 | 300 | } |
640 | | |
641 | 87 | bool hasAutoWaitcntBeforeBarrier() const { |
642 | 87 | return AutoWaitcntBeforeBarrier; |
643 | 87 | } |
644 | | |
645 | 23.1k | bool hasCodeObjectV3() const { |
646 | 23.1k | // FIXME: Need to add code object v3 support for mesa and pal. |
647 | 23.1k | return isAmdHsaOS() ? CodeObjectV34.03k : false19.1k ; |
648 | 23.1k | } |
649 | | |
650 | 151k | bool hasUnalignedBufferAccess() const { |
651 | 151k | return UnalignedBufferAccess; |
652 | 151k | } |
653 | | |
654 | 152k | bool hasUnalignedScratchAccess() const { |
655 | 152k | return UnalignedScratchAccess; |
656 | 152k | } |
657 | | |
658 | 25.3k | bool hasApertureRegs() const { |
659 | 25.3k | return HasApertureRegs; |
660 | 25.3k | } |
661 | | |
662 | 19.1k | bool isTrapHandlerEnabled() const { |
663 | 19.1k | return TrapHandler; |
664 | 19.1k | } |
665 | | |
666 | 456k | bool isXNACKEnabled() const { |
667 | 456k | return EnableXNACK; |
668 | 456k | } |
669 | | |
670 | 4.16k | bool isCuModeEnabled() const { |
671 | 4.16k | return EnableCuMode; |
672 | 4.16k | } |
673 | | |
674 | 88.6k | bool hasFlatAddressSpace() const { |
675 | 88.6k | return FlatAddressSpace; |
676 | 88.6k | } |
677 | | |
678 | 27 | bool hasFlatScrRegister() const { |
679 | 27 | return hasFlatAddressSpace(); |
680 | 27 | } |
681 | | |
682 | 962k | bool hasFlatInstOffsets() const { |
683 | 962k | return FlatInstOffsets; |
684 | 962k | } |
685 | | |
686 | 191k | bool hasFlatGlobalInsts() const { |
687 | 191k | return FlatGlobalInsts; |
688 | 191k | } |
689 | | |
690 | 0 | bool hasFlatScratchInsts() const { |
691 | 0 | return FlatScratchInsts; |
692 | 0 | } |
693 | | |
694 | 0 | bool hasScalarFlatScratchInsts() const { |
695 | 0 | return ScalarFlatScratchInsts; |
696 | 0 | } |
697 | | |
698 | 9.33k | bool hasFlatSegmentOffsetBug() const { |
699 | 9.33k | return HasFlatSegmentOffsetBug; |
700 | 9.33k | } |
701 | | |
702 | 2.12k | bool hasFlatLgkmVMemCountInOrder() const { |
703 | 2.12k | return getGeneration() > GFX9; |
704 | 2.12k | } |
705 | | |
706 | 32.5k | bool hasD16LoadStore() const { |
707 | 32.5k | return getGeneration() >= GFX9; |
708 | 32.5k | } |
709 | | |
710 | 32.5k | bool d16PreservesUnusedBits() const { |
711 | 32.5k | return hasD16LoadStore() && !isSRAMECCEnabled()8.90k ; |
712 | 32.5k | } |
713 | | |
714 | 88 | bool hasD16Images() const { |
715 | 88 | return getGeneration() >= VOLCANIC_ISLANDS; |
716 | 88 | } |
717 | | |
718 | | /// Return if most LDS instructions have an m0 use that requires m0 to be |
719 | | /// initialized. |
720 | 38.5k | bool ldsRequiresM0Init() const { |
721 | 38.5k | return getGeneration() < GFX9; |
722 | 38.5k | } |
723 | | |
724 | | // True if the hardware rewinds and replays GWS operations if a wave is |
725 | | // preempted. |
726 | | // |
727 | | // If this is false, a GWS operation requires testing if a nack set the |
728 | | // MEM_VIOL bit, and repeating if so. |
729 | 179 | bool hasGWSAutoReplay() const { |
730 | 179 | return getGeneration() >= GFX9; |
731 | 179 | } |
732 | | |
733 | | /// \returns if target has ds_gws_sema_release_all instruction. |
734 | 6 | bool hasGWSSemaReleaseAll() const { |
735 | 6 | return CIInsts; |
736 | 6 | } |
737 | | |
738 | 11.2k | bool hasAddNoCarry() const { |
739 | 11.2k | return AddNoCarryInsts; |
740 | 11.2k | } |
741 | | |
742 | 1.62M | bool hasUnpackedD16VMem() const { |
743 | 1.62M | return HasUnpackedD16VMem; |
744 | 1.62M | } |
745 | | |
746 | | // Covers VS/PS/CS graphics shaders |
747 | 25.1k | bool isMesaGfxShader(const Function &F) const { |
748 | 25.1k | return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv())186 ; |
749 | 25.1k | } |
750 | | |
751 | 2.07k | bool hasMad64_32() const { |
752 | 2.07k | return getGeneration() >= SEA_ISLANDS; |
753 | 2.07k | } |
754 | | |
755 | 82.9k | bool hasSDWAOmod() const { |
756 | 82.9k | return HasSDWAOmod; |
757 | 82.9k | } |
758 | | |
759 | 228k | bool hasSDWAScalar() const { |
760 | 228k | return HasSDWAScalar; |
761 | 228k | } |
762 | | |
763 | 363 | bool hasSDWASdst() const { |
764 | 363 | return HasSDWASdst; |
765 | 363 | } |
766 | | |
767 | 4.16k | bool hasSDWAMac() const { |
768 | 4.16k | return HasSDWAMac; |
769 | 4.16k | } |
770 | | |
771 | 314 | bool hasSDWAOutModsVOPC() const { |
772 | 314 | return HasSDWAOutModsVOPC; |
773 | 314 | } |
774 | | |
775 | 7.73k | bool hasDLInsts() const { |
776 | 7.73k | return HasDLInsts; |
777 | 7.73k | } |
778 | | |
779 | 3.67k | bool hasDot1Insts() const { |
780 | 3.67k | return HasDot1Insts; |
781 | 3.67k | } |
782 | | |
783 | 5.88k | bool hasDot2Insts() const { |
784 | 5.88k | return HasDot2Insts; |
785 | 5.88k | } |
786 | | |
787 | 3.64k | bool hasDot3Insts() const { |
788 | 3.64k | return HasDot3Insts; |
789 | 3.64k | } |
790 | | |
791 | 3.64k | bool hasDot4Insts() const { |
792 | 3.64k | return HasDot4Insts; |
793 | 3.64k | } |
794 | | |
795 | 3.65k | bool hasDot5Insts() const { |
796 | 3.65k | return HasDot5Insts; |
797 | 3.65k | } |
798 | | |
799 | 3.64k | bool hasDot6Insts() const { |
800 | 3.64k | return HasDot6Insts; |
801 | 3.64k | } |
802 | | |
803 | 295k | bool hasMAIInsts() const { |
804 | 295k | return HasMAIInsts; |
805 | 295k | } |
806 | | |
807 | 0 | bool hasPkFmacF16Inst() const { |
808 | 0 | return HasPkFmacF16Inst; |
809 | 0 | } |
810 | | |
811 | 0 | bool hasAtomicFaddInsts() const { |
812 | 0 | return HasAtomicFaddInsts; |
813 | 0 | } |
814 | | |
815 | 8.90k | bool isSRAMECCEnabled() const { |
816 | 8.90k | return EnableSRAMECC; |
817 | 8.90k | } |
818 | | |
819 | 89 | bool hasNoSdstCMPX() const { |
820 | 89 | return HasNoSdstCMPX; |
821 | 89 | } |
822 | | |
823 | 89.0k | bool hasVscnt() const { |
824 | 89.0k | return HasVscnt; |
825 | 89.0k | } |
826 | | |
827 | 25.2k | bool hasRegisterBanking() const { |
828 | 25.2k | return HasRegisterBanking; |
829 | 25.2k | } |
830 | | |
831 | 703k | bool hasVOP3Literal() const { |
832 | 703k | return HasVOP3Literal; |
833 | 703k | } |
834 | | |
835 | 925k | bool hasNoDataDepHazard() const { |
836 | 925k | return HasNoDataDepHazard; |
837 | 925k | } |
838 | | |
839 | 41.9k | bool vmemWriteNeedsExpWaitcnt() const { |
840 | 41.9k | return getGeneration() < SEA_ISLANDS; |
841 | 41.9k | } |
842 | | |
843 | | // Scratch is allocated in 256 dword per wave blocks for the entire |
844 | | // wavefront. When viewed from the perspective of an arbitrary workitem, this |
845 | | // is 4-byte aligned. |
846 | | // |
847 | | // Only 4-byte alignment is really needed to access anything. Transformations |
848 | | // on the pointer value itself may rely on the alignment / known low bits of |
849 | | // the pointer. Set this to something above the minimum to avoid needing |
850 | | // dynamic realignment in common cases. |
851 | 3.82k | unsigned getStackAlignment() const { |
852 | 3.82k | return 16; |
853 | 3.82k | } |
854 | | |
855 | 78.4k | bool enableMachineScheduler() const override { |
856 | 78.4k | return true; |
857 | 78.4k | } |
858 | | |
859 | 31.2k | bool enableSubRegLiveness() const override { |
860 | 31.2k | return true; |
861 | 31.2k | } |
862 | | |
863 | 1.61M | void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } |
864 | 51.2k | bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } |
865 | | |
866 | | /// \returns Number of execution units per compute unit supported by the |
867 | | /// subtarget. |
868 | 0 | unsigned getEUsPerCU() const { |
869 | 0 | return AMDGPU::IsaInfo::getEUsPerCU(this); |
870 | 0 | } |
871 | | |
872 | | /// \returns Maximum number of waves per compute unit supported by the |
873 | | /// subtarget without any kind of limitation. |
874 | 0 | unsigned getMaxWavesPerCU() const { |
875 | 0 | return AMDGPU::IsaInfo::getMaxWavesPerCU(this); |
876 | 0 | } |
877 | | |
878 | | /// \returns Maximum number of waves per compute unit supported by the |
879 | | /// subtarget and limited by given \p FlatWorkGroupSize. |
880 | 0 | unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { |
881 | 0 | return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize); |
882 | 0 | } |
883 | | |
884 | | /// \returns Maximum number of waves per execution unit supported by the |
885 | | /// subtarget without any kind of limitation. |
886 | 10.3k | unsigned getMaxWavesPerEU() const { |
887 | 10.3k | return AMDGPU::IsaInfo::getMaxWavesPerEU(this); |
888 | 10.3k | } |
889 | | |
890 | | /// \returns Number of waves per work group supported by the subtarget and |
891 | | /// limited by given \p FlatWorkGroupSize. |
892 | 0 | unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { |
893 | 0 | return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize); |
894 | 0 | } |
895 | | |
896 | | // static wrappers |
897 | | static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); |
898 | | |
899 | | // XXX - Why is this here if it isn't in the default pass set? |
900 | 134 | bool enableEarlyIfConversion() const override { |
901 | 134 | return true; |
902 | 134 | } |
903 | | |
904 | | void overrideSchedPolicy(MachineSchedPolicy &Policy, |
905 | | unsigned NumRegionInstrs) const override; |
906 | | |
907 | 23.1k | unsigned getMaxNumUserSGPRs() const { |
908 | 23.1k | return 16; |
909 | 23.1k | } |
910 | | |
911 | 0 | bool hasSMemRealTime() const { |
912 | 0 | return HasSMemRealTime; |
913 | 0 | } |
914 | | |
915 | 114 | bool hasMovrel() const { |
916 | 114 | return HasMovrel; |
917 | 114 | } |
918 | | |
919 | 25 | bool hasVGPRIndexMode() const { |
920 | 25 | return HasVGPRIndexMode; |
921 | 25 | } |
922 | | |
923 | 114 | bool useVGPRIndexMode(bool UserEnable) const { |
924 | 114 | return !hasMovrel() || (UserEnable && hasVGPRIndexMode()); |
925 | 114 | } |
926 | | |
927 | 49 | bool hasScalarCompareEq64() const { |
928 | 49 | return getGeneration() >= VOLCANIC_ISLANDS; |
929 | 49 | } |
930 | | |
931 | 6.34k | bool hasScalarStores() const { |
932 | 6.34k | return HasScalarStores; |
933 | 6.34k | } |
934 | | |
935 | 0 | bool hasScalarAtomics() const { |
936 | 0 | return HasScalarAtomics; |
937 | 0 | } |
938 | | |
939 | 34 | bool hasLDSFPAtomics() const { |
940 | 34 | return GFX8Insts; |
941 | 34 | } |
942 | | |
943 | 25.4k | bool hasDPP() const { |
944 | 25.4k | return HasDPP; |
945 | 25.4k | } |
946 | | |
947 | 0 | bool hasDPP8() const { |
948 | 0 | return HasDPP8; |
949 | 0 | } |
950 | | |
951 | 0 | bool hasR128A16() const { |
952 | 0 | return HasR128A16; |
953 | 0 | } |
954 | | |
955 | 0 | bool hasOffset3fBug() const { |
956 | 0 | return HasOffset3fBug; |
957 | 0 | } |
958 | | |
959 | 0 | bool hasNSAEncoding() const { |
960 | 0 | return HasNSAEncoding; |
961 | 0 | } |
962 | | |
963 | | bool hasMadF16() const; |
964 | | |
965 | 82.1k | bool enableSIScheduler() const { |
966 | 82.1k | return EnableSIScheduler; |
967 | 82.1k | } |
968 | | |
969 | 25.2k | bool loadStoreOptEnabled() const { |
970 | 25.2k | return EnableLoadStoreOpt; |
971 | 25.2k | } |
972 | | |
973 | 322k | bool hasSGPRInitBug() const { |
974 | 322k | return SGPRInitBug; |
975 | 322k | } |
976 | | |
977 | 410k | bool has12DWordStoreHazard() const { |
978 | 410k | return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; |
979 | 410k | } |
980 | | |
981 | | // \returns true if the subtarget supports DWORDX3 load/store instructions. |
982 | 5.56k | bool hasDwordx3LoadStores() const { |
983 | 5.56k | return CIInsts; |
984 | 5.56k | } |
985 | | |
986 | 957k | bool hasSMovFedHazard() const { |
987 | 957k | return getGeneration() == AMDGPUSubtarget::GFX9; |
988 | 957k | } |
989 | | |
990 | 857k | bool hasReadM0MovRelInterpHazard() const { |
991 | 857k | return getGeneration() == AMDGPUSubtarget::GFX9; |
992 | 857k | } |
993 | | |
994 | 857k | bool hasReadM0SendMsgHazard() const { |
995 | 857k | return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && |
996 | 857k | getGeneration() <= AMDGPUSubtarget::GFX9; |
997 | 857k | } |
998 | | |
999 | 437k | bool hasVcmpxPermlaneHazard() const { |
1000 | 437k | return HasVcmpxPermlaneHazard; |
1001 | 437k | } |
1002 | | |
1003 | 437k | bool hasVMEMtoScalarWriteHazard() const { |
1004 | 437k | return HasVMEMtoScalarWriteHazard; |
1005 | 437k | } |
1006 | | |
1007 | 437k | bool hasSMEMtoVectorWriteHazard() const { |
1008 | 437k | return HasSMEMtoVectorWriteHazard; |
1009 | 437k | } |
1010 | | |
1011 | 183k | bool hasLDSMisalignedBug() const { |
1012 | 183k | return LDSMisalignedBug && !EnableCuMode; |
1013 | 183k | } |
1014 | | |
1015 | 110 | bool hasInstFwdPrefetchBug() const { |
1016 | 110 | return HasInstFwdPrefetchBug; |
1017 | 110 | } |
1018 | | |
1019 | 437k | bool hasVcmpxExecWARHazard() const { |
1020 | 437k | return HasVcmpxExecWARHazard; |
1021 | 437k | } |
1022 | | |
1023 | 437k | bool hasLdsBranchVmemWARHazard() const { |
1024 | 437k | return HasLdsBranchVmemWARHazard; |
1025 | 437k | } |
1026 | | |
1027 | 988k | bool hasNSAtoVMEMBug() const { |
1028 | 988k | return HasNSAtoVMEMBug; |
1029 | 988k | } |
1030 | | |
1031 | | /// Return the maximum number of waves per SIMD for kernels using \p SGPRs |
1032 | | /// SGPRs |
1033 | | unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; |
1034 | | |
1035 | | /// Return the maximum number of waves per SIMD for kernels using \p VGPRs |
1036 | | /// VGPRs |
1037 | | unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; |
1038 | | |
1039 | | /// \returns true if the flat_scratch register should be initialized with the |
1040 | | /// pointer to the wave's scratch memory rather than a size and offset. |
1041 | 431 | bool flatScratchIsPointer() const { |
1042 | 431 | return getGeneration() >= AMDGPUSubtarget::GFX9; |
1043 | 431 | } |
1044 | | |
1045 | | /// \returns true if the machine has merged shaders in which s0-s7 are |
1046 | | /// reserved by the hardware and user SGPRs start at s8 |
1047 | 7 | bool hasMergedShaders() const { |
1048 | 7 | return getGeneration() >= GFX9; |
1049 | 7 | } |
1050 | | |
1051 | | /// \returns SGPR allocation granularity supported by the subtarget. |
1052 | 0 | unsigned getSGPRAllocGranule() const { |
1053 | 0 | return AMDGPU::IsaInfo::getSGPRAllocGranule(this); |
1054 | 0 | } |
1055 | | |
1056 | | /// \returns SGPR encoding granularity supported by the subtarget. |
1057 | 0 | unsigned getSGPREncodingGranule() const { |
1058 | 0 | return AMDGPU::IsaInfo::getSGPREncodingGranule(this); |
1059 | 0 | } |
1060 | | |
1061 | | /// \returns Total number of SGPRs supported by the subtarget. |
1062 | 0 | unsigned getTotalNumSGPRs() const { |
1063 | 0 | return AMDGPU::IsaInfo::getTotalNumSGPRs(this); |
1064 | 0 | } |
1065 | | |
1066 | | /// \returns Addressable number of SGPRs supported by the subtarget. |
1067 | 48.6k | unsigned getAddressableNumSGPRs() const { |
1068 | 48.6k | return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); |
1069 | 48.6k | } |
1070 | | |
1071 | | /// \returns Minimum number of SGPRs that meets the given number of waves per |
1072 | | /// execution unit requirement supported by the subtarget. |
1073 | 23.2k | unsigned getMinNumSGPRs(unsigned WavesPerEU) const { |
1074 | 23.2k | return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); |
1075 | 23.2k | } |
1076 | | |
1077 | | /// \returns Maximum number of SGPRs that meets the given number of waves per |
1078 | | /// execution unit requirement supported by the subtarget. |
1079 | 620k | unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { |
1080 | 620k | return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); |
1081 | 620k | } |
1082 | | |
1083 | | /// \returns Reserved number of SGPRs for given function \p MF. |
1084 | | unsigned getReservedNumSGPRs(const MachineFunction &MF) const; |
1085 | | |
1086 | | /// \returns Maximum number of SGPRs that meets number of waves per execution |
1087 | | /// unit requirement for function \p MF, or number of SGPRs explicitly |
1088 | | /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. |
1089 | | /// |
1090 | | /// \returns Value that meets number of waves per execution unit requirement |
1091 | | /// if explicitly requested value cannot be converted to integer, violates |
1092 | | /// subtarget's specifications, or does not meet number of waves per execution |
1093 | | /// unit requirement. |
1094 | | unsigned getMaxNumSGPRs(const MachineFunction &MF) const; |
1095 | | |
1096 | | /// \returns VGPR allocation granularity supported by the subtarget. |
1097 | 10.3k | unsigned getVGPRAllocGranule() const { |
1098 | 10.3k | return AMDGPU::IsaInfo::getVGPRAllocGranule(this); |
1099 | 10.3k | } |
1100 | | |
1101 | | /// \returns VGPR encoding granularity supported by the subtarget. |
1102 | 0 | unsigned getVGPREncodingGranule() const { |
1103 | 0 | return AMDGPU::IsaInfo::getVGPREncodingGranule(this); |
1104 | 0 | } |
1105 | | |
1106 | | /// \returns Total number of VGPRs supported by the subtarget. |
1107 | 3.14k | unsigned getTotalNumVGPRs() const { |
1108 | 3.14k | return AMDGPU::IsaInfo::getTotalNumVGPRs(this); |
1109 | 3.14k | } |
1110 | | |
1111 | | /// \returns Addressable number of VGPRs supported by the subtarget. |
1112 | 25.4k | unsigned getAddressableNumVGPRs() const { |
1113 | 25.4k | return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); |
1114 | 25.4k | } |
1115 | | |
1116 | | /// \returns Minimum number of VGPRs that meets given number of waves per |
1117 | | /// execution unit requirement supported by the subtarget. |
1118 | 23.5k | unsigned getMinNumVGPRs(unsigned WavesPerEU) const { |
1119 | 23.5k | return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); |
1120 | 23.5k | } |
1121 | | |
1122 | | /// \returns Maximum number of VGPRs that meets given number of waves per |
1123 | | /// execution unit requirement supported by the subtarget. |
1124 | 370k | unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { |
1125 | 370k | return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); |
1126 | 370k | } |
1127 | | |
1128 | | /// \returns Maximum number of VGPRs that meets number of waves per execution |
1129 | | /// unit requirement for function \p MF, or number of VGPRs explicitly |
1130 | | /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. |
1131 | | /// |
1132 | | /// \returns Value that meets number of waves per execution unit requirement |
1133 | | /// if explicitly requested value cannot be converted to integer, violates |
1134 | | /// subtarget's specifications, or does not meet number of waves per execution |
1135 | | /// unit requirement. |
1136 | | unsigned getMaxNumVGPRs(const MachineFunction &MF) const; |
1137 | | |
1138 | | void getPostRAMutations( |
1139 | | std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) |
1140 | | const override; |
1141 | | |
1142 | 485k | bool isWave32() const { |
1143 | 485k | return WavefrontSize == 32; |
1144 | 485k | } |
1145 | | |
1146 | 0 | const TargetRegisterClass *getBoolRC() const { |
1147 | 0 | return getRegisterInfo()->getBoolRC(); |
1148 | 0 | } |
1149 | | |
1150 | | /// \returns Maximum number of work groups per compute unit supported by the |
1151 | | /// subtarget and limited by given \p FlatWorkGroupSize. |
1152 | 342k | unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { |
1153 | 342k | return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); |
1154 | 342k | } |
1155 | | |
1156 | | /// \returns Minimum flat work group size supported by the subtarget. |
1157 | 431k | unsigned getMinFlatWorkGroupSize() const override { |
1158 | 431k | return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); |
1159 | 431k | } |
1160 | | |
1161 | | /// \returns Maximum flat work group size supported by the subtarget. |
1162 | 431k | unsigned getMaxFlatWorkGroupSize() const override { |
1163 | 431k | return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); |
1164 | 431k | } |
1165 | | |
1166 | | /// \returns Maximum number of waves per execution unit supported by the |
1167 | | /// subtarget and limited by given \p FlatWorkGroupSize. |
1168 | 53.6k | unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override { |
1169 | 53.6k | return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize); |
1170 | 53.6k | } |
1171 | | |
1172 | | /// \returns Minimum number of waves per execution unit supported by the |
1173 | | /// subtarget. |
1174 | 53.6k | unsigned getMinWavesPerEU() const override { |
1175 | 53.6k | return AMDGPU::IsaInfo::getMinWavesPerEU(this); |
1176 | 53.6k | } |
1177 | | }; |
1178 | | |
1179 | | class R600Subtarget final : public R600GenSubtargetInfo, |
1180 | | public AMDGPUSubtarget { |
1181 | | private: |
1182 | | R600InstrInfo InstrInfo; |
1183 | | R600FrameLowering FrameLowering; |
1184 | | bool FMA; |
1185 | | bool CaymanISA; |
1186 | | bool CFALUBug; |
1187 | | bool HasVertexCache; |
1188 | | bool R600ALUInst; |
1189 | | bool FP64; |
1190 | | short TexVTXClauseSize; |
1191 | | Generation Gen; |
1192 | | R600TargetLowering TLInfo; |
1193 | | InstrItineraryData InstrItins; |
1194 | | SelectionDAGTargetInfo TSInfo; |
1195 | | |
1196 | | public: |
1197 | | R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, |
1198 | | const TargetMachine &TM); |
1199 | | |
1200 | 928k | const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; } |
1201 | | |
1202 | 37.5k | const R600FrameLowering *getFrameLowering() const override { |
1203 | 37.5k | return &FrameLowering; |
1204 | 37.5k | } |
1205 | | |
1206 | 130k | const R600TargetLowering *getTargetLowering() const override { |
1207 | 130k | return &TLInfo; |
1208 | 130k | } |
1209 | | |
1210 | 1.61M | const R600RegisterInfo *getRegisterInfo() const override { |
1211 | 1.61M | return &InstrInfo.getRegisterInfo(); |
1212 | 1.61M | } |
1213 | | |
1214 | 7.05k | const InstrItineraryData *getInstrItineraryData() const override { |
1215 | 7.05k | return &InstrItins; |
1216 | 7.05k | } |
1217 | | |
1218 | | // Nothing implemented, just prevent crashes on use. |
1219 | 4.59k | const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { |
1220 | 4.59k | return &TSInfo; |
1221 | 4.59k | } |
1222 | | |
1223 | | void ParseSubtargetFeatures(StringRef CPU, StringRef FS); |
1224 | | |
1225 | 90.4k | Generation getGeneration() const { |
1226 | 90.4k | return Gen; |
1227 | 90.4k | } |
1228 | | |
1229 | 290 | unsigned getStackAlignment() const { |
1230 | 290 | return 4; |
1231 | 290 | } |
1232 | | |
1233 | | R600Subtarget &initializeSubtargetDependencies(const Triple &TT, |
1234 | | StringRef GPU, StringRef FS); |
1235 | | |
1236 | 1.16k | bool hasBFE() const { |
1237 | 1.16k | return (getGeneration() >= EVERGREEN); |
1238 | 1.16k | } |
1239 | | |
1240 | 290 | bool hasBFI() const { |
1241 | 290 | return (getGeneration() >= EVERGREEN); |
1242 | 290 | } |
1243 | | |
1244 | 580 | bool hasBCNT(unsigned Size) const { |
1245 | 580 | if (Size == 32) |
1246 | 290 | return (getGeneration() >= EVERGREEN); |
1247 | 290 | |
1248 | 290 | return false; |
1249 | 290 | } |
1250 | | |
1251 | 290 | bool hasBORROW() const { |
1252 | 290 | return (getGeneration() >= EVERGREEN); |
1253 | 290 | } |
1254 | | |
1255 | 290 | bool hasCARRY() const { |
1256 | 290 | return (getGeneration() >= EVERGREEN); |
1257 | 290 | } |
1258 | | |
1259 | 233k | bool hasCaymanISA() const { |
1260 | 233k | return CaymanISA; |
1261 | 233k | } |
1262 | | |
1263 | 290 | bool hasFFBL() const { |
1264 | 290 | return (getGeneration() >= EVERGREEN); |
1265 | 290 | } |
1266 | | |
1267 | 290 | bool hasFFBH() const { |
1268 | 290 | return (getGeneration() >= EVERGREEN); |
1269 | 290 | } |
1270 | | |
1271 | 299 | bool hasFMA() const { return FMA; } |
1272 | | |
1273 | 9.37k | bool hasCFAluBug() const { return CFALUBug; } |
1274 | | |
1275 | 131k | bool hasVertexCache() const { return HasVertexCache; } |
1276 | | |
1277 | 4.62k | short getTexVTXClauseSize() const { return TexVTXClauseSize; } |
1278 | | |
1279 | 7.05k | bool enableMachineScheduler() const override { |
1280 | 7.05k | return true; |
1281 | 7.05k | } |
1282 | | |
1283 | 2.29k | bool enableSubRegLiveness() const override { |
1284 | 2.29k | return true; |
1285 | 2.29k | } |
1286 | | |
1287 | | /// \returns Maximum number of work groups per compute unit supported by the |
1288 | | /// subtarget and limited by given \p FlatWorkGroupSize. |
1289 | 4.00k | unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { |
1290 | 4.00k | return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); |
1291 | 4.00k | } |
1292 | | |
1293 | | /// \returns Minimum flat work group size supported by the subtarget. |
1294 | 6.53k | unsigned getMinFlatWorkGroupSize() const override { |
1295 | 6.53k | return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); |
1296 | 6.53k | } |
1297 | | |
1298 | | /// \returns Maximum flat work group size supported by the subtarget. |
1299 | 6.53k | unsigned getMaxFlatWorkGroupSize() const override { |
1300 | 6.53k | return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); |
1301 | 6.53k | } |
1302 | | |
1303 | | /// \returns Maximum number of waves per execution unit supported by the |
1304 | | /// subtarget and limited by given \p FlatWorkGroupSize. |
1305 | 2.00k | unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override { |
1306 | 2.00k | return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize); |
1307 | 2.00k | } |
1308 | | |
1309 | | /// \returns Minimum number of waves per execution unit supported by the |
1310 | | /// subtarget. |
1311 | 2.00k | unsigned getMinWavesPerEU() const override { |
1312 | 2.00k | return AMDGPU::IsaInfo::getMinWavesPerEU(this); |
1313 | 2.00k | } |
1314 | | }; |
1315 | | |
1316 | | } // end namespace llvm |
1317 | | |
1318 | | #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H |