/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | /// \file |
10 | | /// |
11 | | /// The AMDGPUAsmPrinter is used to print both assembly string and also binary |
12 | | /// code. When passed an MCAsmStreamer it prints assembly and when passed |
13 | | /// an MCObjectStreamer it outputs binary code. |
14 | | // |
15 | | //===----------------------------------------------------------------------===// |
16 | | // |
17 | | |
18 | | #include "AMDGPUAsmPrinter.h" |
19 | | #include "AMDGPU.h" |
20 | | #include "AMDGPUSubtarget.h" |
21 | | #include "AMDGPUTargetMachine.h" |
22 | | #include "MCTargetDesc/AMDGPUInstPrinter.h" |
23 | | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
24 | | #include "MCTargetDesc/AMDGPUTargetStreamer.h" |
25 | | #include "R600AsmPrinter.h" |
26 | | #include "R600Defines.h" |
27 | | #include "R600MachineFunctionInfo.h" |
28 | | #include "R600RegisterInfo.h" |
29 | | #include "SIDefines.h" |
30 | | #include "SIInstrInfo.h" |
31 | | #include "SIMachineFunctionInfo.h" |
32 | | #include "SIRegisterInfo.h" |
33 | | #include "TargetInfo/AMDGPUTargetInfo.h" |
34 | | #include "Utils/AMDGPUBaseInfo.h" |
35 | | #include "llvm/BinaryFormat/ELF.h" |
36 | | #include "llvm/CodeGen/MachineFrameInfo.h" |
37 | | #include "llvm/IR/DiagnosticInfo.h" |
38 | | #include "llvm/MC/MCAssembler.h" |
39 | | #include "llvm/MC/MCContext.h" |
40 | | #include "llvm/MC/MCSectionELF.h" |
41 | | #include "llvm/MC/MCStreamer.h" |
42 | | #include "llvm/Support/AMDGPUMetadata.h" |
43 | | #include "llvm/Support/MathExtras.h" |
44 | | #include "llvm/Support/TargetParser.h" |
45 | | #include "llvm/Support/TargetRegistry.h" |
46 | | #include "llvm/Target/TargetLoweringObjectFile.h" |
47 | | |
48 | | using namespace llvm; |
49 | | using namespace llvm::AMDGPU; |
50 | | using namespace llvm::AMDGPU::HSAMD; |
51 | | |
52 | | // TODO: This should get the default rounding mode from the kernel. We just set |
53 | | // the default here, but this could change if the OpenCL rounding mode pragmas |
54 | | // are used. |
55 | | // |
56 | | // The denormal mode here should match what is reported by the OpenCL runtime |
57 | | // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but |
58 | | // can also be overridden to flush with the -cl-denorms-are-zero compiler flag. |
59 | | // |
60 | | // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double |
61 | | // precision, and leaves single precision to flush all and does not report |
62 | | // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports |
63 | | // CL_FP_DENORM for both. |
64 | | // |
65 | | // FIXME: It seems some instructions do not support single precision denormals |
66 | | // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, |
67 | | // and sin_f32, cos_f32 on most parts). |
68 | | |
69 | | // We want to use these instructions, and using fp32 denormals also causes |
70 | | // instructions to run at the double precision rate for the device so it's |
71 | | // probably best to just report no single precision denormals. |
72 | 23.1k | static uint32_t getFPMode(const MachineFunction &F) { |
73 | 23.1k | const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>(); |
74 | 23.1k | // TODO: Is there any real use for the flush in only / flush out only modes? |
75 | 23.1k | |
76 | 23.1k | uint32_t FP32Denormals = |
77 | 23.1k | ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE354 : FP_DENORM_FLUSH_IN_FLUSH_OUT22.8k ; |
78 | 23.1k | |
79 | 23.1k | uint32_t FP64Denormals = |
80 | 23.1k | ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE22.7k : FP_DENORM_FLUSH_IN_FLUSH_OUT359 ; |
81 | 23.1k | |
82 | 23.1k | return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | |
83 | 23.1k | FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | |
84 | 23.1k | FP_DENORM_MODE_SP(FP32Denormals) | |
85 | 23.1k | FP_DENORM_MODE_DP(FP64Denormals); |
86 | 23.1k | } |
87 | | |
88 | | static AsmPrinter * |
89 | | createAMDGPUAsmPrinterPass(TargetMachine &tm, |
90 | 2.41k | std::unique_ptr<MCStreamer> &&Streamer) { |
91 | 2.41k | return new AMDGPUAsmPrinter(tm, std::move(Streamer)); |
92 | 2.41k | } |
93 | | |
94 | 78.9k | extern "C" void LLVMInitializeAMDGPUAsmPrinter() { |
95 | 78.9k | TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(), |
96 | 78.9k | llvm::createR600AsmPrinterPass); |
97 | 78.9k | TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(), |
98 | 78.9k | createAMDGPUAsmPrinterPass); |
99 | 78.9k | } |
100 | | |
101 | | AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, |
102 | | std::unique_ptr<MCStreamer> Streamer) |
103 | 2.41k | : AsmPrinter(TM, std::move(Streamer)) { |
104 | 2.41k | if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) |
105 | 290 | HSAMetadataStream.reset(new MetadataStreamerV3()); |
106 | 2.12k | else |
107 | 2.12k | HSAMetadataStream.reset(new MetadataStreamerV2()); |
108 | 2.41k | } |
109 | | |
110 | 25.4k | StringRef AMDGPUAsmPrinter::getPassName() const { |
111 | 25.4k | return "AMDGPU Assembly Printer"; |
112 | 25.4k | } |
113 | | |
114 | 84.5k | const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const { |
115 | 84.5k | return TM.getMCSubtargetInfo(); |
116 | 84.5k | } |
117 | | |
118 | 15.2k | AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { |
119 | 15.2k | if (!OutStreamer) |
120 | 0 | return nullptr; |
121 | 15.2k | return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer()); |
122 | 15.2k | } |
123 | | |
124 | 2.41k | void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { |
125 | 2.41k | if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) { |
126 | 290 | std::string ExpectedTarget; |
127 | 290 | raw_string_ostream ExpectedTargetOS(ExpectedTarget); |
128 | 290 | IsaInfo::streamIsaVersion(getGlobalSTI(), ExpectedTargetOS); |
129 | 290 | |
130 | 290 | getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget); |
131 | 290 | } |
132 | 2.41k | |
133 | 2.41k | if (TM.getTargetTriple().getOS() != Triple::AMDHSA && |
134 | 2.41k | TM.getTargetTriple().getOS() != Triple::AMDPAL1.95k ) |
135 | 1.86k | return; |
136 | 542 | |
137 | 542 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) |
138 | 456 | HSAMetadataStream->begin(M); |
139 | 542 | |
140 | 542 | if (TM.getTargetTriple().getOS() == Triple::AMDPAL) |
141 | 86 | getTargetStreamer()->getPALMetadata()->readFromIR(M); |
142 | 542 | |
143 | 542 | if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) |
144 | 290 | return; |
145 | 252 | |
146 | 252 | // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2. |
147 | 252 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) |
148 | 166 | getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); |
149 | 252 | |
150 | 252 | // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2. |
151 | 252 | IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU()); |
152 | 252 | getTargetStreamer()->EmitDirectiveHSACodeObjectISA( |
153 | 252 | Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU"); |
154 | 252 | } |
155 | | |
156 | 2.39k | void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { |
157 | 2.39k | // Following code requires TargetStreamer to be present. |
158 | 2.39k | if (!getTargetStreamer()) |
159 | 1 | return; |
160 | 2.39k | |
161 | 2.39k | if (!IsaInfo::hasCodeObjectV3(getGlobalSTI())) { |
162 | 2.10k | // Emit ISA Version (NT_AMD_AMDGPU_ISA). |
163 | 2.10k | std::string ISAVersionString; |
164 | 2.10k | raw_string_ostream ISAVersionStream(ISAVersionString); |
165 | 2.10k | IsaInfo::streamIsaVersion(getGlobalSTI(), ISAVersionStream); |
166 | 2.10k | getTargetStreamer()->EmitISAVersion(ISAVersionStream.str()); |
167 | 2.10k | } |
168 | 2.39k | |
169 | 2.39k | // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). |
170 | 2.39k | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { |
171 | 454 | HSAMetadataStream->end(); |
172 | 454 | bool Success = HSAMetadataStream->emitTo(*getTargetStreamer()); |
173 | 454 | (void)Success; |
174 | 454 | assert(Success && "Malformed HSA Metadata"); |
175 | 454 | } |
176 | 2.39k | } |
177 | | |
178 | | bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( |
179 | 10.2k | const MachineBasicBlock *MBB) const { |
180 | 10.2k | if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB)) |
181 | 7.44k | return false; |
182 | 2.80k | |
183 | 2.80k | if (MBB->empty()) |
184 | 51 | return true; |
185 | 2.75k | |
186 | 2.75k | // If this is a block implementing a long branch, an expression relative to |
187 | 2.75k | // the start of the block is needed. |
188 | 2.75k | // XXX - Is there a smarter way to check this? |
189 | 2.75k | return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64); |
190 | 2.75k | } |
191 | | |
192 | 25.4k | void AMDGPUAsmPrinter::EmitFunctionBodyStart() { |
193 | 25.4k | const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
194 | 25.4k | if (!MFI.isEntryFunction()) |
195 | 2.25k | return; |
196 | 23.1k | |
197 | 23.1k | const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); |
198 | 23.1k | const Function &F = MF->getFunction(); |
199 | 23.1k | if (!STM.hasCodeObjectV3() && STM.isAmdHsaOrMesa(F)19.8k && |
200 | 23.1k | (1.17k F.getCallingConv() == CallingConv::AMDGPU_KERNEL1.17k || |
201 | 1.17k | F.getCallingConv() == CallingConv::SPIR_KERNEL3 )) { |
202 | 1.16k | amd_kernel_code_t KernelCode; |
203 | 1.16k | getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF); |
204 | 1.16k | getTargetStreamer()->EmitAMDKernelCodeT(KernelCode); |
205 | 1.16k | } |
206 | 23.1k | |
207 | 23.1k | if (STM.isAmdHsaOS()) |
208 | 4.03k | HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); |
209 | 23.1k | } |
210 | | |
211 | 25.4k | void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { |
212 | 25.4k | const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
213 | 25.4k | if (!MFI.isEntryFunction()) |
214 | 2.25k | return; |
215 | 23.1k | |
216 | 23.1k | if (!IsaInfo::hasCodeObjectV3(getGlobalSTI()) || |
217 | 23.1k | TM.getTargetTriple().getOS() != Triple::AMDHSA3.26k ) |
218 | 19.8k | return; |
219 | 3.26k | |
220 | 3.26k | auto &Streamer = getTargetStreamer()->getStreamer(); |
221 | 3.26k | auto &Context = Streamer.getContext(); |
222 | 3.26k | auto &ObjectFileInfo = *Context.getObjectFileInfo(); |
223 | 3.26k | auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection(); |
224 | 3.26k | |
225 | 3.26k | Streamer.PushSection(); |
226 | 3.26k | Streamer.SwitchSection(&ReadOnlySection); |
227 | 3.26k | |
228 | 3.26k | // CP microcode requires the kernel descriptor to be allocated on 64 byte |
229 | 3.26k | // alignment. |
230 | 3.26k | Streamer.EmitValueToAlignment(64, 0, 1, 0); |
231 | 3.26k | if (ReadOnlySection.getAlignment() < 64) |
232 | 233 | ReadOnlySection.setAlignment(64); |
233 | 3.26k | |
234 | 3.26k | const MCSubtargetInfo &STI = MF->getSubtarget(); |
235 | 3.26k | |
236 | 3.26k | SmallString<128> KernelName; |
237 | 3.26k | getNameWithPrefix(KernelName, &MF->getFunction()); |
238 | 3.26k | getTargetStreamer()->EmitAmdhsaKernelDescriptor( |
239 | 3.26k | STI, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), |
240 | 3.26k | CurrentProgramInfo.NumVGPRsForWavesPerEU, |
241 | 3.26k | CurrentProgramInfo.NumSGPRsForWavesPerEU - |
242 | 3.26k | IsaInfo::getNumExtraSGPRs(&STI, |
243 | 3.26k | CurrentProgramInfo.VCCUsed, |
244 | 3.26k | CurrentProgramInfo.FlatUsed), |
245 | 3.26k | CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, |
246 | 3.26k | hasXNACK(STI)); |
247 | 3.26k | |
248 | 3.26k | Streamer.PopSection(); |
249 | 3.26k | } |
250 | | |
251 | 25.4k | void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { |
252 | 25.4k | if (IsaInfo::hasCodeObjectV3(getGlobalSTI()) && |
253 | 25.4k | TM.getTargetTriple().getOS() == Triple::AMDHSA3.62k ) { |
254 | 3.62k | AsmPrinter::EmitFunctionEntryLabel(); |
255 | 3.62k | return; |
256 | 3.62k | } |
257 | 21.7k | |
258 | 21.7k | const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
259 | 21.7k | const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); |
260 | 21.7k | if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())19.8k ) { |
261 | 1.17k | SmallString<128> SymbolName; |
262 | 1.17k | getNameWithPrefix(SymbolName, &MF->getFunction()), |
263 | 1.17k | getTargetStreamer()->EmitAMDGPUSymbolType( |
264 | 1.17k | SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); |
265 | 1.17k | } |
266 | 21.7k | if (DumpCodeInstEmitter) { |
267 | 2 | // Disassemble function name label to text. |
268 | 2 | DisasmLines.push_back(MF->getName().str() + ":"); |
269 | 2 | DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); |
270 | 2 | HexLines.push_back(""); |
271 | 2 | } |
272 | 21.7k | |
273 | 21.7k | AsmPrinter::EmitFunctionEntryLabel(); |
274 | 21.7k | } |
275 | | |
276 | 28.8k | void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { |
277 | 28.8k | if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)2 ) { |
278 | 2 | // Write a line for the basic block label if it is not only fallthrough. |
279 | 2 | DisasmLines.push_back( |
280 | 2 | (Twine("BB") + Twine(getFunctionNumber()) |
281 | 2 | + "_" + Twine(MBB.getNumber()) + ":").str()); |
282 | 2 | DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); |
283 | 2 | HexLines.push_back(""); |
284 | 2 | } |
285 | 28.8k | AsmPrinter::EmitBasicBlockStart(MBB); |
286 | 28.8k | } |
287 | | |
288 | 348 | void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { |
289 | 348 | if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { |
290 | 232 | if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())224 ) { |
291 | 4 | OutContext.reportError({}, |
292 | 4 | Twine(GV->getName()) + |
293 | 4 | ": unsupported initializer for address space"); |
294 | 4 | return; |
295 | 4 | } |
296 | 228 | |
297 | 228 | // LDS variables aren't emitted in HSA or PAL yet. |
298 | 228 | const Triple::OSType OS = TM.getTargetTriple().getOS(); |
299 | 228 | if (OS == Triple::AMDHSA || OS == Triple::AMDPAL161 ) |
300 | 73 | return; |
301 | 155 | |
302 | 155 | MCSymbol *GVSym = getSymbol(GV); |
303 | 155 | |
304 | 155 | GVSym->redefineIfPossible(); |
305 | 155 | if (GVSym->isDefined() || GVSym->isVariable()) |
306 | 0 | report_fatal_error("symbol '" + Twine(GVSym->getName()) + |
307 | 0 | "' is already defined"); |
308 | 155 | |
309 | 155 | const DataLayout &DL = GV->getParent()->getDataLayout(); |
310 | 155 | uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); |
311 | 155 | unsigned Align = GV->getAlignment(); |
312 | 155 | if (!Align) |
313 | 38 | Align = 4; |
314 | 155 | |
315 | 155 | EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); |
316 | 155 | EmitLinkage(GV, GVSym); |
317 | 155 | if (auto TS = getTargetStreamer()) |
318 | 154 | TS->emitAMDGPULDS(GVSym, Size, Align); |
319 | 155 | return; |
320 | 155 | } |
321 | 116 | |
322 | 116 | AsmPrinter::EmitGlobalVariable(GV); |
323 | 116 | } |
324 | | |
325 | 2.40k | bool AMDGPUAsmPrinter::doFinalization(Module &M) { |
326 | 2.40k | CallGraphResourceInfo.clear(); |
327 | 2.40k | |
328 | 2.40k | // Pad with s_code_end to help tools and guard against instruction prefetch |
329 | 2.40k | // causing stale data in caches. Arguably this should be done by the linker, |
330 | 2.40k | // which is why this isn't done for Mesa. |
331 | 2.40k | const MCSubtargetInfo &STI = *getGlobalSTI(); |
332 | 2.40k | if (AMDGPU::isGFX10(STI) && |
333 | 2.40k | (153 STI.getTargetTriple().getOS() == Triple::AMDHSA153 || |
334 | 153 | STI.getTargetTriple().getOS() == Triple::AMDPAL121 )) { |
335 | 35 | OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); |
336 | 35 | getTargetStreamer()->EmitCodeEnd(); |
337 | 35 | } |
338 | 2.40k | |
339 | 2.40k | return AsmPrinter::doFinalization(M); |
340 | 2.40k | } |
341 | | |
342 | | // Print comments that apply to both callable functions and entry points. |
343 | | void AMDGPUAsmPrinter::emitCommonFunctionComments( |
344 | | uint32_t NumVGPR, |
345 | | uint32_t NumSGPR, |
346 | | uint64_t ScratchSize, |
347 | | uint64_t CodeSize, |
348 | 24.7k | const AMDGPUMachineFunction *MFI) { |
349 | 24.7k | OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); |
350 | 24.7k | OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); |
351 | 24.7k | OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); |
352 | 24.7k | OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); |
353 | 24.7k | OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), |
354 | 24.7k | false); |
355 | 24.7k | } |
356 | | |
357 | | uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( |
358 | 3.26k | const MachineFunction &MF) const { |
359 | 3.26k | const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); |
360 | 3.26k | uint16_t KernelCodeProperties = 0; |
361 | 3.26k | |
362 | 3.26k | if (MFI.hasPrivateSegmentBuffer()) { |
363 | 3.26k | KernelCodeProperties |= |
364 | 3.26k | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; |
365 | 3.26k | } |
366 | 3.26k | if (MFI.hasDispatchPtr()) { |
367 | 7 | KernelCodeProperties |= |
368 | 7 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; |
369 | 7 | } |
370 | 3.26k | if (MFI.hasQueuePtr()) { |
371 | 4 | KernelCodeProperties |= |
372 | 4 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; |
373 | 4 | } |
374 | 3.26k | if (MFI.hasKernargSegmentPtr()) { |
375 | 2.72k | KernelCodeProperties |= |
376 | 2.72k | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; |
377 | 2.72k | } |
378 | 3.26k | if (MFI.hasDispatchID()) { |
379 | 0 | KernelCodeProperties |= |
380 | 0 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; |
381 | 0 | } |
382 | 3.26k | if (MFI.hasFlatScratchInit()) { |
383 | 240 | KernelCodeProperties |= |
384 | 240 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; |
385 | 240 | } |
386 | 3.26k | if (MF.getSubtarget<GCNSubtarget>().isWave32()) { |
387 | 864 | KernelCodeProperties |= |
388 | 864 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; |
389 | 864 | } |
390 | 3.26k | |
391 | 3.26k | return KernelCodeProperties; |
392 | 3.26k | } |
393 | | |
394 | | amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( |
395 | | const MachineFunction &MF, |
396 | 3.26k | const SIProgramInfo &PI) const { |
397 | 3.26k | amdhsa::kernel_descriptor_t KernelDescriptor; |
398 | 3.26k | memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor)); |
399 | 3.26k | |
400 | 3.26k | assert(isUInt<32>(PI.ScratchSize)); |
401 | 3.26k | assert(isUInt<32>(PI.ComputePGMRSrc1)); |
402 | 3.26k | assert(isUInt<32>(PI.ComputePGMRSrc2)); |
403 | 3.26k | |
404 | 3.26k | KernelDescriptor.group_segment_fixed_size = PI.LDSSize; |
405 | 3.26k | KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; |
406 | 3.26k | KernelDescriptor.compute_pgm_rsrc1 = PI.ComputePGMRSrc1; |
407 | 3.26k | KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2; |
408 | 3.26k | KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); |
409 | 3.26k | |
410 | 3.26k | return KernelDescriptor; |
411 | 3.26k | } |
412 | | |
413 | 25.4k | bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { |
414 | 25.4k | CurrentProgramInfo = SIProgramInfo(); |
415 | 25.4k | |
416 | 25.4k | const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); |
417 | 25.4k | |
418 | 25.4k | // The starting address of all shader programs must be 256 bytes aligned. |
419 | 25.4k | // Regular functions just need the basic required instruction alignment. |
420 | 25.4k | MF.setAlignment(MFI->isEntryFunction() ? 823.1k : 22.25k ); |
421 | 25.4k | |
422 | 25.4k | SetupMachineFunction(MF); |
423 | 25.4k | |
424 | 25.4k | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
425 | 25.4k | MCContext &Context = getObjFileLowering().getContext(); |
426 | 25.4k | // FIXME: This should be an explicit check for Mesa. |
427 | 25.4k | if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()20.8k ) { |
428 | 20.4k | MCSectionELF *ConfigSection = |
429 | 20.4k | Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); |
430 | 20.4k | OutStreamer->SwitchSection(ConfigSection); |
431 | 20.4k | } |
432 | 25.4k | |
433 | 25.4k | if (MFI->isEntryFunction()) { |
434 | 23.1k | getSIProgramInfo(CurrentProgramInfo, MF); |
435 | 23.1k | } else { |
436 | 2.25k | auto I = CallGraphResourceInfo.insert( |
437 | 2.25k | std::make_pair(&MF.getFunction(), SIFunctionResourceInfo())); |
438 | 2.25k | SIFunctionResourceInfo &Info = I.first->second; |
439 | 2.25k | assert(I.second && "should only be called once per function"); |
440 | 2.25k | Info = analyzeResourceUsage(MF); |
441 | 2.25k | } |
442 | 25.4k | |
443 | 25.4k | if (STM.isAmdPalOS()) |
444 | 394 | EmitPALMetadata(MF, CurrentProgramInfo); |
445 | 25.0k | else if (!STM.isAmdHsaOS()) { |
446 | 20.4k | EmitProgramInfoSI(MF, CurrentProgramInfo); |
447 | 20.4k | } |
448 | 25.4k | |
449 | 25.4k | DumpCodeInstEmitter = nullptr; |
450 | 25.4k | if (STM.dumpCode()) { |
451 | 2 | // For -dumpcode, get the assembler out of the streamer, even if it does |
452 | 2 | // not really want to let us have it. This only works with -filetype=obj. |
453 | 2 | bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing(); |
454 | 2 | OutStreamer->setUseAssemblerInfoForParsing(true); |
455 | 2 | MCAssembler *Assembler = OutStreamer->getAssemblerPtr(); |
456 | 2 | OutStreamer->setUseAssemblerInfoForParsing(SaveFlag); |
457 | 2 | if (Assembler) |
458 | 2 | DumpCodeInstEmitter = Assembler->getEmitterPtr(); |
459 | 2 | } |
460 | 25.4k | |
461 | 25.4k | DisasmLines.clear(); |
462 | 25.4k | HexLines.clear(); |
463 | 25.4k | DisasmLineMaxLen = 0; |
464 | 25.4k | |
465 | 25.4k | EmitFunctionBody(); |
466 | 25.4k | |
467 | 25.4k | if (isVerbose()) { |
468 | 24.7k | MCSectionELF *CommentSection = |
469 | 24.7k | Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); |
470 | 24.7k | OutStreamer->SwitchSection(CommentSection); |
471 | 24.7k | |
472 | 24.7k | if (!MFI->isEntryFunction()) { |
473 | 2.22k | OutStreamer->emitRawComment(" Function info:", false); |
474 | 2.22k | SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()]; |
475 | 2.22k | emitCommonFunctionComments( |
476 | 2.22k | Info.NumVGPR, |
477 | 2.22k | Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()), |
478 | 2.22k | Info.PrivateSegmentSize, |
479 | 2.22k | getFunctionCodeSize(MF), MFI); |
480 | 2.22k | return false; |
481 | 2.22k | } |
482 | 22.4k | |
483 | 22.4k | OutStreamer->emitRawComment(" Kernel info:", false); |
484 | 22.4k | emitCommonFunctionComments(CurrentProgramInfo.NumVGPR, |
485 | 22.4k | CurrentProgramInfo.NumSGPR, |
486 | 22.4k | CurrentProgramInfo.ScratchSize, |
487 | 22.4k | getFunctionCodeSize(MF), MFI); |
488 | 22.4k | |
489 | 22.4k | OutStreamer->emitRawComment( |
490 | 22.4k | " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); |
491 | 22.4k | OutStreamer->emitRawComment( |
492 | 22.4k | " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false); |
493 | 22.4k | OutStreamer->emitRawComment( |
494 | 22.4k | " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + |
495 | 22.4k | " bytes/workgroup (compile time only)", false); |
496 | 22.4k | |
497 | 22.4k | OutStreamer->emitRawComment( |
498 | 22.4k | " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false); |
499 | 22.4k | OutStreamer->emitRawComment( |
500 | 22.4k | " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false); |
501 | 22.4k | |
502 | 22.4k | OutStreamer->emitRawComment( |
503 | 22.4k | " NumSGPRsForWavesPerEU: " + |
504 | 22.4k | Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false); |
505 | 22.4k | OutStreamer->emitRawComment( |
506 | 22.4k | " NumVGPRsForWavesPerEU: " + |
507 | 22.4k | Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); |
508 | 22.4k | |
509 | 22.4k | OutStreamer->emitRawComment( |
510 | 22.4k | " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); |
511 | 22.4k | |
512 | 22.4k | OutStreamer->emitRawComment( |
513 | 22.4k | " COMPUTE_PGM_RSRC2:USER_SGPR: " + |
514 | 22.4k | Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false); |
515 | 22.4k | OutStreamer->emitRawComment( |
516 | 22.4k | " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + |
517 | 22.4k | Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false); |
518 | 22.4k | OutStreamer->emitRawComment( |
519 | 22.4k | " COMPUTE_PGM_RSRC2:TGID_X_EN: " + |
520 | 22.4k | Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); |
521 | 22.4k | OutStreamer->emitRawComment( |
522 | 22.4k | " COMPUTE_PGM_RSRC2:TGID_Y_EN: " + |
523 | 22.4k | Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); |
524 | 22.4k | OutStreamer->emitRawComment( |
525 | 22.4k | " COMPUTE_PGM_RSRC2:TGID_Z_EN: " + |
526 | 22.4k | Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); |
527 | 22.4k | OutStreamer->emitRawComment( |
528 | 22.4k | " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + |
529 | 22.4k | Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)), |
530 | 22.4k | false); |
531 | 22.4k | } |
532 | 25.4k | |
533 | 25.4k | if (23.1k DumpCodeInstEmitter23.1k ) { |
534 | 2 | |
535 | 2 | OutStreamer->SwitchSection( |
536 | 2 | Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); |
537 | 2 | |
538 | 20 | for (size_t i = 0; i < DisasmLines.size(); ++i18 ) { |
539 | 18 | std::string Comment = "\n"; |
540 | 18 | if (!HexLines[i].empty()) { |
541 | 14 | Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' '); |
542 | 14 | Comment += " ; " + HexLines[i] + "\n"; |
543 | 14 | } |
544 | 18 | |
545 | 18 | OutStreamer->EmitBytes(StringRef(DisasmLines[i])); |
546 | 18 | OutStreamer->EmitBytes(StringRef(Comment)); |
547 | 18 | } |
548 | 2 | } |
549 | 23.1k | |
550 | 23.1k | return false; |
551 | 25.4k | } |
552 | | |
553 | 24.7k | uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { |
554 | 24.7k | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
555 | 24.7k | const SIInstrInfo *TII = STM.getInstrInfo(); |
556 | 24.7k | |
557 | 24.7k | uint64_t CodeSize = 0; |
558 | 24.7k | |
559 | 28.0k | for (const MachineBasicBlock &MBB : MF) { |
560 | 427k | for (const MachineInstr &MI : MBB) { |
561 | 427k | // TODO: CodeSize should account for multiple functions. |
562 | 427k | |
563 | 427k | // TODO: Should we count size of debug info? |
564 | 427k | if (MI.isDebugInstr()) |
565 | 46 | continue; |
566 | 427k | |
567 | 427k | CodeSize += TII->getInstSizeInBytes(MI); |
568 | 427k | } |
569 | 28.0k | } |
570 | 24.7k | |
571 | 24.7k | return CodeSize; |
572 | 24.7k | } |
573 | | |
574 | | static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, |
575 | | const SIInstrInfo &TII, |
576 | 17.5k | unsigned Reg) { |
577 | 17.5k | for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) { |
578 | 12.8k | if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent())12.8k ) |
579 | 55 | return true; |
580 | 12.8k | } |
581 | 17.5k | |
582 | 17.5k | return false17.4k ; |
583 | 17.5k | } |
584 | | |
585 | | int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs( |
586 | 2.22k | const GCNSubtarget &ST) const { |
587 | 2.22k | return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST, |
588 | 2.22k | UsesVCC, UsesFlatScratch); |
589 | 2.22k | } |
590 | | |
591 | | AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( |
592 | 25.4k | const MachineFunction &MF) const { |
593 | 25.4k | SIFunctionResourceInfo Info; |
594 | 25.4k | |
595 | 25.4k | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
596 | 25.4k | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
597 | 25.4k | const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
598 | 25.4k | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
599 | 25.4k | const SIInstrInfo *TII = ST.getInstrInfo(); |
600 | 25.4k | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
601 | 25.4k | |
602 | 25.4k | Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) || |
603 | 25.4k | MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI)19.1k ; |
604 | 25.4k | |
605 | 25.4k | // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat |
606 | 25.4k | // instructions aren't used to access the scratch buffer. Inline assembly may |
607 | 25.4k | // need it though. |
608 | 25.4k | // |
609 | 25.4k | // If we only have implicit uses of flat_scr on flat instructions, it is not |
610 | 25.4k | // really needed. |
611 | 25.4k | if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit()6.30k && |
612 | 25.4k | (5.87k !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR)5.87k && |
613 | 5.87k | !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO)5.84k && |
614 | 5.87k | !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI)5.83k )) { |
615 | 5.82k | Info.UsesFlatScratch = false; |
616 | 5.82k | } |
617 | 25.4k | |
618 | 25.4k | Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); |
619 | 25.4k | Info.PrivateSegmentSize = FrameInfo.getStackSize(); |
620 | 25.4k | if (MFI->isStackRealigned()) |
621 | 8 | Info.PrivateSegmentSize += FrameInfo.getMaxAlignment(); |
622 | 25.4k | |
623 | 25.4k | |
624 | 25.4k | Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || |
625 | 25.4k | MRI.isPhysRegUsed(AMDGPU::VCC_HI)19.6k ; |
626 | 25.4k | |
627 | 25.4k | // If there are no calls, MachineRegisterInfo can tell us the used register |
628 | 25.4k | // count easily. |
629 | 25.4k | // A tail call isn't considered a call for MachineFrameInfo's purposes. |
630 | 25.4k | if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()24.7k ) { |
631 | 24.7k | MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; |
632 | 6.24M | for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { |
633 | 6.24M | if (MRI.isPhysRegUsed(Reg)) { |
634 | 22.8k | HighestVGPRReg = Reg; |
635 | 22.8k | break; |
636 | 22.8k | } |
637 | 6.22M | MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg); |
638 | 6.22M | if (MRI.isPhysRegUsed(AReg)) { |
639 | 21 | HighestVGPRReg = AReg; |
640 | 21 | break; |
641 | 21 | } |
642 | 6.22M | } |
643 | 24.7k | |
644 | 24.7k | MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; |
645 | 2.41M | for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { |
646 | 2.41M | if (MRI.isPhysRegUsed(Reg)) { |
647 | 22.5k | HighestSGPRReg = Reg; |
648 | 22.5k | break; |
649 | 22.5k | } |
650 | 2.41M | } |
651 | 24.7k | |
652 | 24.7k | // We found the maximum register index. They start at 0, so add one to get the |
653 | 24.7k | // number of registers. |
654 | 24.7k | Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 01.83k : |
655 | 24.7k | TRI.getHWRegIndex(HighestVGPRReg) + 122.9k ; |
656 | 24.7k | Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 02.15k : |
657 | 24.7k | TRI.getHWRegIndex(HighestSGPRReg) + 122.5k ; |
658 | 24.7k | |
659 | 24.7k | return Info; |
660 | 24.7k | } |
661 | 661 | |
662 | 661 | int32_t MaxVGPR = -1; |
663 | 661 | int32_t MaxSGPR = -1; |
664 | 661 | uint64_t CalleeFrameSize = 0; |
665 | 661 | |
666 | 716 | for (const MachineBasicBlock &MBB : MF) { |
667 | 12.6k | for (const MachineInstr &MI : MBB) { |
668 | 12.6k | // TODO: Check regmasks? Do they occur anywhere except calls? |
669 | 52.0k | for (const MachineOperand &MO : MI.operands()) { |
670 | 52.0k | unsigned Width = 0; |
671 | 52.0k | bool IsSGPR = false; |
672 | 52.0k | |
673 | 52.0k | if (!MO.isReg()) |
674 | 16.7k | continue; |
675 | 35.2k | |
676 | 35.2k | unsigned Reg = MO.getReg(); |
677 | 35.2k | switch (Reg) { |
678 | 35.2k | case AMDGPU::EXEC: |
679 | 7.15k | case AMDGPU::EXEC_LO: |
680 | 7.15k | case AMDGPU::EXEC_HI: |
681 | 7.15k | case AMDGPU::SCC: |
682 | 7.15k | case AMDGPU::M0: |
683 | 7.15k | case AMDGPU::SRC_SHARED_BASE: |
684 | 7.15k | case AMDGPU::SRC_SHARED_LIMIT: |
685 | 7.15k | case AMDGPU::SRC_PRIVATE_BASE: |
686 | 7.15k | case AMDGPU::SRC_PRIVATE_LIMIT: |
687 | 7.15k | case AMDGPU::SGPR_NULL: |
688 | 7.15k | continue; |
689 | 7.15k | |
690 | 7.15k | case AMDGPU::SRC_POPS_EXITING_WAVE_ID: |
691 | 0 | llvm_unreachable("src_pops_exiting_wave_id should not be used"); |
692 | 7.15k | |
693 | 7.15k | case AMDGPU::NoRegister: |
694 | 5 | assert(MI.isDebugInstr()); |
695 | 5 | continue; |
696 | 7.15k | |
697 | 7.15k | case AMDGPU::VCC: |
698 | 610 | case AMDGPU::VCC_LO: |
699 | 610 | case AMDGPU::VCC_HI: |
700 | 610 | Info.UsesVCC = true; |
701 | 610 | continue; |
702 | 610 | |
703 | 763 | case AMDGPU::FLAT_SCR: |
704 | 763 | case AMDGPU::FLAT_SCR_LO: |
705 | 763 | case AMDGPU::FLAT_SCR_HI: |
706 | 763 | continue; |
707 | 763 | |
708 | 763 | case AMDGPU::XNACK_MASK: |
709 | 0 | case AMDGPU::XNACK_MASK_LO: |
710 | 0 | case AMDGPU::XNACK_MASK_HI: |
711 | 0 | llvm_unreachable("xnack_mask registers should not be used"); |
712 | 0 |
|
713 | 0 | case AMDGPU::LDS_DIRECT: |
714 | 0 | llvm_unreachable("lds_direct register should not be used"); |
715 | 0 |
|
716 | 0 | case AMDGPU::TBA: |
717 | 0 | case AMDGPU::TBA_LO: |
718 | 0 | case AMDGPU::TBA_HI: |
719 | 0 | case AMDGPU::TMA: |
720 | 0 | case AMDGPU::TMA_LO: |
721 | 0 | case AMDGPU::TMA_HI: |
722 | 0 | llvm_unreachable("trap handler registers should not be used"); |
723 | 0 |
|
724 | 0 | case AMDGPU::SRC_VCCZ: |
725 | 0 | llvm_unreachable("src_vccz register should not be used"); |
726 | 0 |
|
727 | 0 | case AMDGPU::SRC_EXECZ: |
728 | 0 | llvm_unreachable("src_execz register should not be used"); |
729 | 0 |
|
730 | 0 | case AMDGPU::SRC_SCC: |
731 | 0 | llvm_unreachable("src_scc register should not be used"); |
732 | 0 |
|
733 | 26.7k | default: |
734 | 26.7k | break; |
735 | 26.7k | } |
736 | 26.7k | |
737 | 26.7k | if (AMDGPU::SReg_32RegClass.contains(Reg)) { |
738 | 9.09k | assert(!AMDGPU::TTMP_32RegClass.contains(Reg) && |
739 | 9.09k | "trap handler registers should not be used"); |
740 | 9.09k | IsSGPR = true; |
741 | 9.09k | Width = 1; |
742 | 17.6k | } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) { |
743 | 9.14k | IsSGPR = false; |
744 | 9.14k | Width = 1; |
745 | 9.14k | } else if (8.52k AMDGPU::AGPR_32RegClass.contains(Reg)8.52k ) { |
746 | 0 | IsSGPR = false; |
747 | 0 | Width = 1; |
748 | 8.52k | } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { |
749 | 4.53k | assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && |
750 | 4.53k | "trap handler registers should not be used"); |
751 | 4.53k | IsSGPR = true; |
752 | 4.53k | Width = 2; |
753 | 4.53k | } else if (3.98k AMDGPU::VReg_64RegClass.contains(Reg)3.98k ) { |
754 | 461 | IsSGPR = false; |
755 | 461 | Width = 2; |
756 | 3.52k | } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { |
757 | 0 | IsSGPR = false; |
758 | 0 | Width = 2; |
759 | 3.52k | } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { |
760 | 6 | IsSGPR = false; |
761 | 6 | Width = 3; |
762 | 3.51k | } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { |
763 | 3 | Width = 3; |
764 | 3.51k | } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { |
765 | 3.31k | assert(!AMDGPU::TTMP_128RegClass.contains(Reg) && |
766 | 3.31k | "trap handler registers should not be used"); |
767 | 3.31k | IsSGPR = true; |
768 | 3.31k | Width = 4; |
769 | 3.31k | } else if (204 AMDGPU::VReg_128RegClass.contains(Reg)204 ) { |
770 | 196 | IsSGPR = false; |
771 | 196 | Width = 4; |
772 | 196 | } else if (8 AMDGPU::AReg_128RegClass.contains(Reg)8 ) { |
773 | 0 | IsSGPR = false; |
774 | 0 | Width = 4; |
775 | 8 | } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { |
776 | 0 | assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && |
777 | 0 | "trap handler registers should not be used"); |
778 | 0 | IsSGPR = true; |
779 | 0 | Width = 8; |
780 | 8 | } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { |
781 | 0 | IsSGPR = false; |
782 | 0 | Width = 8; |
783 | 8 | } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { |
784 | 8 | assert(!AMDGPU::TTMP_512RegClass.contains(Reg) && |
785 | 8 | "trap handler registers should not be used"); |
786 | 8 | IsSGPR = true; |
787 | 8 | Width = 16; |
788 | 8 | } else if (0 AMDGPU::VReg_512RegClass.contains(Reg)0 ) { |
789 | 0 | IsSGPR = false; |
790 | 0 | Width = 16; |
791 | 0 | } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { |
792 | 0 | IsSGPR = false; |
793 | 0 | Width = 16; |
794 | 0 | } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { |
795 | 0 | IsSGPR = true; |
796 | 0 | Width = 32; |
797 | 0 | } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) { |
798 | 0 | IsSGPR = false; |
799 | 0 | Width = 32; |
800 | 0 | } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { |
801 | 0 | IsSGPR = false; |
802 | 0 | Width = 32; |
803 | 0 | } else { |
804 | 0 | llvm_unreachable("Unknown register class"); |
805 | 0 | } |
806 | 26.7k | unsigned HWReg = TRI.getHWRegIndex(Reg); |
807 | 26.7k | int MaxUsed = HWReg + Width - 1; |
808 | 26.7k | if (IsSGPR) { |
809 | 16.9k | MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed1.34k : MaxSGPR15.6k ; |
810 | 16.9k | } else { |
811 | 9.81k | MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed1.52k : MaxVGPR8.29k ; |
812 | 9.81k | } |
813 | 26.7k | } |
814 | 12.6k | |
815 | 12.6k | if (MI.isCall()) { |
816 | 694 | // Pseudo used just to encode the underlying global. Is there a better |
817 | 694 | // way to track this? |
818 | 694 | |
819 | 694 | const MachineOperand *CalleeOp |
820 | 694 | = TII->getNamedOperand(MI, AMDGPU::OpName::callee); |
821 | 694 | const Function *Callee = cast<Function>(CalleeOp->getGlobal()); |
822 | 694 | if (Callee->isDeclaration()) { |
823 | 445 | // If this is a call to an external function, we can't do much. Make |
824 | 445 | // conservative guesses. |
825 | 445 | |
826 | 445 | // 48 SGPRs - vcc, - flat_scr, -xnack |
827 | 445 | int MaxSGPRGuess = |
828 | 445 | 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace()); |
829 | 445 | MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); |
830 | 445 | MaxVGPR = std::max(MaxVGPR, 23); |
831 | 445 | |
832 | 445 | CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384)); |
833 | 445 | Info.UsesVCC = true; |
834 | 445 | Info.UsesFlatScratch = ST.hasFlatAddressSpace(); |
835 | 445 | Info.HasDynamicallySizedStack = true; |
836 | 445 | } else { |
837 | 249 | // We force CodeGen to run in SCC order, so the callee's register |
838 | 249 | // usage etc. should be the cumulative usage of all callees. |
839 | 249 | |
840 | 249 | auto I = CallGraphResourceInfo.find(Callee); |
841 | 249 | if (I == CallGraphResourceInfo.end()) { |
842 | 1 | // Avoid crashing on undefined behavior with an illegal call to a |
843 | 1 | // kernel. If a callsite's calling convention doesn't match the |
844 | 1 | // function's, it's undefined behavior. If the callsite calling |
845 | 1 | // convention does match, that would have errored earlier. |
846 | 1 | // FIXME: The verifier shouldn't allow this. |
847 | 1 | if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) |
848 | 1 | report_fatal_error("invalid call to entry function"); |
849 | 0 | |
850 | 0 | llvm_unreachable("callee should have been handled before caller"); |
851 | 0 | } |
852 | 248 | |
853 | 248 | MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); |
854 | 248 | MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); |
855 | 248 | CalleeFrameSize |
856 | 248 | = std::max(I->second.PrivateSegmentSize, CalleeFrameSize); |
857 | 248 | Info.UsesVCC |= I->second.UsesVCC; |
858 | 248 | Info.UsesFlatScratch |= I->second.UsesFlatScratch; |
859 | 248 | Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack; |
860 | 248 | Info.HasRecursion |= I->second.HasRecursion; |
861 | 248 | } |
862 | 694 | |
863 | 694 | if (693 !Callee->doesNotRecurse()693 ) |
864 | 637 | Info.HasRecursion = true; |
865 | 693 | } |
866 | 12.6k | } |
867 | 716 | } |
868 | 661 | |
869 | 661 | Info.NumExplicitSGPR = MaxSGPR + 1; |
870 | 660 | Info.NumVGPR = MaxVGPR + 1; |
871 | 660 | Info.PrivateSegmentSize += CalleeFrameSize; |
872 | 660 | |
873 | 660 | return Info; |
874 | 661 | } |
875 | | |
/// Fill \p ProgInfo with the program resource information (register counts,
/// scratch/LDS sizes, and the packed COMPUTE_PGM_RSRC1/RSRC2 words) for \p MF.
/// Emits module diagnostics when the function exceeds hardware limits.
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  SIFunctionResourceInfo Info = analyzeResourceUsage(MF);

  ProgInfo.NumVGPR = Info.NumVGPR;
  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
  ProgInfo.VCCUsed = Info.UsesVCC;
  ProgInfo.FlatUsed = Info.UsesFlatScratch;
  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;

  // The scratch-size field in the kernel descriptor is 32 bits; diagnose
  // (rather than silently truncate) anything larger.
  if (!isUInt<32>(ProgInfo.ScratchSize)) {
    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
                                          ProgInfo.ScratchSize, DS_Error);
    MF.getFunction().getContext().diagnose(DiagStackSize);
  }

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
      &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "addressable scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR += ExtraSGPRs;

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
  for (auto &Arg : MF.getFunction().args()) {
    // Round each argument up to whole 32-bit registers.
    unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32;
    if (Arg.hasAttribute(Attribute::InReg))
      WaveDispatchNumSGPR += NumRegs;
    else
      WaveDispatchNumVGPR += NumRegs;
  }
  ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
  ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  ProgInfo.NumSGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
  ProgInfo.NumVGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));

  // Re-check the limit after ExtraSGPRs were added; pre-VI targets (and those
  // with the SGPR init bug) are clamped here instead of above.
  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
    }
  }

  // Hardware with the SGPR init bug must always report the fixed count,
  // regardless of actual usage.
  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
    ProgInfo.NumSGPRsForWavesPerEU =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
                                     MFI->getLDSSize(), DS_Error);
    Ctx.diagnose(Diag);
  }

  // Convert register counts to the block granularity the hardware expects.
  ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
      &STM, ProgInfo.NumSGPRsForWavesPerEU);
  ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
      &STM, ProgInfo.NumVGPRsForWavesPerEU);

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(MF);

  const SIModeRegisterDefaults Mode = MFI->getMode();
  ProgInfo.IEEEMode = Mode.IEEE;

  // Make clamp modifier on NaN input returns 0.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;

  unsigned LDSAlignShift;
  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  unsigned LDSSpillSize =
    MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();

  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // Scratch is allocated in 256 dword blocks.
  unsigned ScratchAlignShift = 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks =
      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
              1ULL << ScratchAlignShift) >>
      ScratchAlignShift;

  // GFX10+ adds the WGP/CU mode and memory-ordering bits.
  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
    ProgInfo.MemOrdered = 1;
  }

  // Pack COMPUTE_PGM_RSRC1 from the individual fields.
  ProgInfo.ComputePGMRSrc1 =
      S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
      S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
      S_00B848_PRIORITY(ProgInfo.Priority) |
      S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
      S_00B848_PRIV(ProgInfo.Priv) |
      S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
      S_00B848_IEEE_MODE(ProgInfo.IEEEMode) |
      S_00B848_WGP_MODE(ProgInfo.WgpMode) |
      S_00B848_MEM_ORDERED(ProgInfo.MemOrdered);

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  ProgInfo.ComputePGMRSrc2 =
      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
      // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
      S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
      S_00B84C_EXCP_EN_MSB(0) |
      // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
      S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
      S_00B84C_EXCP_EN(0);
}
1061 | | |
1062 | 20.4k | static unsigned getRsrcReg(CallingConv::ID CallConv) { |
1063 | 20.4k | switch (CallConv) { |
1064 | 20.4k | default: 17.5k LLVM_FALLTHROUGH17.5k ; |
1065 | 17.6k | case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; |
1066 | 17.5k | case CallingConv::AMDGPU_LS: return 0 R_00B528_SPI_SHADER_PGM_RSRC1_LS0 ; |
1067 | 17.5k | case CallingConv::AMDGPU_HS: return 14 R_00B428_SPI_SHADER_PGM_RSRC1_HS14 ; |
1068 | 17.5k | case CallingConv::AMDGPU_ES: return 0 R_00B328_SPI_SHADER_PGM_RSRC1_ES0 ; |
1069 | 17.5k | case CallingConv::AMDGPU_GS: return 119 R_00B228_SPI_SHADER_PGM_RSRC1_GS119 ; |
1070 | 17.5k | case CallingConv::AMDGPU_VS: return 190 R_00B128_SPI_SHADER_PGM_RSRC1_VS190 ; |
1071 | 17.5k | case CallingConv::AMDGPU_PS: return 2.49k R_00B028_SPI_SHADER_PGM_RSRC1_PS2.49k ; |
1072 | 20.4k | } |
1073 | 20.4k | } |
1074 | | |
1075 | | void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, |
1076 | 20.4k | const SIProgramInfo &CurrentProgramInfo) { |
1077 | 20.4k | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1078 | 20.4k | unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); |
1079 | 20.4k | |
1080 | 20.4k | if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { |
1081 | 17.6k | OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); |
1082 | 17.6k | |
1083 | 17.6k | OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4); |
1084 | 17.6k | |
1085 | 17.6k | OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); |
1086 | 17.6k | OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4); |
1087 | 17.6k | |
1088 | 17.6k | OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); |
1089 | 17.6k | OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); |
1090 | 17.6k | |
1091 | 17.6k | // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = |
1092 | 17.6k | // 0" comment but I don't see a corresponding field in the register spec. |
1093 | 17.6k | } else { |
1094 | 2.82k | OutStreamer->EmitIntValue(RsrcReg, 4); |
1095 | 2.82k | OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | |
1096 | 2.82k | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); |
1097 | 2.82k | OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); |
1098 | 2.82k | OutStreamer->EmitIntValue( |
1099 | 2.82k | S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); |
1100 | 2.82k | } |
1101 | 20.4k | |
1102 | 20.4k | if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { |
1103 | 2.49k | OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); |
1104 | 2.49k | OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4); |
1105 | 2.49k | OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); |
1106 | 2.49k | OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); |
1107 | 2.49k | OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); |
1108 | 2.49k | OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); |
1109 | 2.49k | } |
1110 | 20.4k | |
1111 | 20.4k | OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4); |
1112 | 20.4k | OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4); |
1113 | 20.4k | OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4); |
1114 | 20.4k | OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4); |
1115 | 20.4k | } |
1116 | | |
1117 | | // This is the equivalent of EmitProgramInfoSI above, but for when the OS type |
1118 | | // is AMDPAL. It stores each compute/SPI register setting and other PAL |
1119 | | // metadata items into the PALMD::Metadata, combining with any provided by the |
1120 | | // frontend as LLVM metadata. Once all functions are written, the PAL metadata |
1121 | | // is then written as a single block in the .note section. |
1122 | | void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, |
1123 | 394 | const SIProgramInfo &CurrentProgramInfo) { |
1124 | 394 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1125 | 394 | auto CC = MF.getFunction().getCallingConv(); |
1126 | 394 | auto MD = getTargetStreamer()->getPALMetadata(); |
1127 | 394 | |
1128 | 394 | MD->setEntryPoint(CC, MF.getFunction().getName()); |
1129 | 394 | MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU); |
1130 | 394 | MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU); |
1131 | 394 | if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { |
1132 | 320 | MD->setRsrc1(CC, CurrentProgramInfo.ComputePGMRSrc1); |
1133 | 320 | MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2); |
1134 | 320 | } else { |
1135 | 74 | MD->setRsrc1(CC, S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | |
1136 | 74 | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks)); |
1137 | 74 | if (CurrentProgramInfo.ScratchBlocks > 0) |
1138 | 1 | MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1)); |
1139 | 74 | } |
1140 | 394 | // ScratchSize is in bytes, 16 aligned. |
1141 | 394 | MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16)); |
1142 | 394 | if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { |
1143 | 33 | MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks)); |
1144 | 33 | MD->setSpiPsInputEna(MFI->getPSInputEnable()); |
1145 | 33 | MD->setSpiPsInputAddr(MFI->getPSInputAddr()); |
1146 | 33 | } |
1147 | 394 | |
1148 | 394 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
1149 | 394 | if (STM.isWave32()) |
1150 | 9 | MD->setWave32(MF.getFunction().getCallingConv()); |
1151 | 394 | } |
1152 | | |
1153 | | // This is supposed to be log2(Size) |
1154 | 1.16k | static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { |
1155 | 1.16k | switch (Size) { |
1156 | 1.16k | case 4: |
1157 | 1.15k | return AMD_ELEMENT_4_BYTES; |
1158 | 1.16k | case 8: |
1159 | 5 | return AMD_ELEMENT_8_BYTES; |
1160 | 1.16k | case 16: |
1161 | 5 | return AMD_ELEMENT_16_BYTES; |
1162 | 1.16k | default: |
1163 | 0 | llvm_unreachable("invalid private_element_size"); |
1164 | 1.16k | } |
1165 | 1.16k | } |
1166 | | |
1167 | | void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, |
1168 | | const SIProgramInfo &CurrentProgramInfo, |
1169 | 1.16k | const MachineFunction &MF) const { |
1170 | 1.16k | const Function &F = MF.getFunction(); |
1171 | 1.16k | assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || |
1172 | 1.16k | F.getCallingConv() == CallingConv::SPIR_KERNEL); |
1173 | 1.16k | |
1174 | 1.16k | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1175 | 1.16k | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
1176 | 1.16k | |
1177 | 1.16k | AMDGPU::initDefaultAMDKernelCodeT(Out, &STM); |
1178 | 1.16k | |
1179 | 1.16k | Out.compute_pgm_resource_registers = |
1180 | 1.16k | CurrentProgramInfo.ComputePGMRSrc1 | |
1181 | 1.16k | (CurrentProgramInfo.ComputePGMRSrc2 << 32); |
1182 | 1.16k | Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; |
1183 | 1.16k | |
1184 | 1.16k | if (CurrentProgramInfo.DynamicCallStack) |
1185 | 77 | Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK; |
1186 | 1.16k | |
1187 | 1.16k | AMD_HSA_BITS_SET(Out.code_properties, |
1188 | 1.16k | AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, |
1189 | 1.16k | getElementByteSizeValue(STM.getMaxPrivateElementSize())); |
1190 | 1.16k | |
1191 | 1.16k | if (MFI->hasPrivateSegmentBuffer()) { |
1192 | 1.16k | Out.code_properties |= |
1193 | 1.16k | AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; |
1194 | 1.16k | } |
1195 | 1.16k | |
1196 | 1.16k | if (MFI->hasDispatchPtr()) |
1197 | 33 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; |
1198 | 1.16k | |
1199 | 1.16k | if (MFI->hasQueuePtr()) |
1200 | 56 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; |
1201 | 1.16k | |
1202 | 1.16k | if (MFI->hasKernargSegmentPtr()) |
1203 | 856 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; |
1204 | 1.16k | |
1205 | 1.16k | if (MFI->hasDispatchID()) |
1206 | 6 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; |
1207 | 1.16k | |
1208 | 1.16k | if (MFI->hasFlatScratchInit()) |
1209 | 190 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; |
1210 | 1.16k | |
1211 | 1.16k | if (MFI->hasDispatchPtr()) |
1212 | 33 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; |
1213 | 1.16k | |
1214 | 1.16k | if (STM.isXNACKEnabled()) |
1215 | 185 | Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; |
1216 | 1.16k | |
1217 | 1.16k | unsigned MaxKernArgAlign; |
1218 | 1.16k | Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); |
1219 | 1.16k | Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; |
1220 | 1.16k | Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; |
1221 | 1.16k | Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; |
1222 | 1.16k | Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; |
1223 | 1.16k | |
1224 | 1.16k | // These alignment values are specified in powers of two, so alignment = |
1225 | 1.16k | // 2^n. The minimum alignment is 2^4 = 16. |
1226 | 1.16k | Out.kernarg_segment_alignment = std::max<size_t>(4, |
1227 | 1.16k | countTrailingZeros(MaxKernArgAlign)); |
1228 | 1.16k | } |
1229 | | |
1230 | | bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, |
1231 | 789 | const char *ExtraCode, raw_ostream &O) { |
1232 | 789 | // First try the generic code, which knows about modifiers like 'c' and 'n'. |
1233 | 789 | if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O)) |
1234 | 4 | return false; |
1235 | 785 | |
1236 | 785 | if (ExtraCode && ExtraCode[0]0 ) { |
1237 | 0 | if (ExtraCode[1] != 0) |
1238 | 0 | return true; // Unknown modifier. |
1239 | 0 | |
1240 | 0 | switch (ExtraCode[0]) { |
1241 | 0 | case 'r': |
1242 | 0 | break; |
1243 | 0 | default: |
1244 | 0 | return true; |
1245 | 785 | } |
1246 | 785 | } |
1247 | 785 | |
1248 | 785 | // TODO: Should be able to support other operand types like globals. |
1249 | 785 | const MachineOperand &MO = MI->getOperand(OpNo); |
1250 | 785 | if (MO.isReg()) { |
1251 | 785 | AMDGPUInstPrinter::printRegOperand(MO.getReg(), O, |
1252 | 785 | *MF->getSubtarget().getRegisterInfo()); |
1253 | 785 | return false; |
1254 | 785 | } |
1255 | 0 | |
1256 | 0 | return true; |
1257 | 0 | } |