Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer  -------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file
10
///
11
/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12
/// code.  When passed an MCAsmStreamer it prints assembly and when passed
13
/// an MCObjectStreamer it outputs binary code.
14
//
15
//===----------------------------------------------------------------------===//
16
//
17
18
#include "AMDGPUAsmPrinter.h"
19
#include "AMDGPU.h"
20
#include "AMDGPUSubtarget.h"
21
#include "AMDGPUTargetMachine.h"
22
#include "MCTargetDesc/AMDGPUInstPrinter.h"
23
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
25
#include "R600AsmPrinter.h"
26
#include "R600Defines.h"
27
#include "R600MachineFunctionInfo.h"
28
#include "R600RegisterInfo.h"
29
#include "SIDefines.h"
30
#include "SIInstrInfo.h"
31
#include "SIMachineFunctionInfo.h"
32
#include "SIRegisterInfo.h"
33
#include "TargetInfo/AMDGPUTargetInfo.h"
34
#include "Utils/AMDGPUBaseInfo.h"
35
#include "llvm/BinaryFormat/ELF.h"
36
#include "llvm/CodeGen/MachineFrameInfo.h"
37
#include "llvm/IR/DiagnosticInfo.h"
38
#include "llvm/MC/MCAssembler.h"
39
#include "llvm/MC/MCContext.h"
40
#include "llvm/MC/MCSectionELF.h"
41
#include "llvm/MC/MCStreamer.h"
42
#include "llvm/Support/AMDGPUMetadata.h"
43
#include "llvm/Support/MathExtras.h"
44
#include "llvm/Support/TargetParser.h"
45
#include "llvm/Support/TargetRegistry.h"
46
#include "llvm/Target/TargetLoweringObjectFile.h"
47
48
using namespace llvm;
49
using namespace llvm::AMDGPU;
50
using namespace llvm::AMDGPU::HSAMD;
51
52
// TODO: This should get the default rounding mode from the kernel. We just set
53
// the default here, but this could change if the OpenCL rounding mode pragmas
54
// are used.
55
//
56
// The denormal mode here should match what is reported by the OpenCL runtime
57
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
58
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
59
//
60
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
61
// precision, and leaves single precision to flush all and does not report
62
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
63
// CL_FP_DENORM for both.
64
//
65
// FIXME: It seems some instructions do not support single precision denormals
66
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
67
// and sin_f32, cos_f32 on most parts).
68
69
// We want to use these instructions, and using fp32 denormals also causes
70
// instructions to run at the double precision rate for the device so it's
71
// probably best to just report no single precision denormals.
72
23.1k
static uint32_t getFPMode(const MachineFunction &F) {
73
23.1k
  const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>();
74
23.1k
  // TODO: Is there any real use for the flush in only / flush out only modes?
75
23.1k
76
23.1k
  uint32_t FP32Denormals =
77
23.1k
    ST.hasFP32Denormals() ? 
FP_DENORM_FLUSH_NONE354
:
FP_DENORM_FLUSH_IN_FLUSH_OUT22.8k
;
78
23.1k
79
23.1k
  uint32_t FP64Denormals =
80
23.1k
    ST.hasFP64Denormals() ? 
FP_DENORM_FLUSH_NONE22.7k
:
FP_DENORM_FLUSH_IN_FLUSH_OUT359
;
81
23.1k
82
23.1k
  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
83
23.1k
         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
84
23.1k
         FP_DENORM_MODE_SP(FP32Denormals) |
85
23.1k
         FP_DENORM_MODE_DP(FP64Denormals);
86
23.1k
}
87
88
/// Factory used by the target registry to create the GCN asm printer.
static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  AsmPrinter *Printer = new AMDGPUAsmPrinter(tm, std::move(Streamer));
  return Printer;
}
93
94
78.9k
extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
95
78.9k
  TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
96
78.9k
                                     llvm::createR600AsmPrinterPass);
97
78.9k
  TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
98
78.9k
                                     createAMDGPUAsmPrinterPass);
99
78.9k
}
100
101
// Construct the printer and select the HSA metadata streamer matching the
// code-object ABI version of the global subtarget (v3 uses a different
// metadata format than v2).
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
  : AsmPrinter(TM, std::move(Streamer)) {
    if (!IsaInfo::hasCodeObjectV3(getGlobalSTI()))
      HSAMetadataStream.reset(new MetadataStreamerV2());
    else
      HSAMetadataStream.reset(new MetadataStreamerV3());
}
109
110
25.4k
// Human-readable pass name shown in -debug-pass output.
StringRef AMDGPUAsmPrinter::getPassName() const {
  return "AMDGPU Assembly Printer";
}
113
114
84.5k
// Module-level subtarget info from the TargetMachine (as opposed to the
// per-function subtarget obtained through MachineFunction).
const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
  return TM.getMCSubtargetInfo();
}
117
118
15.2k
// The AMDGPU-specific target streamer, or null when no output streamer is
// attached (callers must handle the null case).
AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
  if (OutStreamer)
    return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
  return nullptr;
}
123
124
2.41k
/// Emit the per-module prologue directives.
///
/// For code object v3 this is the .amdgcn_target directive; for amdhsa/amdpal
/// it additionally starts the HSA metadata stream / reads PAL metadata, and
/// for code object v2 it emits the legacy HSA code-object version/ISA notes.
void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
  if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) {
    std::string ExpectedTarget;
    raw_string_ostream ExpectedTargetOS(ExpectedTarget);
    IsaInfo::streamIsaVersion(getGlobalSTI(), ExpectedTargetOS);

    // Use str() so the stream is flushed into the string before use; the
    // original passed the (possibly still-buffered) backing string directly.
    // This also matches EmitEndOfAsmFile's use of ISAVersionStream.str().
    getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTargetOS.str());
  }

  // Everything below only applies to the amdhsa and amdpal OSes.
  if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
      TM.getTargetTriple().getOS() != Triple::AMDPAL)
    return;

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
    HSAMetadataStream->begin(M);

  if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
    getTargetStreamer()->getPALMetadata()->readFromIR(M);

  // The remaining directives are code-object-v2 only.
  if (IsaInfo::hasCodeObjectV3(getGlobalSTI()))
    return;

  // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
    getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);

  // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
  IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU());
  getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
      Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
}
155
156
2.39k
/// Emit the per-module epilogue: the ISA-version note for pre-v3 code objects
/// and, on amdhsa, the finalized HSA metadata.
void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
  // Following code requires TargetStreamer to be present.
  AMDGPUTargetStreamer *TS = getTargetStreamer();
  if (!TS)
    return;

  if (!IsaInfo::hasCodeObjectV3(getGlobalSTI())) {
    // Emit ISA Version (NT_AMD_AMDGPU_ISA).
    std::string ISAVersionString;
    raw_string_ostream ISAVersionStream(ISAVersionString);
    IsaInfo::streamIsaVersion(getGlobalSTI(), ISAVersionStream);
    TS->EmitISAVersion(ISAVersionStream.str());
  }

  // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    HSAMetadataStream->end();
    bool Success = HSAMetadataStream->emitTo(*TS);
    (void)Success;
    assert(Success && "Malformed HSA Metadata");
  }
}
177
178
bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
179
10.2k
  const MachineBasicBlock *MBB) const {
180
10.2k
  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
181
7.44k
    return false;
182
2.80k
183
2.80k
  if (MBB->empty())
184
51
    return true;
185
2.75k
186
2.75k
  // If this is a block implementing a long branch, an expression relative to
187
2.75k
  // the start of the block is needed.  to the start of the block.
188
2.75k
  // XXX - Is there a smarter way to check this?
189
2.75k
  return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
190
2.75k
}
191
192
25.4k
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
193
25.4k
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
194
25.4k
  if (!MFI.isEntryFunction())
195
2.25k
    return;
196
23.1k
197
23.1k
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
198
23.1k
  const Function &F = MF->getFunction();
199
23.1k
  if (!STM.hasCodeObjectV3() && 
STM.isAmdHsaOrMesa(F)19.8k
&&
200
23.1k
      
(1.17k
F.getCallingConv() == CallingConv::AMDGPU_KERNEL1.17k
||
201
1.17k
       
F.getCallingConv() == CallingConv::SPIR_KERNEL3
)) {
202
1.16k
    amd_kernel_code_t KernelCode;
203
1.16k
    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
204
1.16k
    getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
205
1.16k
  }
206
23.1k
207
23.1k
  if (STM.isAmdHsaOS())
208
4.03k
    HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
209
23.1k
}
210
211
25.4k
void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
212
25.4k
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
213
25.4k
  if (!MFI.isEntryFunction())
214
2.25k
    return;
215
23.1k
216
23.1k
  if (!IsaInfo::hasCodeObjectV3(getGlobalSTI()) ||
217
23.1k
      
TM.getTargetTriple().getOS() != Triple::AMDHSA3.26k
)
218
19.8k
    return;
219
3.26k
220
3.26k
  auto &Streamer = getTargetStreamer()->getStreamer();
221
3.26k
  auto &Context = Streamer.getContext();
222
3.26k
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
223
3.26k
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
224
3.26k
225
3.26k
  Streamer.PushSection();
226
3.26k
  Streamer.SwitchSection(&ReadOnlySection);
227
3.26k
228
3.26k
  // CP microcode requires the kernel descriptor to be allocated on 64 byte
229
3.26k
  // alignment.
230
3.26k
  Streamer.EmitValueToAlignment(64, 0, 1, 0);
231
3.26k
  if (ReadOnlySection.getAlignment() < 64)
232
233
    ReadOnlySection.setAlignment(64);
233
3.26k
234
3.26k
  const MCSubtargetInfo &STI = MF->getSubtarget();
235
3.26k
236
3.26k
  SmallString<128> KernelName;
237
3.26k
  getNameWithPrefix(KernelName, &MF->getFunction());
238
3.26k
  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
239
3.26k
      STI, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
240
3.26k
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
241
3.26k
      CurrentProgramInfo.NumSGPRsForWavesPerEU -
242
3.26k
          IsaInfo::getNumExtraSGPRs(&STI,
243
3.26k
                                    CurrentProgramInfo.VCCUsed,
244
3.26k
                                    CurrentProgramInfo.FlatUsed),
245
3.26k
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
246
3.26k
      hasXNACK(STI));
247
3.26k
248
3.26k
  Streamer.PopSection();
249
3.26k
}
250
251
25.4k
void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
252
25.4k
  if (IsaInfo::hasCodeObjectV3(getGlobalSTI()) &&
253
25.4k
      
TM.getTargetTriple().getOS() == Triple::AMDHSA3.62k
) {
254
3.62k
    AsmPrinter::EmitFunctionEntryLabel();
255
3.62k
    return;
256
3.62k
  }
257
21.7k
258
21.7k
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
259
21.7k
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
260
21.7k
  if (MFI->isEntryFunction() && 
STM.isAmdHsaOrMesa(MF->getFunction())19.8k
) {
261
1.17k
    SmallString<128> SymbolName;
262
1.17k
    getNameWithPrefix(SymbolName, &MF->getFunction()),
263
1.17k
    getTargetStreamer()->EmitAMDGPUSymbolType(
264
1.17k
        SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
265
1.17k
  }
266
21.7k
  if (DumpCodeInstEmitter) {
267
2
    // Disassemble function name label to text.
268
2
    DisasmLines.push_back(MF->getName().str() + ":");
269
2
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
270
2
    HexLines.push_back("");
271
2
  }
272
21.7k
273
21.7k
  AsmPrinter::EmitFunctionEntryLabel();
274
21.7k
}
275
276
28.8k
void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
277
28.8k
  if (DumpCodeInstEmitter && 
!isBlockOnlyReachableByFallthrough(&MBB)2
) {
278
2
    // Write a line for the basic block label if it is not only fallthrough.
279
2
    DisasmLines.push_back(
280
2
        (Twine("BB") + Twine(getFunctionNumber())
281
2
         + "_" + Twine(MBB.getNumber()) + ":").str());
282
2
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
283
2
    HexLines.push_back("");
284
2
  }
285
28.8k
  AsmPrinter::EmitBasicBlockStart(MBB);
286
28.8k
}
287
288
348
void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
289
348
  if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
290
232
    if (GV->hasInitializer() && 
!isa<UndefValue>(GV->getInitializer())224
) {
291
4
      OutContext.reportError({},
292
4
                             Twine(GV->getName()) +
293
4
                                 ": unsupported initializer for address space");
294
4
      return;
295
4
    }
296
228
297
228
    // LDS variables aren't emitted in HSA or PAL yet.
298
228
    const Triple::OSType OS = TM.getTargetTriple().getOS();
299
228
    if (OS == Triple::AMDHSA || 
OS == Triple::AMDPAL161
)
300
73
      return;
301
155
302
155
    MCSymbol *GVSym = getSymbol(GV);
303
155
304
155
    GVSym->redefineIfPossible();
305
155
    if (GVSym->isDefined() || GVSym->isVariable())
306
0
      report_fatal_error("symbol '" + Twine(GVSym->getName()) +
307
0
                         "' is already defined");
308
155
309
155
    const DataLayout &DL = GV->getParent()->getDataLayout();
310
155
    uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
311
155
    unsigned Align = GV->getAlignment();
312
155
    if (!Align)
313
38
      Align = 4;
314
155
315
155
    EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
316
155
    EmitLinkage(GV, GVSym);
317
155
    if (auto TS = getTargetStreamer())
318
154
      TS->emitAMDGPULDS(GVSym, Size, Align);
319
155
    return;
320
155
  }
321
116
322
116
  AsmPrinter::EmitGlobalVariable(GV);
323
116
}
324
325
2.40k
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
  CallGraphResourceInfo.clear();

  // Pad with s_code_end to help tools and guard against instruction prefetch
  // causing stale data in caches. Arguably this should be done by the linker,
  // which is why this isn't done for Mesa.
  const MCSubtargetInfo &STI = *getGlobalSTI();
  const Triple::OSType OS = STI.getTargetTriple().getOS();
  if (AMDGPU::isGFX10(STI) &&
      (OS == Triple::AMDHSA || OS == Triple::AMDPAL)) {
    OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
    getTargetStreamer()->EmitCodeEnd();
  }

  return AsmPrinter::doFinalization(M);
}
341
342
// Print comments that apply to both callable functions and entry points.
343
void AMDGPUAsmPrinter::emitCommonFunctionComments(
344
  uint32_t NumVGPR,
345
  uint32_t NumSGPR,
346
  uint64_t ScratchSize,
347
  uint64_t CodeSize,
348
24.7k
  const AMDGPUMachineFunction *MFI) {
349
24.7k
  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
350
24.7k
  OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
351
24.7k
  OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
352
24.7k
  OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
353
24.7k
  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
354
24.7k
                              false);
355
24.7k
}
356
357
uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
358
3.26k
    const MachineFunction &MF) const {
359
3.26k
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
360
3.26k
  uint16_t KernelCodeProperties = 0;
361
3.26k
362
3.26k
  if (MFI.hasPrivateSegmentBuffer()) {
363
3.26k
    KernelCodeProperties |=
364
3.26k
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
365
3.26k
  }
366
3.26k
  if (MFI.hasDispatchPtr()) {
367
7
    KernelCodeProperties |=
368
7
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
369
7
  }
370
3.26k
  if (MFI.hasQueuePtr()) {
371
4
    KernelCodeProperties |=
372
4
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
373
4
  }
374
3.26k
  if (MFI.hasKernargSegmentPtr()) {
375
2.72k
    KernelCodeProperties |=
376
2.72k
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
377
2.72k
  }
378
3.26k
  if (MFI.hasDispatchID()) {
379
0
    KernelCodeProperties |=
380
0
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
381
0
  }
382
3.26k
  if (MFI.hasFlatScratchInit()) {
383
240
    KernelCodeProperties |=
384
240
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
385
240
  }
386
3.26k
  if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
387
864
    KernelCodeProperties |=
388
864
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
389
864
  }
390
3.26k
391
3.26k
  return KernelCodeProperties;
392
3.26k
}
393
394
/// Assemble the amdhsa kernel descriptor from the computed program info.
/// The descriptor is zero-filled first (memset, so reserved/padding bytes are
/// also zeroed) and then the known fields are populated.
amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
    const MachineFunction &MF,
    const SIProgramInfo &PI) const {
  amdhsa::kernel_descriptor_t KD;
  memset(&KD, 0x0, sizeof(KD));

  // The descriptor fields are 32-bit; the program info must fit.
  assert(isUInt<32>(PI.ScratchSize));
  assert(isUInt<32>(PI.ComputePGMRSrc1));
  assert(isUInt<32>(PI.ComputePGMRSrc2));

  KD.group_segment_fixed_size = PI.LDSSize;
  KD.private_segment_fixed_size = PI.ScratchSize;
  KD.compute_pgm_rsrc1 = PI.ComputePGMRSrc1;
  KD.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
  KD.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);

  return KD;
}
412
413
25.4k
/// Main per-function driver: compute program/resource info, emit the function
/// body plus OS-specific program metadata, verbose resource comments, and the
/// optional -dumpcode disassembly section.
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  CurrentProgramInfo = SIProgramInfo();

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  // (Alignment is expressed as log2: 8 -> 256 bytes, 2 -> 4 bytes.)
  MF.setAlignment(MFI->isEntryFunction() ? 8 : 2);

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(ConfigSection);
  }

  if (MFI->isEntryFunction()) {
    getSIProgramInfo(CurrentProgramInfo, MF);
  } else {
    // Callable functions record their resource usage in the call-graph map
    // instead of a program-info block.
    auto I = CallGraphResourceInfo.insert(
      std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = I.first->second;
    assert(I.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF);
  }

  if (STM.isAmdPalOS())
    EmitPALMetadata(MF, CurrentProgramInfo);
  else if (!STM.isAmdHsaOS())
    EmitProgramInfoSI(MF, CurrentProgramInfo);

  DumpCodeInstEmitter = nullptr;
  if (STM.dumpCode()) {
    // For -dumpcode, get the assembler out of the streamer, even if it does
    // not really want to let us have it. This only works with -filetype=obj.
    bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
    OutStreamer->setUseAssemblerInfoForParsing(true);
    MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
    OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
    if (Assembler)
      DumpCodeInstEmitter = Assembler->getEmitterPtr();
  }

  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  EmitFunctionBody();

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(CommentSection);

    if (!MFI->isEntryFunction()) {
      OutStreamer->emitRawComment(" Function info:", false);
      SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
      emitCommonFunctionComments(
        Info.NumVGPR,
        Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
        Info.PrivateSegmentSize,
        getFunctionCodeSize(MF), MFI);
      return false;
    }

    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
                               CurrentProgramInfo.NumSGPR,
                               CurrentProgramInfo.ScratchSize,
                               getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
      " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
      " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
      " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
      " bytes/workgroup (compile time only)", false);

    OutStreamer->emitRawComment(
      " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
    OutStreamer->emitRawComment(
      " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
      " NumSGPRsForWavesPerEU: " +
      Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
    OutStreamer->emitRawComment(
      " NumVGPRsForWavesPerEU: " +
      Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);

    OutStreamer->emitRawComment(
      " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);

    // Decode the interesting COMPUTE_PGM_RSRC2 fields for the reader.
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:USER_SGPR: " +
      Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
      Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
      Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
      Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
      Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
      Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
      false);
  }

  if (DumpCodeInstEmitter) {

    OutStreamer->SwitchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));

    // Interleave each disassembled line with its hex encoding, padded so the
    // hex column lines up.
    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
      OutStreamer->EmitBytes(StringRef(Comment));
    }
  }

  return false;
}
552
553
24.7k
uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
554
24.7k
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
555
24.7k
  const SIInstrInfo *TII = STM.getInstrInfo();
556
24.7k
557
24.7k
  uint64_t CodeSize = 0;
558
24.7k
559
28.0k
  for (const MachineBasicBlock &MBB : MF) {
560
427k
    for (const MachineInstr &MI : MBB) {
561
427k
      // TODO: CodeSize should account for multiple functions.
562
427k
563
427k
      // TODO: Should we count size of debug info?
564
427k
      if (MI.isDebugInstr())
565
46
        continue;
566
427k
567
427k
      CodeSize += TII->getInstSizeInBytes(MI);
568
427k
    }
569
28.0k
  }
570
24.7k
571
24.7k
  return CodeSize;
572
24.7k
}
573
574
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
575
                                  const SIInstrInfo &TII,
576
17.5k
                                  unsigned Reg) {
577
17.5k
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
578
12.8k
    if (!UseOp.isImplicit() || 
!TII.isFLAT(*UseOp.getParent())12.8k
)
579
55
      return true;
580
12.8k
  }
581
17.5k
582
17.5k
  
return false17.4k
;
583
17.5k
}
584
585
int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
586
2.22k
  const GCNSubtarget &ST) const {
587
2.22k
  return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST,
588
2.22k
                                                     UsesVCC, UsesFlatScratch);
589
2.22k
}
590
591
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
592
25.4k
  const MachineFunction &MF) const {
593
25.4k
  SIFunctionResourceInfo Info;
594
25.4k
595
25.4k
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
596
25.4k
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
597
25.4k
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
598
25.4k
  const MachineRegisterInfo &MRI = MF.getRegInfo();
599
25.4k
  const SIInstrInfo *TII = ST.getInstrInfo();
600
25.4k
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
601
25.4k
602
25.4k
  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
603
25.4k
                         
MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI)19.1k
;
604
25.4k
605
25.4k
  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
606
25.4k
  // instructions aren't used to access the scratch buffer. Inline assembly may
607
25.4k
  // need it though.
608
25.4k
  //
609
25.4k
  // If we only have implicit uses of flat_scr on flat instructions, it is not
610
25.4k
  // really needed.
611
25.4k
  if (Info.UsesFlatScratch && 
!MFI->hasFlatScratchInit()6.30k
&&
612
25.4k
      
(5.87k
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR)5.87k
&&
613
5.87k
       
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO)5.84k
&&
614
5.87k
       
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI)5.83k
)) {
615
5.82k
    Info.UsesFlatScratch = false;
616
5.82k
  }
617
25.4k
618
25.4k
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
619
25.4k
  Info.PrivateSegmentSize = FrameInfo.getStackSize();
620
25.4k
  if (MFI->isStackRealigned())
621
8
    Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();
622
25.4k
623
25.4k
624
25.4k
  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
625
25.4k
                 
MRI.isPhysRegUsed(AMDGPU::VCC_HI)19.6k
;
626
25.4k
627
25.4k
  // If there are no calls, MachineRegisterInfo can tell us the used register
628
25.4k
  // count easily.
629
25.4k
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
630
25.4k
  if (!FrameInfo.hasCalls() && 
!FrameInfo.hasTailCall()24.7k
) {
631
24.7k
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
632
6.24M
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
633
6.24M
      if (MRI.isPhysRegUsed(Reg)) {
634
22.8k
        HighestVGPRReg = Reg;
635
22.8k
        break;
636
22.8k
      }
637
6.22M
      MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg);
638
6.22M
      if (MRI.isPhysRegUsed(AReg)) {
639
21
        HighestVGPRReg = AReg;
640
21
        break;
641
21
      }
642
6.22M
    }
643
24.7k
644
24.7k
    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
645
2.41M
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
646
2.41M
      if (MRI.isPhysRegUsed(Reg)) {
647
22.5k
        HighestSGPRReg = Reg;
648
22.5k
        break;
649
22.5k
      }
650
2.41M
    }
651
24.7k
652
24.7k
    // We found the maximum register index. They start at 0, so add one to get the
653
24.7k
    // number of registers.
654
24.7k
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 
01.83k
:
655
24.7k
      
TRI.getHWRegIndex(HighestVGPRReg) + 122.9k
;
656
24.7k
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 
02.15k
:
657
24.7k
      
TRI.getHWRegIndex(HighestSGPRReg) + 122.5k
;
658
24.7k
659
24.7k
    return Info;
660
24.7k
  }
661
661
662
661
  int32_t MaxVGPR = -1;
663
661
  int32_t MaxSGPR = -1;
664
661
  uint64_t CalleeFrameSize = 0;
665
661
666
716
  for (const MachineBasicBlock &MBB : MF) {
667
12.6k
    for (const MachineInstr &MI : MBB) {
668
12.6k
      // TODO: Check regmasks? Do they occur anywhere except calls?
669
52.0k
      for (const MachineOperand &MO : MI.operands()) {
670
52.0k
        unsigned Width = 0;
671
52.0k
        bool IsSGPR = false;
672
52.0k
673
52.0k
        if (!MO.isReg())
674
16.7k
          continue;
675
35.2k
676
35.2k
        unsigned Reg = MO.getReg();
677
35.2k
        switch (Reg) {
678
35.2k
        case AMDGPU::EXEC:
679
7.15k
        case AMDGPU::EXEC_LO:
680
7.15k
        case AMDGPU::EXEC_HI:
681
7.15k
        case AMDGPU::SCC:
682
7.15k
        case AMDGPU::M0:
683
7.15k
        case AMDGPU::SRC_SHARED_BASE:
684
7.15k
        case AMDGPU::SRC_SHARED_LIMIT:
685
7.15k
        case AMDGPU::SRC_PRIVATE_BASE:
686
7.15k
        case AMDGPU::SRC_PRIVATE_LIMIT:
687
7.15k
        case AMDGPU::SGPR_NULL:
688
7.15k
          continue;
689
7.15k
690
7.15k
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
691
0
          llvm_unreachable("src_pops_exiting_wave_id should not be used");
692
7.15k
693
7.15k
        case AMDGPU::NoRegister:
694
5
          assert(MI.isDebugInstr());
695
5
          continue;
696
7.15k
697
7.15k
        case AMDGPU::VCC:
698
610
        case AMDGPU::VCC_LO:
699
610
        case AMDGPU::VCC_HI:
700
610
          Info.UsesVCC = true;
701
610
          continue;
702
610
703
763
        case AMDGPU::FLAT_SCR:
704
763
        case AMDGPU::FLAT_SCR_LO:
705
763
        case AMDGPU::FLAT_SCR_HI:
706
763
          continue;
707
763
708
763
        case AMDGPU::XNACK_MASK:
709
0
        case AMDGPU::XNACK_MASK_LO:
710
0
        case AMDGPU::XNACK_MASK_HI:
711
0
          llvm_unreachable("xnack_mask registers should not be used");
712
0
713
0
        case AMDGPU::LDS_DIRECT:
714
0
          llvm_unreachable("lds_direct register should not be used");
715
0
716
0
        case AMDGPU::TBA:
717
0
        case AMDGPU::TBA_LO:
718
0
        case AMDGPU::TBA_HI:
719
0
        case AMDGPU::TMA:
720
0
        case AMDGPU::TMA_LO:
721
0
        case AMDGPU::TMA_HI:
722
0
          llvm_unreachable("trap handler registers should not be used");
723
0
724
0
        case AMDGPU::SRC_VCCZ:
725
0
          llvm_unreachable("src_vccz register should not be used");
726
0
727
0
        case AMDGPU::SRC_EXECZ:
728
0
          llvm_unreachable("src_execz register should not be used");
729
0
730
0
        case AMDGPU::SRC_SCC:
731
0
          llvm_unreachable("src_scc register should not be used");
732
0
733
26.7k
        default:
734
26.7k
          break;
735
26.7k
        }
736
26.7k
737
26.7k
        if (AMDGPU::SReg_32RegClass.contains(Reg)) {
738
9.09k
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
739
9.09k
                 "trap handler registers should not be used");
740
9.09k
          IsSGPR = true;
741
9.09k
          Width = 1;
742
17.6k
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
743
9.14k
          IsSGPR = false;
744
9.14k
          Width = 1;
745
9.14k
        } else 
if (8.52k
AMDGPU::AGPR_32RegClass.contains(Reg)8.52k
) {
746
0
          IsSGPR = false;
747
0
          Width = 1;
748
8.52k
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
749
4.53k
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
750
4.53k
                 "trap handler registers should not be used");
751
4.53k
          IsSGPR = true;
752
4.53k
          Width = 2;
753
4.53k
        } else 
if (3.98k
AMDGPU::VReg_64RegClass.contains(Reg)3.98k
) {
754
461
          IsSGPR = false;
755
461
          Width = 2;
756
3.52k
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
757
0
          IsSGPR = false;
758
0
          Width = 2;
759
3.52k
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
760
6
          IsSGPR = false;
761
6
          Width = 3;
762
3.51k
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
763
3
          Width = 3;
764
3.51k
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
765
3.31k
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
766
3.31k
            "trap handler registers should not be used");
767
3.31k
          IsSGPR = true;
768
3.31k
          Width = 4;
769
3.31k
        } else 
if (204
AMDGPU::VReg_128RegClass.contains(Reg)204
) {
770
196
          IsSGPR = false;
771
196
          Width = 4;
772
196
        } else 
if (8
AMDGPU::AReg_128RegClass.contains(Reg)8
) {
773
0
          IsSGPR = false;
774
0
          Width = 4;
775
8
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
776
0
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
777
0
            "trap handler registers should not be used");
778
0
          IsSGPR = true;
779
0
          Width = 8;
780
8
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
781
0
          IsSGPR = false;
782
0
          Width = 8;
783
8
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
784
8
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
785
8
            "trap handler registers should not be used");
786
8
          IsSGPR = true;
787
8
          Width = 16;
788
8
        } else 
if (0
AMDGPU::VReg_512RegClass.contains(Reg)0
) {
789
0
          IsSGPR = false;
790
0
          Width = 16;
791
0
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
792
0
          IsSGPR = false;
793
0
          Width = 16;
794
0
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
795
0
          IsSGPR = true;
796
0
          Width = 32;
797
0
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
798
0
          IsSGPR = false;
799
0
          Width = 32;
800
0
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
801
0
          IsSGPR = false;
802
0
          Width = 32;
803
0
        } else {
804
0
          llvm_unreachable("Unknown register class");
805
0
        }
806
26.7k
        unsigned HWReg = TRI.getHWRegIndex(Reg);
807
26.7k
        int MaxUsed = HWReg + Width - 1;
808
26.7k
        if (IsSGPR) {
809
16.9k
          MaxSGPR = MaxUsed > MaxSGPR ? 
MaxUsed1.34k
:
MaxSGPR15.6k
;
810
16.9k
        } else {
811
9.81k
          MaxVGPR = MaxUsed > MaxVGPR ? 
MaxUsed1.52k
:
MaxVGPR8.29k
;
812
9.81k
        }
813
26.7k
      }
814
12.6k
815
12.6k
      if (MI.isCall()) {
816
694
        // Pseudo used just to encode the underlying global. Is there a better
817
694
        // way to track this?
818
694
819
694
        const MachineOperand *CalleeOp
820
694
          = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
821
694
        const Function *Callee = cast<Function>(CalleeOp->getGlobal());
822
694
        if (Callee->isDeclaration()) {
823
445
          // If this is a call to an external function, we can't do much. Make
824
445
          // conservative guesses.
825
445
826
445
          // 48 SGPRs - vcc, - flat_scr, -xnack
827
445
          int MaxSGPRGuess =
828
445
            47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
829
445
          MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
830
445
          MaxVGPR = std::max(MaxVGPR, 23);
831
445
832
445
          CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
833
445
          Info.UsesVCC = true;
834
445
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
835
445
          Info.HasDynamicallySizedStack = true;
836
445
        } else {
837
249
          // We force CodeGen to run in SCC order, so the callee's register
838
249
          // usage etc. should be the cumulative usage of all callees.
839
249
840
249
          auto I = CallGraphResourceInfo.find(Callee);
841
249
          if (I == CallGraphResourceInfo.end()) {
842
1
            // Avoid crashing on undefined behavior with an illegal call to a
843
1
            // kernel. If a callsite's calling convention doesn't match the
844
1
            // function's, it's undefined behavior. If the callsite calling
845
1
            // convention does match, that would have errored earlier.
846
1
            // FIXME: The verifier shouldn't allow this.
847
1
            if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
848
1
              report_fatal_error("invalid call to entry function");
849
0
850
0
            llvm_unreachable("callee should have been handled before caller");
851
0
          }
852
248
853
248
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
854
248
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
855
248
          CalleeFrameSize
856
248
            = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
857
248
          Info.UsesVCC |= I->second.UsesVCC;
858
248
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
859
248
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
860
248
          Info.HasRecursion |= I->second.HasRecursion;
861
248
        }
862
694
863
694
        
if (693
!Callee->doesNotRecurse()693
)
864
637
          Info.HasRecursion = true;
865
693
      }
866
12.6k
    }
867
716
  }
868
661
869
661
  Info.NumExplicitSGPR = MaxSGPR + 1;
870
660
  Info.NumVGPR = MaxVGPR + 1;
871
660
  Info.PrivateSegmentSize += CalleeFrameSize;
872
660
873
660
  return Info;
874
661
}
875
876
// Computes the SIProgramInfo fields (register counts, scratch/LDS sizes, and
// the packed COMPUTE_PGM_RSRC1/RSRC2 register values) for one machine
// function.  The ordering of the adjustments below matters: the addressable
// SGPR limit is checked both before and after ExtraSGPRs are added, and the
// SGPR-init-bug override must come last.
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  SIFunctionResourceInfo Info = analyzeResourceUsage(MF);

  // Seed ProgInfo directly from the analyzed per-function resource usage.
  ProgInfo.NumVGPR = Info.NumVGPR;
  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
  ProgInfo.VCCUsed = Info.UsesVCC;
  ProgInfo.FlatUsed = Info.UsesFlatScratch;
  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;

  // The hardware scratch-size field is 32 bits; diagnose overflow rather than
  // silently truncating.
  if (!isUInt<32>(ProgInfo.ScratchSize)) {
    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
                                          ProgInfo.ScratchSize, DS_Error);
    MF.getFunction().getContext().diagnose(DiagStackSize);
  }

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
      &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "addressable scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      // Clamp so subsequent math stays within the addressable range.
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR += ExtraSGPRs;

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
  for (auto &Arg : MF.getFunction().args()) {
    // Each argument occupies ceil(bits / 32) 32-bit registers.
    unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32;
    if (Arg.hasAttribute(Attribute::InReg))
      WaveDispatchNumSGPR += NumRegs;
    else
      WaveDispatchNumVGPR += NumRegs;
  }
  ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
  ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  ProgInfo.NumSGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
  ProgInfo.NumVGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));

  // Re-check the limit after ExtraSGPRs on targets where the earlier check
  // was skipped (pre-VI generations or the SGPR init bug).
  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
    }
  }

  // Hardware workaround: affected targets must always program the fixed SGPR
  // count, overriding everything computed above.
  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
    ProgInfo.NumSGPRsForWavesPerEU =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
                                     MFI->getLDSSize(), DS_Error);
    Ctx.diagnose(Diag);
  }

  // Convert register counts to the block granularity the hardware expects.
  ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
      &STM, ProgInfo.NumSGPRsForWavesPerEU);
  ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
      &STM, ProgInfo.NumVGPRsForWavesPerEU);

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(MF);

  const SIModeRegisterDefaults Mode = MFI->getMode();
  ProgInfo.IEEEMode = Mode.IEEE;

  // Make clamp modifier on NaN input returns 0.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;

  unsigned LDSAlignShift;
  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  unsigned LDSSpillSize =
    MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();

  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // Scratch is allocated in 256 dword blocks.
  unsigned ScratchAlignShift = 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks =
      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
              1ULL << ScratchAlignShift) >>
      ScratchAlignShift;

  // gfx10+ adds WGP-vs-CU mode and memory-ordered dispatch controls.
  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
    ProgInfo.MemOrdered = 1;
  }

  // Pack the computed fields into the COMPUTE_PGM_RSRC1 register image.
  ProgInfo.ComputePGMRSrc1 =
      S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
      S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
      S_00B848_PRIORITY(ProgInfo.Priority) |
      S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
      S_00B848_PRIV(ProgInfo.Priv) |
      S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
      S_00B848_IEEE_MODE(ProgInfo.IEEEMode) |
      S_00B848_WGP_MODE(ProgInfo.WgpMode) |
      S_00B848_MEM_ORDERED(ProgInfo.MemOrdered);

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  ProgInfo.ComputePGMRSrc2 =
      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
      // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
      S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
      S_00B84C_EXCP_EN_MSB(0) |
      // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
      S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
      S_00B84C_EXCP_EN(0);
}
1061
1062
20.4k
// Returns the SPI/compute PGM_RSRC1 register address associated with the
// given calling convention.  Anything that is not a graphics shader stage
// (including AMDGPU_CS and all non-AMDGPU conventions) maps to the compute
// register.
static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  case CallingConv::AMDGPU_LS:
    return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
  case CallingConv::AMDGPU_HS:
    return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
  case CallingConv::AMDGPU_ES:
    return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
  case CallingConv::AMDGPU_GS:
    return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_VS:
    return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  case CallingConv::AMDGPU_PS:
    return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  case CallingConv::AMDGPU_CS:
  default: // All other conventions are treated as compute.
    return R_00B848_COMPUTE_PGM_RSRC1;
  }
}
1074
1075
// Emits the SI program-info register settings for non-HSA targets as a flat
// stream of (register address, value) 4-byte pairs on OutStreamer.  The
// pairing and ordering of the EmitIntValue calls is the binary format the
// consumer parses, so it must not be reordered.
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());

  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    // Compute: emit both RSRC1 and RSRC2 plus the scratch ring size.
    OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);

    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);

    OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);

    OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    // Graphics shader stage: only the stage's RSRC1 (VGPR/SGPR blocks) and
    // the SPI scratch ring size are emitted here.
    OutStreamer->EmitIntValue(RsrcReg, 4);
    OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
    OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
    OutStreamer->EmitIntValue(
        S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
  }

  // Pixel shaders additionally report extra LDS size and the PS input masks.
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
    OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
    OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
    OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
    OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
  }

  // Spill statistics are emitted for every calling convention.
  OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
  OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
}
1116
1117
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL.  It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, the PAL metadata
// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto CC = MF.getFunction().getCallingConv();
  auto MD = getTargetStreamer()->getPALMetadata();

  MD->setEntryPoint(CC, MF.getFunction().getName());
  MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
  MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    // Compute: record the fully packed RSRC1/RSRC2 register images.
    MD->setRsrc1(CC, CurrentProgramInfo.ComputePGMRSrc1);
    MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2);
  } else {
    // Graphics: RSRC1 carries only the VGPR/SGPR block counts; RSRC2 is only
    // written when scratch is actually in use.
    MD->setRsrc1(CC, S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
        S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks));
    if (CurrentProgramInfo.ScratchBlocks > 0)
      MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
  }
  // ScratchSize is in bytes, 16 aligned.
  MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    // Pixel shaders also record extra LDS size and their input masks.
    MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
    MD->setSpiPsInputEna(MFI->getPSInputEnable());
    MD->setSpiPsInputAddr(MFI->getPSInputAddr());
  }

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  if (STM.isWave32())
    MD->setWave32(MF.getFunction().getCallingConv());
}
1152
1153
// This is supposed to be log2(Size)
1154
1.16k
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
1155
1.16k
  switch (Size) {
1156
1.16k
  case 4:
1157
1.15k
    return AMD_ELEMENT_4_BYTES;
1158
1.16k
  case 8:
1159
5
    return AMD_ELEMENT_8_BYTES;
1160
1.16k
  case 16:
1161
5
    return AMD_ELEMENT_16_BYTES;
1162
1.16k
  default:
1163
0
    llvm_unreachable("invalid private_element_size");
1164
1.16k
  }
1165
1.16k
}
1166
1167
void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1168
                                        const SIProgramInfo &CurrentProgramInfo,
1169
1.16k
                                        const MachineFunction &MF) const {
1170
1.16k
  const Function &F = MF.getFunction();
1171
1.16k
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1172
1.16k
         F.getCallingConv() == CallingConv::SPIR_KERNEL);
1173
1.16k
1174
1.16k
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1175
1.16k
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1176
1.16k
1177
1.16k
  AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
1178
1.16k
1179
1.16k
  Out.compute_pgm_resource_registers =
1180
1.16k
      CurrentProgramInfo.ComputePGMRSrc1 |
1181
1.16k
      (CurrentProgramInfo.ComputePGMRSrc2 << 32);
1182
1.16k
  Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
1183
1.16k
1184
1.16k
  if (CurrentProgramInfo.DynamicCallStack)
1185
77
    Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
1186
1.16k
1187
1.16k
  AMD_HSA_BITS_SET(Out.code_properties,
1188
1.16k
                   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1189
1.16k
                   getElementByteSizeValue(STM.getMaxPrivateElementSize()));
1190
1.16k
1191
1.16k
  if (MFI->hasPrivateSegmentBuffer()) {
1192
1.16k
    Out.code_properties |=
1193
1.16k
      AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
1194
1.16k
  }
1195
1.16k
1196
1.16k
  if (MFI->hasDispatchPtr())
1197
33
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1198
1.16k
1199
1.16k
  if (MFI->hasQueuePtr())
1200
56
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
1201
1.16k
1202
1.16k
  if (MFI->hasKernargSegmentPtr())
1203
856
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1204
1.16k
1205
1.16k
  if (MFI->hasDispatchID())
1206
6
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1207
1.16k
1208
1.16k
  if (MFI->hasFlatScratchInit())
1209
190
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1210
1.16k
1211
1.16k
  if (MFI->hasDispatchPtr())
1212
33
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1213
1.16k
1214
1.16k
  if (STM.isXNACKEnabled())
1215
185
    Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1216
1.16k
1217
1.16k
  unsigned MaxKernArgAlign;
1218
1.16k
  Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1219
1.16k
  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1220
1.16k
  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1221
1.16k
  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1222
1.16k
  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1223
1.16k
1224
1.16k
  // These alignment values are specified in powers of two, so alignment =
1225
1.16k
  // 2^n.  The minimum alignment is 2^4 = 16.
1226
1.16k
  Out.kernarg_segment_alignment = std::max<size_t>(4,
1227
1.16k
      countTrailingZeros(MaxKernArgAlign));
1228
1.16k
}
1229
1230
// Prints one inline-asm operand to O.  Per the AsmPrinter convention the
// return value is inverted: false means the operand was printed successfully,
// true means the operand/modifier is unsupported.
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       const char *ExtraCode, raw_ostream &O) {
  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
    return false;

  if (ExtraCode && ExtraCode[0]) {
    // Only single-character modifiers are understood here.
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    case 'r':
      // 'r' (register) is handled the same as the unmodified case below.
      break;
    default:
      return true;
    }
  }

  // TODO: Should be able to support other operand types like globals.
  const MachineOperand &MO = MI->getOperand(OpNo);
  if (MO.isReg()) {
    AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
                                       *MF->getSubtarget().getRegisterInfo());
    return false;
  }

  // Non-register operands are not supported by this target hook.
  return true;
}