Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
 Count|Source
      |//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
      |//
      |// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      |// See https://llvm.org/LICENSE.txt for license information.
      |// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      |//
      |//===----------------------------------------------------------------------===//
      |//
      |/// \file
      |/// The AMDGPU target machine contains all of the hardware specific
      |/// information  needed to emit code for R600 and SI GPUs.
      |//
      |//===----------------------------------------------------------------------===//
      |
      |#include "AMDGPUTargetMachine.h"
      |#include "AMDGPU.h"
      |#include "AMDGPUAliasAnalysis.h"
      |#include "AMDGPUCallLowering.h"
      |#include "AMDGPUInstructionSelector.h"
      |#include "AMDGPULegalizerInfo.h"
      |#include "AMDGPUMacroFusion.h"
      |#include "AMDGPUTargetObjectFile.h"
      |#include "AMDGPUTargetTransformInfo.h"
      |#include "GCNIterativeScheduler.h"
      |#include "GCNSchedStrategy.h"
      |#include "R600MachineScheduler.h"
      |#include "SIMachineFunctionInfo.h"
      |#include "SIMachineScheduler.h"
      |#include "TargetInfo/AMDGPUTargetInfo.h"
      |#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
      |#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
      |#include "llvm/CodeGen/GlobalISel/Legalizer.h"
      |#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
      |#include "llvm/CodeGen/MIRParser/MIParser.h"
      |#include "llvm/CodeGen/Passes.h"
      |#include "llvm/CodeGen/TargetPassConfig.h"
      |#include "llvm/IR/Attributes.h"
      |#include "llvm/IR/Function.h"
      |#include "llvm/IR/LegacyPassManager.h"
      |#include "llvm/Pass.h"
      |#include "llvm/Support/CommandLine.h"
      |#include "llvm/Support/Compiler.h"
      |#include "llvm/Support/TargetRegistry.h"
      |#include "llvm/Target/TargetLoweringObjectFile.h"
      |#include "llvm/Transforms/IPO.h"
      |#include "llvm/Transforms/IPO/AlwaysInliner.h"
      |#include "llvm/Transforms/IPO/PassManagerBuilder.h"
      |#include "llvm/Transforms/Scalar.h"
      |#include "llvm/Transforms/Scalar/GVN.h"
      |#include "llvm/Transforms/Utils.h"
      |#include "llvm/Transforms/Vectorize.h"
      |#include <memory>
      |
      |using namespace llvm;
      |
      |static cl::opt<bool> EnableR600StructurizeCFG(
      |  "r600-ir-structurize",
      |  cl::desc("Use StructurizeCFG IR pass"),
      |  cl::init(true));
      |
      |static cl::opt<bool> EnableSROA(
      |  "amdgpu-sroa",
      |  cl::desc("Run SROA after promote alloca pass"),
      |  cl::ReallyHidden,
      |  cl::init(true));
      |
      |static cl::opt<bool>
      |EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
      |                        cl::desc("Run early if-conversion"),
      |                        cl::init(false));
      |
      |static cl::opt<bool>
      |OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
      |            cl::desc("Run pre-RA exec mask optimizations"),
      |            cl::init(true));
      |
      |static cl::opt<bool> EnableR600IfConvert(
      |  "r600-if-convert",
      |  cl::desc("Use if conversion pass"),
      |  cl::ReallyHidden,
      |  cl::init(true));
      |
      |// Option to disable vectorizer for tests.
      |static cl::opt<bool> EnableLoadStoreVectorizer(
      |  "amdgpu-load-store-vectorizer",
      |  cl::desc("Enable load store vectorizer"),
      |  cl::init(true),
      |  cl::Hidden);
      |
      |// Option to control global loads scalarization
      |static cl::opt<bool> ScalarizeGlobal(
      |  "amdgpu-scalarize-global-loads",
      |  cl::desc("Enable global load scalarization"),
      |  cl::init(true),
      |  cl::Hidden);
      |
      |// Option to run internalize pass.
      |static cl::opt<bool> InternalizeSymbols(
      |  "amdgpu-internalize-symbols",
      |  cl::desc("Enable elimination of non-kernel functions and unused globals"),
      |  cl::init(false),
      |  cl::Hidden);
      |
      |// Option to inline all early.
      |static cl::opt<bool> EarlyInlineAll(
      |  "amdgpu-early-inline-all",
      |  cl::desc("Inline all functions early"),
      |  cl::init(false),
      |  cl::Hidden);
      |
      |static cl::opt<bool> EnableSDWAPeephole(
      |  "amdgpu-sdwa-peephole",
      |  cl::desc("Enable SDWA peepholer"),
      |  cl::init(true));
      |
      |static cl::opt<bool> EnableDPPCombine(
      |  "amdgpu-dpp-combine",
      |  cl::desc("Enable DPP combiner"),
      |  cl::init(true));
      |
      |// Enable address space based alias analysis
      |static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
      |  cl::desc("Enable AMDGPU Alias Analysis"),
      |  cl::init(true));
      |
      |// Option to run late CFG structurizer
      |static cl::opt<bool, true> LateCFGStructurize(
      |  "amdgpu-late-structurize",
      |  cl::desc("Enable late CFG structurization"),
      |  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
      |  cl::Hidden);
      |
      |static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
      |  "amdgpu-function-calls",
      |  cl::desc("Enable AMDGPU function call support"),
      |  cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
      |  cl::init(true),
      |  cl::Hidden);
      |
      |// Enable lib calls simplifications
      |static cl::opt<bool> EnableLibCallSimplify(
      |  "amdgpu-simplify-libcall",
      |  cl::desc("Enable amdgpu library simplifications"),
      |  cl::init(true),
      |  cl::Hidden);
      |
      |static cl::opt<bool> EnableLowerKernelArguments(
      |  "amdgpu-ir-lower-kernel-arguments",
      |  cl::desc("Lower kernel argument loads in IR pass"),
      |  cl::init(true),
      |  cl::Hidden);
      |
      |static cl::opt<bool> EnableRegReassign(
      |  "amdgpu-reassign-regs",
      |  cl::desc("Enable register reassign optimizations on gfx10+"),
      |  cl::init(true),
      |  cl::Hidden);
      |
      |// Enable atomic optimization
      |static cl::opt<bool> EnableAtomicOptimizations(
      |  "amdgpu-atomic-optimizations",
      |  cl::desc("Enable atomic optimizations"),
      |  cl::init(false),
      |  cl::Hidden);
      |
      |// Enable Mode register optimization
      |static cl::opt<bool> EnableSIModeRegisterPass(
      |  "amdgpu-mode-register",
      |  cl::desc("Enable mode register pass"),
      |  cl::init(true),
      |  cl::Hidden);
      |
      |// Option is used in lit tests to prevent deadcoding of patterns inspected.
      |static cl::opt<bool>
      |EnableDCEInRA("amdgpu-dce-in-ra",
      |    cl::init(true), cl::Hidden,
      |    cl::desc("Enable machine DCE inside regalloc"));
      |
      |static cl::opt<bool> EnableScalarIRPasses(
      |  "amdgpu-scalar-ir-passes",
      |  cl::desc("Enable scalar IR passes"),
      |  cl::init(true),
      |  cl::Hidden);
      |
  139k|extern "C" void LLVMInitializeAMDGPUTarget() {
  139k|  // Register the target
  139k|  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  139k|  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
  139k|
  139k|  PassRegistry *PR = PassRegistry::getPassRegistry();
  139k|  initializeR600ClauseMergePassPass(*PR);
  139k|  initializeR600ControlFlowFinalizerPass(*PR);
  139k|  initializeR600PacketizerPass(*PR);
  139k|  initializeR600ExpandSpecialInstrsPassPass(*PR);
  139k|  initializeR600VectorRegMergerPass(*PR);
  139k|  initializeGlobalISel(*PR);
  139k|  initializeAMDGPUDAGToDAGISelPass(*PR);
  139k|  initializeGCNDPPCombinePass(*PR);
  139k|  initializeSILowerI1CopiesPass(*PR);
  139k|  initializeSILowerSGPRSpillsPass(*PR);
  139k|  initializeSIFixSGPRCopiesPass(*PR);
  139k|  initializeSIFixVGPRCopiesPass(*PR);
  139k|  initializeSIFixupVectorISelPass(*PR);
  139k|  initializeSIFoldOperandsPass(*PR);
  139k|  initializeSIPeepholeSDWAPass(*PR);
  139k|  initializeSIShrinkInstructionsPass(*PR);
  139k|  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  139k|  initializeSILoadStoreOptimizerPass(*PR);
  139k|  initializeAMDGPUFixFunctionBitcastsPass(*PR);
  139k|  initializeAMDGPUAlwaysInlinePass(*PR);
  139k|  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  139k|  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  139k|  initializeAMDGPUArgumentUsageInfoPass(*PR);
  139k|  initializeAMDGPUAtomicOptimizerPass(*PR);
  139k|  initializeAMDGPULowerKernelArgumentsPass(*PR);
  139k|  initializeAMDGPULowerKernelAttributesPass(*PR);
  139k|  initializeAMDGPULowerIntrinsicsPass(*PR);
  139k|  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  139k|  initializeAMDGPUPromoteAllocaPass(*PR);
  139k|  initializeAMDGPUCodeGenPreparePass(*PR);
  139k|  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  139k|  initializeAMDGPUPropagateAttributesLatePass(*PR);
  139k|  initializeAMDGPURewriteOutArgumentsPass(*PR);
  139k|  initializeAMDGPUUnifyMetadataPass(*PR);
  139k|  initializeSIAnnotateControlFlowPass(*PR);
  139k|  initializeSIInsertWaitcntsPass(*PR);
  139k|  initializeSIModeRegisterPass(*PR);
  139k|  initializeSIWholeQuadModePass(*PR);
  139k|  initializeSILowerControlFlowPass(*PR);
  139k|  initializeSIInsertSkipsPass(*PR);
  139k|  initializeSIMemoryLegalizerPass(*PR);
  139k|  initializeSIOptimizeExecMaskingPass(*PR);
  139k|  initializeSIPreAllocateWWMRegsPass(*PR);
  139k|  initializeSIFormMemoryClausesPass(*PR);
  139k|  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  139k|  initializeAMDGPUAAWrapperPassPass(*PR);
  139k|  initializeAMDGPUExternalAAWrapperPass(*PR);
  139k|  initializeAMDGPUUseNativeCallsPass(*PR);
  139k|  initializeAMDGPUSimplifyLibCallsPass(*PR);
  139k|  initializeAMDGPUInlinerPass(*PR);
  139k|  initializeGCNRegBankReassignPass(*PR);
  139k|  initializeGCNNSAReassignPass(*PR);
  139k|}
      |
 3.93k|static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
 3.93k|  return llvm::make_unique<AMDGPUTargetObjectFile>();
 3.93k|}
      |
 2.29k|static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
 2.29k|  return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
 2.29k|}
      |
     4|static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
     4|  return new SIScheduleDAGMI(C);
     4|}
      |
      |static ScheduleDAGInstrs *
 25.1k|createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
 25.1k|  ScheduleDAGMILive *DAG =
 25.1k|    new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
 25.1k|  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
 25.1k|  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
 25.1k|  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
 25.1k|  return DAG;
 25.1k|}
      |
      |static ScheduleDAGInstrs *
     3|createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
     3|  auto DAG = new GCNIterativeScheduler(C,
     3|    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
     3|  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     3|  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     3|  return DAG;
     3|}
      |
     3|static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
     3|  return new GCNIterativeScheduler(C,
     3|    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
     3|}
      |
      |static ScheduleDAGInstrs *
     2|createIterativeILPMachineScheduler(MachineSchedContext *C) {
     2|  auto DAG = new GCNIterativeScheduler(C,
     2|    GCNIterativeScheduler::SCHEDULE_ILP);
     2|  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     2|  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     2|  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
     2|  return DAG;
     2|}
      |
      |static MachineSchedRegistry
      |R600SchedRegistry("r600", "Run R600's custom scheduler",
      |                   createR600MachineScheduler);
      |
      |static MachineSchedRegistry
      |SISchedRegistry("si", "Run SI's custom scheduler",
      |                createSIMachineScheduler);
      |
      |static MachineSchedRegistry
      |GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
      |                             "Run GCN scheduler to maximize occupancy",
      |                             createGCNMaxOccupancyMachineScheduler);
      |
      |static MachineSchedRegistry
      |IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
      |  "Run GCN scheduler to maximize occupancy (experimental)",
      |  createIterativeGCNMaxOccupancyMachineScheduler);
      |
      |static MachineSchedRegistry
      |GCNMinRegSchedRegistry("gcn-minreg",
      |  "Run GCN iterative scheduler for minimal register usage (experimental)",
      |  createMinRegScheduler);
      |
      |static MachineSchedRegistry
      |GCNILPSchedRegistry("gcn-ilp",
      |  "Run GCN iterative scheduler for ILP scheduling (experimental)",
      |  createIterativeILPMachineScheduler);
      |
 3.93k|static StringRef computeDataLayout(const Triple &TT) {
 3.93k|  if (TT.getArch() == Triple::r600) {
   295|    // 32-bit pointers.
   295|      return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
   295|             "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
   295|  }
 3.63k|
 3.63k|  // 32-bit private, local, and region pointers. 64-bit global, constant and
 3.63k|  // flat, non-integral buffer fat pointers.
 3.63k|    return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
 3.63k|         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
 3.63k|         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 3.63k|         "-ni:7";
 3.63k|}
      |
      |LLVM_READNONE
 3.93k|static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
 3.93k|  if (!GPU.empty())
 3.00k|    return GPU;
   927|
   927|  // Need to default to a target with flat support for HSA.
   927|  if (TT.getArch() == Triple::amdgcn)
   890|    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
    37|
    37|  return "r600";
    37|}
      |
 3.93k|static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
 3.93k|  // The AMDGPU toolchain only supports generating shared objects, so we
 3.93k|  // must always use PIC.
 3.93k|  return Reloc::PIC_;
 3.93k|}
      |
      |AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
      |                                         StringRef CPU, StringRef FS,
      |                                         TargetOptions Options,
      |                                         Optional<Reloc::Model> RM,
      |                                         Optional<CodeModel::Model> CM,
      |                                         CodeGenOpt::Level OptLevel)
      |    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
      |                        FS, Options, getEffectiveRelocModel(RM),
      |                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
 3.93k|      TLOF(createTLOF(getTargetTriple())) {
 3.93k|  initAsmInfo();
 3.93k|}
      |
      |bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
      |bool AMDGPUTargetMachine::EnableFunctionCalls = false;
      |
 3.89k|AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
      |
 1.72M|StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
 1.72M|  Attribute GPUAttr = F.getFnAttribute("target-cpu");
 1.72M|  return GPUAttr.hasAttribute(Attribute::None) ?
 1.47M|    getTargetCPU() : GPUAttr.getValueAsString();
 1.72M|}
      |
 1.72M|StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
 1.72M|  Attribute FSAttr = F.getFnAttribute("target-features");
 1.72M|
 1.72M|  return FSAttr.hasAttribute(Attribute::None) ?
 1.08M|    getTargetFeatureString() :
 1.72M|    FSAttr.getValueAsString();
 1.72M|}
      |
      |/// Predicate for Internalize pass.
     7|static bool mustPreserveGV(const GlobalValue &GV) {
     7|  if (const Function *F = dyn_cast<Function>(&GV))
     5|    return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
     2|
     2|  return !GV.use_empty();
     2|}
      |
   166|void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
   166|  Builder.DivergentTarget = true;
   166|
   166|  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
   166|  bool Internalize = InternalizeSymbols;
   166|  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
   166|  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
   166|  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
   166|
   166|  if (EnableFunctionCalls) {
   157|    delete Builder.Inliner;
   157|    Builder.Inliner = createAMDGPUFunctionInliningPass();
   157|  }
   166|
   166|  Builder.addExtension(
   166|    PassManagerBuilder::EP_ModuleOptimizerEarly,
   166|    [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
   166|                                               legacy::PassManagerBase &PM) {
    97|      if (AMDGPUAA) {
    97|        PM.add(createAMDGPUAAWrapperPass());
    97|        PM.add(createAMDGPUExternalAAWrapperPass());
    97|      }
    97|      PM.add(createAMDGPUUnifyMetadataPass());
    97|      PM.add(createAMDGPUPropagateAttributesLatePass(this));
    97|      if (Internalize) {
     1|        PM.add(createInternalizePass(mustPreserveGV));
     1|        PM.add(createGlobalDCEPass());
     1|      }
    97|      if (EarlyInline)
     0|        PM.add(createAMDGPUAlwaysInlinePass(false));
    97|  });
   166|
   166|  const auto &Opt = Options;
   166|  Builder.addExtension(
   166|    PassManagerBuilder::EP_EarlyAsPossible,
   166|    [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
   166|                                            legacy::PassManagerBase &PM) {
   166|      if (AMDGPUAA) {
    97|        PM.add(createAMDGPUAAWrapperPass());
    97|        PM.add(createAMDGPUExternalAAWrapperPass());
    97|      }
   166|      PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
   166|      PM.add(llvm::createAMDGPUUseNativeCallsPass());
   166|      if (LibCallSimplify)
    97|        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
   166|  });
   166|
   166|  Builder.addExtension(
   166|    PassManagerBuilder::EP_CGSCCOptimizerLate,
   166|    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
    97|      // Add infer address spaces pass to the opt pipeline after inlining
    97|      // but before SROA to increase SROA opportunities.
    97|      PM.add(createInferAddressSpacesPass());
    97|
    97|      // This should run after inlining to have any chance of doing anything,
    97|      // and before other cleanup optimizations.
    97|      PM.add(createAMDGPULowerKernelAttributesPass());
    97|  });
   166|}
      |
      |//===----------------------------------------------------------------------===//
      |// R600 Target Machine (R600 -> Cayman)
      |//===----------------------------------------------------------------------===//
      |
      |R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
      |                                     StringRef CPU, StringRef FS,
      |                                     TargetOptions Options,
      |                                     Optional<Reloc::Model> RM,
      |                                     Optional<CodeModel::Model> CM,
      |                                     CodeGenOpt::Level OL, bool JIT)
   295|    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
   295|  setRequiresStructuredCFG(true);
   295|
   295|  // Override the default since calls aren't supported for r600.
   295|  if (EnableFunctionCalls &&
   295|      EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
   294|    EnableFunctionCalls = false;
   295|}
      |
      |const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  116k|  const Function &F) const {
  116k|  StringRef GPU = getGPUName(F);
  116k|  StringRef FS = getFeatureString(F);
  116k|
  116k|  SmallString<128> SubtargetKey(GPU);
  116k|  SubtargetKey.append(FS);
  116k|
  116k|  auto &I = SubtargetMap[SubtargetKey];
  116k|  if (!I) {
   290|    // This needs to be done before we create a new subtarget since any
   290|    // creation will depend on the TM and the code generation flags on the
   290|    // function that reside in TargetOptions.
   290|    resetTargetOptions(F);
   290|    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
   290|  }
  116k|
  116k|  return I.get();
  116k|}
      |
      |TargetTransformInfo
 37.9k|R600TargetMachine::getTargetTransformInfo(const Function &F) {
 37.9k|  return TargetTransformInfo(R600TTIImpl(this, F));
 37.9k|}
      |
      |//===----------------------------------------------------------------------===//
      |// GCN Target Machine (SI+)
      |//===----------------------------------------------------------------------===//
      |
      |GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
      |                                   StringRef CPU, StringRef FS,
      |                                   TargetOptions Options,
      |                                   Optional<Reloc::Model> RM,
      |                                   Optional<CodeModel::Model> CM,
      |                                   CodeGenOpt::Level OL, bool JIT)
 3.63k|    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
      |
 1.61M|const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
 1.61M|  StringRef GPU = getGPUName(F);
 1.61M|  StringRef FS = getFeatureString(F);
 1.61M|
 1.61M|  SmallString<128> SubtargetKey(GPU);
 1.61M|  SubtargetKey.append(FS);
 1.61M|
 1.61M|  auto &I = SubtargetMap[SubtargetKey];
 1.61M|  if (!I) {
 3.64k|    // This needs to be done before we create a new subtarget since any
 3.64k|    // creation will depend on the TM and the code generation flags on the
 3.64k|    // function that reside in TargetOptions.
 3.64k|    resetTargetOptions(F);
 3.64k|    I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
 3.64k|  }
 1.61M|
 1.61M|  I->setScalarizeGlobalBehavior(ScalarizeGlobal);
 1.61M|
 1.61M|  return I.get();
 1.61M|}
      |
      |TargetTransformInfo
  513k|GCNTargetMachine::getTargetTransformInfo(const Function &F) {
  513k|  return TargetTransformInfo(GCNTTIImpl(this, F));
  513k|}
      |
      |//===----------------------------------------------------------------------===//
      |// AMDGPU Pass Setup
      |//===----------------------------------------------------------------------===//
      |
      |namespace {
      |
      |class AMDGPUPassConfig : public TargetPassConfig {
      |public:
      |  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
 3.75k|    : TargetPassConfig(TM, PM) {
 3.75k|    // Exceptions and StackMaps are not supported, so these passes will never do
 3.75k|    // anything.
 3.75k|    disablePass(&StackMapLivenessID);
 3.75k|    disablePass(&FuncletLayoutID);
 3.75k|  }
      |
 5.40k|  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
 5.40k|    return getTM<AMDGPUTargetMachine>();
 5.40k|  }
      |
      |  ScheduleDAGInstrs *
     0|  createMachineScheduler(MachineSchedContext *C) const override {
     0|    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
     0|    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     0|    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     0|    return DAG;
     0|  }
      |
      |  void addEarlyCSEOrGVNPass();
      |  void addStraightLineScalarOptimizationPasses();
      |  void addIRPasses() override;
      |  void addCodeGenPrepare() override;
      |  bool addPreISel() override;
      |  bool addInstSelector() override;
      |  bool addGCPasses() override;
      |
      |  std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
      |};
      |
 2.64k|std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
 2.64k|  return getStandardCSEConfigForOpt(TM->getOptLevel());
 2.64k|}
      |
      |class R600PassConfig final : public AMDGPUPassConfig {
      |public:
      |  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
   287|    : AMDGPUPassConfig(TM, PM) {}
      |
      |  ScheduleDAGInstrs *createMachineScheduler(
 2.29k|    MachineSchedContext *C) const override {
 2.29k|    return createR600MachineScheduler(C);
 2.29k|  }
      |
      |  bool addPreISel() override;
      |  bool addInstSelector() override;
      |  void addPreRegAlloc() override;
      |  void addPreSched2() override;
      |  void addPreEmitPass() override;
      |};
      |
      |class GCNPassConfig final : public AMDGPUPassConfig {
      |public:
      |  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
 3.46k|    : AMDGPUPassConfig(TM, PM) {
 3.46k|    // It is necessary to know the register usage of the entire call graph.  We
 3.46k|    // allow calls without EnableAMDGPUFunctionCalls if they are marked
 3.46k|    // noinline, so this is always required.
 3.46k|    setRequiresCodeGenSCCOrder(true);
 3.46k|  }
      |
     0|  GCNTargetMachine &getGCNTargetMachine() const {
     0|    return getTM<GCNTargetMachine>();
     0|  }
      |
      |  ScheduleDAGInstrs *
      |  createMachineScheduler(MachineSchedContext *C) const override;
      |
      |  bool addPreISel() override;
      |  void addMachineSSAOptimization() override;
      |  bool addILPOpts() override;
      |  bool addInstSelector() override;
      |  bool addIRTranslator() override;
      |  bool addLegalizeMachineIR() override;
      |  bool addRegBankSelect() override;
      |  bool addGlobalInstructionSelect() override;
      |  void addFastRegAlloc() override;
      |  void addOptimizedRegAlloc() override;
      |  void addPreRegAlloc() override;
      |  bool addPreRewrite() override;
      |  void addPostRegAlloc() override;
      |  void addPreSched2() override;
      |  void addPreEmitPass() override;
      |};
      |
      |} // end anonymous namespace
      |
 5.33k|void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
 5.33k|  if (getOptLevel() == CodeGenOpt::Aggressive)
     0|    addPass(createGVNPass());
 5.33k|  else
 5.33k|    addPass(createEarlyCSEPass());
 5.33k|}
      |
 2.66k|void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
 2.66k|  addPass(createLICMPass());
 2.66k|  addPass(createSeparateConstOffsetFromGEPPass());
 2.66k|  addPass(createSpeculativeExecutionPass());
 2.66k|  // ReassociateGEPs exposes more opportunites for SLSR. See
 2.66k|  // the example in reassociate-geps-and-slsr.ll.
 2.66k|  addPass(createStraightLineStrengthReducePass());
 2.66k|  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
 2.66k|  // EarlyCSE can reuse.
 2.66k|  addEarlyCSEOrGVNPass();
 2.66k|  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
 2.66k|  addPass(createNaryReassociatePass());
 2.66k|  // NaryReassociate on GEPs creates redundant common expressions, so run
 2.66k|  // EarlyCSE after it.
 2.66k|  addPass(createEarlyCSEPass());
 2.66k|}
      |
 2.72k|void AMDGPUPassConfig::addIRPasses() {
 2.72k|  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
 2.72k|
 2.72k|  // There is no reason to run these.
 2.72k|  disablePass(&StackMapLivenessID);
 2.72k|  disablePass(&FuncletLayoutID);
 2.72k|  disablePass(&PatchableFunctionID);
 2.72k|
 2.72k|  // This must occur before inlining, as the inliner will not look through
 2.72k|  // bitcast calls.
 2.72k|  addPass(createAMDGPUFixFunctionBitcastsPass());
 2.72k|
 2.72k|  // A call to propagate attributes pass in the backend in case opt was not run.
 2.72k|  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
 2.72k|
 2.72k|  addPass(createAtomicExpandPass());
 2.72k|
 2.72k|
 2.72k|  addPass(createAMDGPULowerIntrinsicsPass());
 2.72k|
 2.72k|  // Function calls are not supported, so make sure we inline everything.
 2.72k|  addPass(createAMDGPUAlwaysInlinePass());
 2.72k|  addPass(createAlwaysInlinerLegacyPass());
 2.72k|  // We need to add the barrier noop pass, otherwise adding the function
 2.72k|  // inlining pass will cause all of the PassConfigs passes to be run
 2.72k|  // one function at a time, which means if we have a nodule with two
 2.72k|  // functions, then we will generate code for the first function
 2.72k|  // without ever running any passes on the second.
 2.72k|  addPass(createBarrierNoopPass());
 2.72k|
 2.72k|  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
 2.44k|    // TODO: May want to move later or split into an early and late one.
 2.44k|
 2.44k|    addPass(createAMDGPUCodeGenPreparePass());
 2.44k|  }
 2.72k|
 2.72k|  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
 2.72k|  if (TM.getTargetTriple().getArch() == Triple::r600)
   280|    addPass(createR600OpenCLImageTypeLoweringPass());
 2.72k|
 2.72k|  // Replace OpenCL enqueued block function pointers with global variables.
 2.72k|  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
 2.72k|
 2.72k|  if (TM.getOptLevel() > CodeGenOpt::None) {
 2.67k|    addPass(createInferAddressSpacesPass());
 2.67k|    addPass(createAMDGPUPromoteAlloca());
 2.67k|
 2.67k|    if (EnableSROA)
 2.64k|      addPass(createSROAPass());
 2.67k|
 2.67k|    if (EnableScalarIRPasses)
 2.66k|      addStraightLineScalarOptimizationPasses();
 2.67k|
 2.67k|    if (EnableAMDGPUAliasAnalysis) {
 2.65k|      addPass(createAMDGPUAAWrapperPass());
 2.65k|      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
  183k|                                             AAResults &AAR) {
  183k|        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
  183k|          AAR.addAAResult(WrapperPass->getResult());
  183k|        }));
 2.65k|    }
 2.67k|  }
 2.72k|
 2.72k|  TargetPassConfig::addIRPasses();
 2.72k|
 2.72k|  // EarlyCSE is not always strong enough to clean up what LSR produces. For
 2.72k|  // example, GVN can combine
 2.72k|  //
 2.72k|  //   %0 = add %a, %b
 2.72k|  //   %1 = add %b, %a
 2.72k|  //
 2.72k|  // and
 2.72k|  //
 2.72k|  //   %0 = shl nsw %a, 2
 2.72k|  //   %1 = shl %a, 2
 2.72k|  //
 2.72k|  // but EarlyCSE can do neither of them.
 2.72k|  if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
 2.66k|    addEarlyCSEOrGVNPass();
 2.72k|}
      |
 2.72k|void AMDGPUPassConfig::addCodeGenPrepare() {
 2.72k|  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
 2.44k|    addPass(createAMDGPUAnnotateKernelFeaturesPass());
 2.72k|
 2.72k|  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
 2.72k|      EnableLowerKernelArguments)
 2.44k|    addPass(createAMDGPULowerKernelArgumentsPass());
 2.72k|
 2.72k|  addPass(&AMDGPUPerfHintAnalysisID);
 2.72k|
 2.72k|  TargetPassConfig::addCodeGenPrepare();
 2.72k|
 2.72k|  if (EnableLoadStoreVectorizer)
 2.71k|    addPass(createLoadStoreVectorizerPass());
 2.72k|}
      |
 2.72k|bool AMDGPUPassConfig::addPreISel() {
 2.72k|  addPass(createLowerSwitchPass());
 2.72k|  addPass(createFlattenCFGPass());
 2.72k|  return false;
 2.72k|}
      |
 2.40k|bool AMDGPUPassConfig::addInstSelector() {
 2.40k|  // Defer the verifier until FinalizeISel.
 2.40k|  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
 2.40k|  return false;
 2.40k|}
      |
 2.72k|bool AMDGPUPassConfig::addGCPasses() {
 2.72k|  // Do nothing. GC is not supported.
 2.72k|  return false;
 2.72k|}
      |
      |//===----------------------------------------------------------------------===//
      |// R600 Pass Setup
      |//===----------------------------------------------------------------------===//
      |
   280|bool R600PassConfig::addPreISel() {
   280|  AMDGPUPassConfig::addPreISel();
   280|
   280|  if (EnableR600StructurizeCFG)
   278|    addPass(createStructurizeCFGPass());
   280|  return false;
   280|}
      |
   280|bool R600PassConfig::addInstSelector() {
   280|  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
   280|  return false;
   280|}
      |
   280|void R600PassConfig::addPreRegAlloc() {
   280|  addPass(createR600VectorRegMerger());
   280|}
      |
   280|void R600PassConfig::addPreSched2() {
   280|  addPass(createR600EmitClauseMarkers(), false);
   280|  if (EnableR600IfConvert)
   279|    addPass(&IfConverterID, false);
   280|  addPass(createR600ClauseMergePass(), false);
   280|}
      |
   280|void R600PassConfig::addPreEmitPass() {
   280|  addPass(createAMDGPUCFGStructurizerPass(), false);
   280|  addPass(createR600ExpandSpecialInstrsPass(), false);
   280|  addPass(&FinalizeMachineBundlesID, false);
   280|  addPass(createR600Packetizer(), false);
   280|  addPass(createR600ControlFlowFinalizer(), false);
   280|}
      |
   287|TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
   287|  return new R600PassConfig(*this, PM);
   287|}
      |
      |//===----------------------------------------------------------------------===//
      |// GCN Pass Setup
      |//===----------------------------------------------------------------------===//
      |
      |ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
 25.1k|  MachineSchedContext *C) const {
 25.1k|  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
 25.1k|  if (ST.enableSIScheduler())
     0|    return createSIMachineScheduler(C);
 25.1k|  return createGCNMaxOccupancyMachineScheduler(C);
 25.1k|}
      |
 2.44k|bool GCNPassConfig::addPreISel() {
 2.44k|  AMDGPUPassConfig::addPreISel();
 2.44k|
 2.44k|  if (EnableAtomicOptimizations) {
    18|    addPass(createAMDGPUAtomicOptimizerPass());
    18|  }
 2.44k|
 2.44k|  // FIXME: We need to run a pass to propagate the attributes when calls are
 2.44k|  // supported.
 2.44k|
 2.44k|  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
 2.44k|  // regions formed by them.
 2.44k|  addPass(&AMDGPUUnifyDivergentExitNodesID);
 2.44k|  if (!LateCFGStructurize) {
 2.44k|    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
 2.44k|  }
 2.44k|  addPass(createSinkingPass());
 2.44k|  addPass(createAMDGPUAnnotateUniformValues());
 2.44k|  if (!LateCFGStructurize) {
 2.44k|    addPass(createSIAnnotateControlFlowPass());
 2.44k|  }
 2.44k|  addPass(createLCSSAPass());
 2.44k|
 2.44k|  return false;
 2.44k|}
      |
 2.39k|void GCNPassConfig::addMachineSSAOptimization() {
 2.39k|  TargetPassConfig::addMachineSSAOptimization();
 2.39k|
 2.39k|  // We want to fold operands after PeepholeOptimizer has run (or as part of
 2.39k|  // it), because it will eliminate extra copies making it easier to fold the
 2.39k|  // real source operand. We want to eliminate dead instructions after, so that
 2.39k|  // we see fewer uses of the copies. We then need to clean up the dead
 2.39k|  // instructions leftover after the operands are folded as well.
 2.39k|  //
 2.39k|  // XXX - Can we get away without running DeadMachineInstructionElim again?
 2.39k|  addPass(&SIFoldOperandsID);
 2.39k|  if (EnableDPPCombine)
 2.38k|    addPass(&GCNDPPCombineID);
 2.39k|  addPass(&DeadMachineInstructionElimID);
 2.39k|  addPass(&SILoadStoreOptimizerID);
 2.39k|  if (EnableSDWAPeephole) {
 2.38k|    addPass(&SIPeepholeSDWAID);
 2.38k|    addPass(&EarlyMachineLICMID);
 2.38k|    addPass(&MachineCSEID);
 2.38k|    addPass(&SIFoldOperandsID);
 2.38k|    addPass(&DeadMachineInstructionElimID);
 2.38k|  }
 2.39k|  addPass(createSIShrinkInstructionsPass());
 2.39k|}
      |
 2.39k|bool GCNPassConfig::addILPOpts() {
 2.39k|  if (EnableEarlyIfConversion)
     5|    addPass(&EarlyIfConverterID);
 2.39k|
 2.39k|  TargetPassConfig::addILPOpts();
 2.39k|  return false;
 2.39k|}
      |
 2.40k|bool GCNPassConfig::addInstSelector() {
 2.40k|  AMDGPUPassConfig::addInstSelector();
 2.40k|  addPass(&SIFixSGPRCopiesID);
 2.40k|  addPass(createSILowerI1CopiesPass());
 2.40k|  addPass(createSIFixupVectorISelPass());
 2.40k|  addPass(createSIAddIMGInitPass());
 2.40k|  return false;
 2.40k|}
      |
    41|bool GCNPassConfig::addIRTranslator() {
    41|  addPass(new IRTranslator());
    41|  return false;
    41|}
      |
    41|bool GCNPassConfig::addLegalizeMachineIR() {
    41|  addPass(new Legalizer());
    41|  return false;
    41|}
      |
    41|bool GCNPassConfig::addRegBankSelect() {
    41|  addPass(new RegBankSelect());
    41|  return false;
    41|}
      |
    41|bool GCNPassConfig::addGlobalInstructionSelect() {
    41|  addPass(new InstructionSelect());
    41|  return false;
    41|}
      |
 2.44k|void GCNPassConfig::addPreRegAlloc() {
 2.44k|  if (LateCFGStructurize) {
     0|    addPass(createAMDGPUMachineCFGStructurizerPass());
     0|  }
 2.44k|  addPass(createSIWholeQuadModePass());
 2.44k|}
      |
    53|void GCNPassConfig::addFastRegAlloc() {
    53|  // FIXME: We have to disable the verifier here because of PHIElimination +
    53|  // TwoAddressInstructions disabling it.
    53|
    53|  // This must be run immediately after phi elimination and before
    53|  // TwoAddressInstructions, otherwise the processing of the tied operand of
    53|  // SI_ELSE will introduce a copy of the tied operand source after the else.
    53|  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
    53|
    53|  // This must be run just after RegisterCoalescing.
    53|  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
    53|
    53|  TargetPassConfig::addFastRegAlloc();
    53|}
      |
 2.39k|void GCNPassConfig::addOptimizedRegAlloc() {
 2.39k|  if (OptExecMaskPreRA) {
 2.39k|    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
 2.39k|    insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
 2.39k|  } else {
     1|    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
     1|  }
 2.39k|
 2.39k|  // This must be run immediately after phi elimination and before
 2.39k|  // TwoAddressInstructions, otherwise the processing of the tied operand of
 2.39k|  // SI_ELSE will introduce a copy of the tied operand source after the else.
 2.39k|  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
 2.39k|
 2.39k|  // This must be run just after RegisterCoalescing.
 2.39k|  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
 2.39k|
 2.39k|  if (EnableDCEInRA)
 2.38k|    insertPass(&RenameIndependentSubregsID, &DeadMachineInstructionElimID);
 2.39k|
 2.39k|  TargetPassConfig::addOptimizedRegAlloc();
 2.39k|}
      |
 2.39k|bool GCNPassConfig::addPreRewrite() {
 2.39k|  if (EnableRegReassign) {
 2.39k|    addPass(&GCNNSAReassignID);
 2.39k|    addPass(&GCNRegBankReassignID);
 2.39k|  }
 2.39k|  return true;
 2.39k|}
      |
 2.44k|void GCNPassConfig::addPostRegAlloc() {
 2.44k|  addPass(&SIFixVGPRCopiesID);
 2.44k|  if (getOptLevel() > CodeGenOpt::None)
 2.39k|    addPass(&SIOptimizeExecMaskingID);
 2.44k|  TargetPassConfig::addPostRegAlloc();
 2.44k|
 2.44k|  // Equivalent of PEI for SGPRs.
 2.44k|  addPass(&SILowerSGPRSpillsID);
 2.44k|}
      |
 2.44k|void GCNPassConfig::addPreSched2() {
 2.44k|}
      |
 2.44k|void GCNPassConfig::addPreEmitPass() {
 2.44k|  addPass(createSIMemoryLegalizerPass());
 2.44k|  addPass(createSIInsertWaitcntsPass());
 2.44k|  addPass(createSIShrinkInstructionsPass());
 2.44k|  addPass(createSIModeRegisterPass());
 2.44k|
 2.44k|  // The hazard recognizer that runs as part of the post-ra scheduler does not
 2.44k|  // guarantee to be able handle all hazards correctly. This is because if there
 2.44k|  // are multiple scheduling regions in a basic block, the regions are scheduled
 2.44k|  // bottom up, so when we begin to schedule a region we don't know what
 2.44k|  // instructions were emitted directly before it.
 2.44k|  //
 2.44k|  // Here we add a stand-alone hazard recognizer pass which can handle all
 2.44k|  // cases.
 2.44k|  //
 2.44k|  // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would
 2.44k|  // be better for it to emit S_NOP <N> when possible.
 2.44k|  addPass(&PostRAHazardRecognizerID);
 2.44k|
 2.44k|  addPass(&SIInsertSkipsPassID);
 2.44k|  addPass(&BranchRelaxationPassID);
 2.44k|}
      |
 3.46k|TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
 3.46k|  return new GCNPassConfig(*this, PM);
 3.46k|}
      |
 5.61k|yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
 5.61k|  return new yaml::SIMachineFunctionInfo();
 5.61k|}
      |
      |yaml::MachineFunctionInfo *
 5.80k|GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
 5.80k|  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 5.80k|  return new yaml::SIMachineFunctionInfo(*MFI,
 5.80k|                                         *MF.getSubtarget().getRegisterInfo());
 5.80k|}
      |
      |bool GCNTargetMachine::parseMachineFunctionInfo(
      |    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
 5.60k|    SMDiagnostic &Error, SMRange &SourceRange) const {
 5.60k|  const yaml::SIMachineFunctionInfo &YamlMFI =
 5.60k|      reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
 5.60k|  MachineFunction &MF = PFS.MF;
 5.60k|  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 5.60k|
 5.60k|  MFI->initializeBaseYamlFields(YamlMFI);
 5.60k|
 22.4k|  auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) {
 22.4k|    if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) {
     5|      SourceRange = RegName.SourceRange;
     5|      return true;
     5|    }
 22.4k|
 22.4k|    return false;
 22.4k|  };
 5.60k|
 5.60k|  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
     5|    // Create a diagnostic for a the register string literal.
     5|    const MemoryBuffer &Buffer =
     5|        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
     5|    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
     5|                         RegName.Value.size(), SourceMgr::DK_Error,
     5|                         "incorrect register class for field", RegName.Value,
     5|                         None, None);
     5|    SourceRange = RegName.SourceRange;
     5|    return true;
     5|  };
 5.60k|
 5.60k|  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
 5.60k|      parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
 5.60k|      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
 5.60k|      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
     5|    return true;
 5.60k|
 5.60k|  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
 5.60k|      !AMDGPU::SReg_128RegClass.contains(MFI->ScratchRSrcReg)) {
     2|    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
     2|  }
 5.60k|
 5.60k|  if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
 5.60k|      !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
     1|    return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
     1|  }
 5.60k|
 5.60k|  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
 5.60k|      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
     1|    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
     1|  }
 5.60k|
 5.60k|  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
 5.60k|      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
     1|    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
     1|  }
 5.59k|
 5.59k|  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
 5.59k|                                   const TargetRegisterClass &RC,
 5.59k|                                   ArgDescriptor &Arg, unsigned UserSGPRs,
 5.59k|                                   unsigned SystemSGPRs) {
   255|    // Skip parsing if it's not present.
   255|    if (!A)
   210|      return false;
    45|
    45|    if (A->IsRegister) {
    43|      unsigned Reg;
    43|      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
     0|        SourceRange = A->RegisterName.SourceRange;
     0|        return true;
     0|      }
    43|      if (!RC.contains(Reg))
     0|        return diagnoseRegisterClass(A->RegisterName);
    43|      Arg = ArgDescriptor::createRegister(Reg);
    43|    } else
     2|      Arg = ArgDescriptor::createStack(A->StackOffset);
    45|    // Check and apply the optional mask.
    45|    if (A->Mask)
     2|      Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());
    45|
    45|    MFI->NumUserSGPRs += UserSGPRs;
    45|    MFI->NumSystemSGPRs += SystemSGPRs;
    45|    return false;
    45|  };
 5.59k|
 5.59k|  if (YamlMFI.ArgInfo &&
 5.59k|      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
    15|                             AMDGPU::SReg_128RegClass,
    15|                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
    15|                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
    15|                             2, 0) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
    15|                             MFI->ArgInfo.QueuePtr, 2, 0) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
    15|                             AMDGPU::SReg_64RegClass,
    15|                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
    15|                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
    15|                             2, 0) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
    15|                             AMDGPU::SReg_64RegClass,
    15|                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
    15|                             AMDGPU::SGPR_32RegClass,
    15|                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
    15|                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
    15|                             0, 1) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
    15|                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
    15|                             0, 1) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
    15|                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
    15|                             0, 1) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
    15|                             AMDGPU::SGPR_32RegClass,
    15|                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
    15|                             AMDGPU::SGPR_32RegClass,
    15|                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
    15|                             AMDGPU::SReg_64RegClass,
    15|                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
    15|                             AMDGPU::SReg_64RegClass,
    15|                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
    15|                             AMDGPU::VGPR_32RegClass,
    15|                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
    15|                             AMDGPU::VGPR_32RegClass,
    15|                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
    15|       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
    15|                             AMDGPU::VGPR_32RegClass,
    15|                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
     0|    return true;
 5.59k|
 5.59k|  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
 5.59k|  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
 5.59k|
 5.59k|  return false;
 5.59k|}