Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineScheduler.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include <memory>

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global load scalarization.
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all functions early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

// Enable address space based alias analysis.
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to enable new waitcnt insertion pass.
static cl::opt<bool> EnableSIInsertWaitcntsPass(
  "enable-si-insert-waitcnts",
  cl::desc("Use new waitcnt insertion pass"),
  cl::init(true));

// Option to run late CFG structurizer.
static cl::opt<bool> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableAMDGPUFunctionCalls(
  "amdgpu-function-calls",
  cl::Hidden,
  cl::desc("Enable AMDGPU function call support"),
  cl::init(false));

// Enable library call simplifications.
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

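// Illustrative usage sketch (not part of the original file): each cl::opt
// above registers a global LLVM command-line flag, so any tool linking this
// target can toggle it. Assuming a plain llc invocation, one might write:
//
//   llc -march=amdgcn -amdgpu-sroa=0 -amdgpu-sdwa-peephole=0 input.ll
//
// Boolean options declared with cl::init(true) are switched off with "=0";
// hidden options still work but are omitted from -help output.
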
extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the target.
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertWaitsPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIInsertSkipsPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIDebuggerInsertNopsPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIFixWWMLivenessPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUInlinerPass(*PR);
}

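// Sketch of how the entry point above is reached (assumed usage, not code
// from this file): front ends and tools typically call the aggregate
// initializers from llvm/Support/TargetSelect.h, which fan out to every
// configured LLVMInitialize<Target>Target() function:
//
//   #include "llvm/Support/TargetSelect.h"
//
//   int main(int argc, char **argv) {
//     llvm::InitializeAllTargets();   // includes LLVMInitializeAMDGPUTarget()
//     llvm::InitializeAllTargetMCs();
//     // ... look up a Target in the TargetRegistry, then build a
//     // TargetMachine from a triple, CPU, and feature string ...
//   }
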
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return llvm::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                   createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

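// Illustrative note (assumption, not from this file): each
// MachineSchedRegistry entry above exposes its strategy by name through the
// generic -misched flag, e.g.:
//
//   llc -march=amdgcn -misched=gcn-minreg input.ll
//   llc -march=amdgcn -misched=gcn-max-occupancy-experimental input.ll
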
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
  if (TT.getEnvironmentName() == "amdgiz" ||
      TT.getEnvironmentName() == "amdgizcl")
    return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
      "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
      "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}

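// Reading the data layout strings above (informal gloss; the syntax is LLVM's
// standard data layout grammar): "e" means little-endian,
// "p<n>:<size>:<abi>" gives the size and ABI alignment in bits of pointers in
// address space <n> (a bare "p" is address space 0), "i64:64" aligns 64-bit
// integers to 64 bits, "v<w>:<align>" sets vector alignments, "n32:64" lists
// the native integer widths, and the trailing "-A5" marks address space 5 as
// the alloca address space. So "p5:32:32" above declares 32-bit private
// (scratch) pointers.
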
LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  if (TT.getArch() == Triple::amdgcn)
    return "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
  if (CM)
    return *CM;
  return CodeModel::Small;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initAsmInfo();
}

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}

static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
  return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
      if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
        AAR.addAAResult(WrapperPass->getResult());
      });
}

/// Predicate for Internalize pass.
bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  return !GV.use_empty();
}

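// Example of the predicate in action (illustrative IR, not from this file):
//
//   define amdgpu_kernel void @kernel() { ... }  ; entry CC    -> preserved
//   define void @helper() { ... }                ; non-entry   -> internalized
//   declare void @ext()                          ; declaration -> preserved
//
// Only entry functions and declarations keep their visibility when the
// Internalize pass below runs with mustPreserveGV as its predicate.
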
void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;

  if (EnableAMDGPUFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createAMDGPUFunctionInliningPass();
  }

  if (Internalize) {
    // If we're generating code, we always have the whole program available.
    // The relocations expected for externally visible functions aren't
    // supported, so make sure every non-entry function is hidden.
    Builder.addExtension(
      PassManagerBuilder::EP_EnabledOnOptLevel0,
      [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
        PM.add(createInternalizePass(mustPreserveGV));
      });
  }

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
                                         legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      if (Internalize) {
        PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createGlobalDCEPass());
      }
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA, LibCallSimplify](const PassManagerBuilder &,
                                legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(llvm::createAMDGPUSimplifyLibCallsPass());
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());
  });
}

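// Minimal driver sketch showing where adjustPassManager fits (assumed usage,
// not code from this file):
//
//   llvm::legacy::PassManager PM;
//   llvm::PassManagerBuilder Builder;
//   Builder.OptLevel = 2;
//   TM->adjustPassManager(Builder);         // installs the extensions above
//   Builder.populateModulePassManager(PM);
//   PM.run(M);                              // M is the llvm::Module to compile
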
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     Optional<CodeModel::Model> CM,
                                     CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
  setRequiresStructuredCFG(true);
}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

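// Illustrative IR showing what the per-function subtarget lookup above keys
// on (the attribute values are assumptions for this example):
//
//   define amdgpu_kernel void @k() #0 { ... }
//   attributes #0 = { "target-cpu"="gfx900"
//                     "target-features"="+fp32-denormals" }
//
// Functions with identical GPU and feature strings share one cached subtarget.
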
//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never
    // do anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {}

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph.
    // We allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis([this](const Function &F) {
    return TargetTransformInfo(AMDGPUTTIImpl(this, F));
  });
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN
  // or EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

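// Worked sketch of the SLSR + CSE interplay described above (illustrative IR,
// not from this file): given
//
//   %o0 = mul i64 %i, %stride
//   %i1 = add i64 %i, 1
//   %o1 = mul i64 %i1, %stride
//
// StraightLineStrengthReduce can rewrite %o1 as "%o0 + %stride", and the
// EarlyCSE/GVN run scheduled right after cleans up the redundant expressions
// this exposes.
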
void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPULowerIntrinsicsPass());

  if (TM.getTargetTriple().getArch() == Triple::r600 ||
      !EnableAMDGPUFunctionCalls) {
    // Function calls are not supported, so make sure we inline everything.
    addPass(createAMDGPUAlwaysInlinePass());
    addPass(createAlwaysInlinerLegacyPass());
    // We need to add the barrier noop pass, otherwise adding the function
    // inlining pass will cause all of the PassConfig's passes to be run
    // one function at a time, which means if we have a module with two
    // functions, then we will generate code for the first function
    // without ever running any passes on the second.
    addPass(createBarrierNoopPass());
  }

  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    // TODO: May want to move later or split into an early and late one.

    addPass(createAMDGPUCodeGenPreparePass());
  }

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  addPass(createAMDGPUOpenCLImageTypeLoweringPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());

    addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
        }));
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

bool R600PassConfig::addInstSelector() {
  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger());
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(), false);
  addPass(createR600ControlFlowFinalizer(), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(*this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(createAMDGPUAnnotateKernelFeaturesPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  }
  addPass(createSinkingPass());
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so
  // that we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole) {
    addPass(&SIPeepholeSDWAID);
    addPass(&MachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
    addPass(&DeadMachineInstructionElimID);
  }
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect());
  return false;
}

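// Illustrative note (assumption, not from this file): the four hooks above
// form the standard GlobalISel pipeline, IRTranslator -> Legalizer ->
// RegBankSelect -> InstructionSelect, which can be exercised with:
//
//   llc -march=amdgcn -global-isel input.ll
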
void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (EnableSIInsertWaitcntsPass)
    addPass(createSIInsertWaitcntsPass());
  else
    addPass(createSIInsertWaitsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(&SIInsertSkipsPassID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIDebuggerInsertNopsPass());
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}