/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | /// \file |
11 | | /// \brief The AMDGPU target machine contains all of the hardware specific |
12 | | /// information needed to emit code for R600 and SI GPUs. |
13 | | // |
14 | | //===----------------------------------------------------------------------===// |
15 | | |
16 | | #include "AMDGPUTargetMachine.h" |
17 | | #include "AMDGPU.h" |
18 | | #include "AMDGPUAliasAnalysis.h" |
19 | | #include "AMDGPUCallLowering.h" |
20 | | #include "AMDGPUInstructionSelector.h" |
21 | | #include "AMDGPULegalizerInfo.h" |
22 | | #include "AMDGPUMacroFusion.h" |
23 | | #include "AMDGPUTargetObjectFile.h" |
24 | | #include "AMDGPUTargetTransformInfo.h" |
25 | | #include "GCNIterativeScheduler.h" |
26 | | #include "GCNSchedStrategy.h" |
27 | | #include "R600MachineScheduler.h" |
28 | | #include "SIMachineScheduler.h" |
29 | | #include "llvm/CodeGen/GlobalISel/IRTranslator.h" |
30 | | #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" |
31 | | #include "llvm/CodeGen/GlobalISel/Legalizer.h" |
32 | | #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" |
33 | | #include "llvm/CodeGen/Passes.h" |
34 | | #include "llvm/CodeGen/TargetPassConfig.h" |
35 | | #include "llvm/IR/Attributes.h" |
36 | | #include "llvm/IR/Function.h" |
37 | | #include "llvm/IR/LegacyPassManager.h" |
38 | | #include "llvm/Pass.h" |
39 | | #include "llvm/Support/CommandLine.h" |
40 | | #include "llvm/Support/Compiler.h" |
41 | | #include "llvm/Support/TargetRegistry.h" |
42 | | #include "llvm/Target/TargetLoweringObjectFile.h" |
43 | | #include "llvm/Transforms/IPO.h" |
44 | | #include "llvm/Transforms/IPO/AlwaysInliner.h" |
45 | | #include "llvm/Transforms/IPO/PassManagerBuilder.h" |
46 | | #include "llvm/Transforms/Scalar.h" |
47 | | #include "llvm/Transforms/Scalar/GVN.h" |
48 | | #include "llvm/Transforms/Vectorize.h" |
49 | | #include <memory> |
50 | | |
51 | | using namespace llvm; |
52 | | |
53 | | static cl::opt<bool> EnableR600StructurizeCFG( |
54 | | "r600-ir-structurize", |
55 | | cl::desc("Use StructurizeCFG IR pass"), |
56 | | cl::init(true)); |
57 | | |
58 | | static cl::opt<bool> EnableSROA( |
59 | | "amdgpu-sroa", |
60 | | cl::desc("Run SROA after promote alloca pass"), |
61 | | cl::ReallyHidden, |
62 | | cl::init(true)); |
63 | | |
64 | | static cl::opt<bool> |
65 | | EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, |
66 | | cl::desc("Run early if-conversion"), |
67 | | cl::init(false)); |
68 | | |
69 | | static cl::opt<bool> EnableR600IfConvert( |
70 | | "r600-if-convert", |
71 | | cl::desc("Use if conversion pass"), |
72 | | cl::ReallyHidden, |
73 | | cl::init(true)); |
74 | | |
75 | | // Option to disable vectorizer for tests. |
76 | | static cl::opt<bool> EnableLoadStoreVectorizer( |
77 | | "amdgpu-load-store-vectorizer", |
78 | | cl::desc("Enable load store vectorizer"), |
79 | | cl::init(true), |
80 | | cl::Hidden); |
81 | | |
82 | | // Option to to control global loads scalarization |
83 | | static cl::opt<bool> ScalarizeGlobal( |
84 | | "amdgpu-scalarize-global-loads", |
85 | | cl::desc("Enable global load scalarization"), |
86 | | cl::init(true), |
87 | | cl::Hidden); |
88 | | |
89 | | // Option to run internalize pass. |
90 | | static cl::opt<bool> InternalizeSymbols( |
91 | | "amdgpu-internalize-symbols", |
92 | | cl::desc("Enable elimination of non-kernel functions and unused globals"), |
93 | | cl::init(false), |
94 | | cl::Hidden); |
95 | | |
96 | | // Option to inline all early. |
97 | | static cl::opt<bool> EarlyInlineAll( |
98 | | "amdgpu-early-inline-all", |
99 | | cl::desc("Inline all functions early"), |
100 | | cl::init(false), |
101 | | cl::Hidden); |
102 | | |
103 | | static cl::opt<bool> EnableSDWAPeephole( |
104 | | "amdgpu-sdwa-peephole", |
105 | | cl::desc("Enable SDWA peepholer"), |
106 | | cl::init(true)); |
107 | | |
108 | | // Enable address space based alias analysis |
109 | | static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, |
110 | | cl::desc("Enable AMDGPU Alias Analysis"), |
111 | | cl::init(true)); |
112 | | |
113 | | // Option to enable new waitcnt insertion pass. |
114 | | static cl::opt<bool> EnableSIInsertWaitcntsPass( |
115 | | "enable-si-insert-waitcnts", |
116 | | cl::desc("Use new waitcnt insertion pass"), |
117 | | cl::init(true)); |
118 | | |
119 | | // Option to run late CFG structurizer |
120 | | static cl::opt<bool> LateCFGStructurize( |
121 | | "amdgpu-late-structurize", |
122 | | cl::desc("Enable late CFG structurization"), |
123 | | cl::init(false), |
124 | | cl::Hidden); |
125 | | |
126 | | static cl::opt<bool> EnableAMDGPUFunctionCalls( |
127 | | "amdgpu-function-calls", |
128 | | cl::Hidden, |
129 | | cl::desc("Enable AMDGPU function call support"), |
130 | | cl::init(false)); |
131 | | |
132 | | // Enable lib calls simplifications |
133 | | static cl::opt<bool> EnableLibCallSimplify( |
134 | | "amdgpu-simplify-libcall", |
135 | | cl::desc("Enable mdgpu library simplifications"), |
136 | | cl::init(true), |
137 | | cl::Hidden); |
138 | | |
139 | 123k | extern "C" void LLVMInitializeAMDGPUTarget() { |
140 | 123k | // Register the target |
141 | 123k | RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); |
142 | 123k | RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget()); |
143 | 123k | |
144 | 123k | PassRegistry *PR = PassRegistry::getPassRegistry(); |
145 | 123k | initializeR600ClauseMergePassPass(*PR); |
146 | 123k | initializeR600ControlFlowFinalizerPass(*PR); |
147 | 123k | initializeR600PacketizerPass(*PR); |
148 | 123k | initializeR600ExpandSpecialInstrsPassPass(*PR); |
149 | 123k | initializeR600VectorRegMergerPass(*PR); |
150 | 123k | initializeAMDGPUDAGToDAGISelPass(*PR); |
151 | 123k | initializeSILowerI1CopiesPass(*PR); |
152 | 123k | initializeSIFixSGPRCopiesPass(*PR); |
153 | 123k | initializeSIFixVGPRCopiesPass(*PR); |
154 | 123k | initializeSIFoldOperandsPass(*PR); |
155 | 123k | initializeSIPeepholeSDWAPass(*PR); |
156 | 123k | initializeSIShrinkInstructionsPass(*PR); |
157 | 123k | initializeSIOptimizeExecMaskingPreRAPass(*PR); |
158 | 123k | initializeSILoadStoreOptimizerPass(*PR); |
159 | 123k | initializeAMDGPUAlwaysInlinePass(*PR); |
160 | 123k | initializeAMDGPUAnnotateKernelFeaturesPass(*PR); |
161 | 123k | initializeAMDGPUAnnotateUniformValuesPass(*PR); |
162 | 123k | initializeAMDGPUArgumentUsageInfoPass(*PR); |
163 | 123k | initializeAMDGPULowerIntrinsicsPass(*PR); |
164 | 123k | initializeAMDGPUPromoteAllocaPass(*PR); |
165 | 123k | initializeAMDGPUCodeGenPreparePass(*PR); |
166 | 123k | initializeAMDGPURewriteOutArgumentsPass(*PR); |
167 | 123k | initializeAMDGPUUnifyMetadataPass(*PR); |
168 | 123k | initializeSIAnnotateControlFlowPass(*PR); |
169 | 123k | initializeSIInsertWaitsPass(*PR); |
170 | 123k | initializeSIInsertWaitcntsPass(*PR); |
171 | 123k | initializeSIWholeQuadModePass(*PR); |
172 | 123k | initializeSILowerControlFlowPass(*PR); |
173 | 123k | initializeSIInsertSkipsPass(*PR); |
174 | 123k | initializeSIMemoryLegalizerPass(*PR); |
175 | 123k | initializeSIDebuggerInsertNopsPass(*PR); |
176 | 123k | initializeSIOptimizeExecMaskingPass(*PR); |
177 | 123k | initializeSIFixWWMLivenessPass(*PR); |
178 | 123k | initializeAMDGPUUnifyDivergentExitNodesPass(*PR); |
179 | 123k | initializeAMDGPUAAWrapperPassPass(*PR); |
180 | 123k | initializeAMDGPUUseNativeCallsPass(*PR); |
181 | 123k | initializeAMDGPUSimplifyLibCallsPass(*PR); |
182 | 123k | initializeAMDGPUInlinerPass(*PR); |
183 | 123k | } |
184 | | |
185 | 2.07k | static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { |
186 | 2.07k | return llvm::make_unique<AMDGPUTargetObjectFile>(); |
187 | 2.07k | } |
188 | | |
189 | 2.05k | static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { |
190 | 2.05k | return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>()); |
191 | 2.05k | } |
192 | | |
193 | 1 | static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { |
194 | 1 | return new SIScheduleDAGMI(C); |
195 | 1 | } |
196 | | |
197 | | static ScheduleDAGInstrs * |
198 | 14.8k | createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { |
199 | 14.8k | ScheduleDAGMILive *DAG = |
200 | 14.8k | new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C)); |
201 | 14.8k | DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); |
202 | 14.8k | DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); |
203 | 14.8k | DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); |
204 | 14.8k | return DAG; |
205 | 14.8k | } |
206 | | |
207 | | static ScheduleDAGInstrs * |
208 | 3 | createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { |
209 | 3 | auto DAG = new GCNIterativeScheduler(C, |
210 | 3 | GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); |
211 | 3 | DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); |
212 | 3 | DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); |
213 | 3 | return DAG; |
214 | 3 | } |
215 | | |
216 | 3 | static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { |
217 | 3 | return new GCNIterativeScheduler(C, |
218 | 3 | GCNIterativeScheduler::SCHEDULE_MINREGFORCED); |
219 | 3 | } |
220 | | |
221 | | static MachineSchedRegistry |
222 | | R600SchedRegistry("r600", "Run R600's custom scheduler", |
223 | | createR600MachineScheduler); |
224 | | |
225 | | static MachineSchedRegistry |
226 | | SISchedRegistry("si", "Run SI's custom scheduler", |
227 | | createSIMachineScheduler); |
228 | | |
229 | | static MachineSchedRegistry |
230 | | GCNMaxOccupancySchedRegistry("gcn-max-occupancy", |
231 | | "Run GCN scheduler to maximize occupancy", |
232 | | createGCNMaxOccupancyMachineScheduler); |
233 | | |
234 | | static MachineSchedRegistry |
235 | | IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental", |
236 | | "Run GCN scheduler to maximize occupancy (experimental)", |
237 | | createIterativeGCNMaxOccupancyMachineScheduler); |
238 | | |
239 | | static MachineSchedRegistry |
240 | | GCNMinRegSchedRegistry("gcn-minreg", |
241 | | "Run GCN iterative scheduler for minimal register usage (experimental)", |
242 | | createMinRegScheduler); |
243 | | |
244 | 2.07k | static StringRef computeDataLayout(const Triple &TT) { |
245 | 2.07k | if (TT.getArch() == Triple::r6002.07k ) { |
246 | 255 | // 32-bit pointers. |
247 | 255 | return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" |
248 | 255 | "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; |
249 | 255 | } |
250 | 1.82k | |
251 | 1.82k | // 32-bit private, local, and region pointers. 64-bit global, constant and |
252 | 1.82k | // flat. |
253 | 1.82k | if (1.82k TT.getEnvironmentName() == "amdgiz" || |
254 | 1.81k | TT.getEnvironmentName() == "amdgizcl") |
255 | 24 | return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32" |
256 | 24 | "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" |
257 | 24 | "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; |
258 | 1.79k | return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" |
259 | 1.79k | "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" |
260 | 1.79k | "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; |
261 | 1.79k | } |
262 | | |
263 | | LLVM_READNONE |
264 | 2.07k | static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { |
265 | 2.07k | if (!GPU.empty()) |
266 | 1.44k | return GPU; |
267 | 637 | |
268 | 637 | if (637 TT.getArch() == Triple::amdgcn637 ) |
269 | 609 | return "generic"; |
270 | 28 | |
271 | 28 | return "r600"; |
272 | 28 | } |
273 | | |
274 | 2.07k | static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { |
275 | 2.07k | // The AMDGPU toolchain only supports generating shared objects, so we |
276 | 2.07k | // must always use PIC. |
277 | 2.07k | return Reloc::PIC_; |
278 | 2.07k | } |
279 | | |
280 | 2.07k | static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) { |
281 | 2.07k | if (CM) |
282 | 0 | return *CM; |
283 | 2.07k | return CodeModel::Small; |
284 | 2.07k | } |
285 | | |
286 | | AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, |
287 | | StringRef CPU, StringRef FS, |
288 | | TargetOptions Options, |
289 | | Optional<Reloc::Model> RM, |
290 | | Optional<CodeModel::Model> CM, |
291 | | CodeGenOpt::Level OptLevel) |
292 | | : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), |
293 | | FS, Options, getEffectiveRelocModel(RM), |
294 | | getEffectiveCodeModel(CM), OptLevel), |
295 | 2.07k | TLOF(createTLOF(getTargetTriple())) { |
296 | 2.07k | AS = AMDGPU::getAMDGPUAS(TT); |
297 | 2.07k | initAsmInfo(); |
298 | 2.07k | } |
299 | | |
300 | 2.06k | AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; |
301 | | |
302 | 407k | StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { |
303 | 407k | Attribute GPUAttr = F.getFnAttribute("target-cpu"); |
304 | 407k | return GPUAttr.hasAttribute(Attribute::None) ? |
305 | 407k | getTargetCPU()93.9k : GPUAttr.getValueAsString()313k ; |
306 | 407k | } |
307 | | |
308 | 407k | StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { |
309 | 407k | Attribute FSAttr = F.getFnAttribute("target-features"); |
310 | 407k | |
311 | 407k | return FSAttr.hasAttribute(Attribute::None) ? |
312 | 260k | getTargetFeatureString() : |
313 | 147k | FSAttr.getValueAsString(); |
314 | 407k | } |
315 | | |
316 | 110 | static ImmutablePass *createAMDGPUExternalAAWrapperPass() { |
317 | 12.6k | return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) { |
318 | 12.6k | if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) |
319 | 12.6k | AAR.addAAResult(WrapperPass->getResult()); |
320 | 12.6k | }); |
321 | 110 | } |
322 | | |
323 | | /// Predicate for Internalize pass. |
324 | 14 | bool mustPreserveGV(const GlobalValue &GV) { |
325 | 14 | if (const Function *F = dyn_cast<Function>(&GV)) |
326 | 10 | return F->isDeclaration() || 10 AMDGPU::isEntryFunctionCC(F->getCallingConv())10 ; |
327 | 4 | |
328 | 4 | return !GV.use_empty(); |
329 | 4 | } |
330 | | |
331 | 97 | void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { |
332 | 97 | Builder.DivergentTarget = true; |
333 | 97 | |
334 | 97 | bool EnableOpt = getOptLevel() > CodeGenOpt::None; |
335 | 97 | bool Internalize = InternalizeSymbols; |
336 | 97 | bool EarlyInline = EarlyInlineAll && EnableOpt1 && !EnableAMDGPUFunctionCalls1 ; |
337 | 97 | bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt; |
338 | 97 | bool LibCallSimplify = EnableLibCallSimplify && EnableOpt; |
339 | 97 | |
340 | 97 | if (EnableAMDGPUFunctionCalls97 ) { |
341 | 2 | delete Builder.Inliner; |
342 | 2 | Builder.Inliner = createAMDGPUFunctionInliningPass(); |
343 | 2 | } |
344 | 97 | |
345 | 97 | if (Internalize97 ) { |
346 | 2 | // If we're generating code, we always have the whole program available. The |
347 | 2 | // relocations expected for externally visible functions aren't supported, |
348 | 2 | // so make sure every non-entry function is hidden. |
349 | 2 | Builder.addExtension( |
350 | 2 | PassManagerBuilder::EP_EnabledOnOptLevel0, |
351 | 1 | [](const PassManagerBuilder &, legacy::PassManagerBase &PM) { |
352 | 1 | PM.add(createInternalizePass(mustPreserveGV)); |
353 | 1 | }); |
354 | 2 | } |
355 | 97 | |
356 | 97 | Builder.addExtension( |
357 | 97 | PassManagerBuilder::EP_ModuleOptimizerEarly, |
358 | 97 | [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &, |
359 | 55 | legacy::PassManagerBase &PM) { |
360 | 55 | if (AMDGPUAA55 ) { |
361 | 55 | PM.add(createAMDGPUAAWrapperPass()); |
362 | 55 | PM.add(createAMDGPUExternalAAWrapperPass()); |
363 | 55 | } |
364 | 55 | PM.add(createAMDGPUUnifyMetadataPass()); |
365 | 55 | if (Internalize55 ) { |
366 | 1 | PM.add(createInternalizePass(mustPreserveGV)); |
367 | 1 | PM.add(createGlobalDCEPass()); |
368 | 1 | } |
369 | 55 | if (EarlyInline) |
370 | 1 | PM.add(createAMDGPUAlwaysInlinePass(false)); |
371 | 55 | }); |
372 | 97 | |
373 | 97 | Builder.addExtension( |
374 | 97 | PassManagerBuilder::EP_EarlyAsPossible, |
375 | 97 | [AMDGPUAA, LibCallSimplify](const PassManagerBuilder &, |
376 | 97 | legacy::PassManagerBase &PM) { |
377 | 97 | if (AMDGPUAA97 ) { |
378 | 55 | PM.add(createAMDGPUAAWrapperPass()); |
379 | 55 | PM.add(createAMDGPUExternalAAWrapperPass()); |
380 | 55 | } |
381 | 97 | PM.add(llvm::createAMDGPUUseNativeCallsPass()); |
382 | 97 | if (LibCallSimplify) |
383 | 55 | PM.add(llvm::createAMDGPUSimplifyLibCallsPass()); |
384 | 97 | }); |
385 | 97 | |
386 | 97 | Builder.addExtension( |
387 | 97 | PassManagerBuilder::EP_CGSCCOptimizerLate, |
388 | 55 | [](const PassManagerBuilder &, legacy::PassManagerBase &PM) { |
389 | 55 | // Add infer address spaces pass to the opt pipeline after inlining |
390 | 55 | // but before SROA to increase SROA opportunities. |
391 | 55 | PM.add(createInferAddressSpacesPass()); |
392 | 55 | }); |
393 | 97 | } |
394 | | |
395 | | //===----------------------------------------------------------------------===// |
396 | | // R600 Target Machine (R600 -> Cayman) |
397 | | //===----------------------------------------------------------------------===// |
398 | | |
399 | | R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, |
400 | | StringRef CPU, StringRef FS, |
401 | | TargetOptions Options, |
402 | | Optional<Reloc::Model> RM, |
403 | | Optional<CodeModel::Model> CM, |
404 | | CodeGenOpt::Level OL, bool JIT) |
405 | 255 | : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) { |
406 | 255 | setRequiresStructuredCFG(true); |
407 | 255 | } |
408 | | |
409 | | const R600Subtarget *R600TargetMachine::getSubtargetImpl( |
410 | 38.5k | const Function &F) const { |
411 | 38.5k | StringRef GPU = getGPUName(F); |
412 | 38.5k | StringRef FS = getFeatureString(F); |
413 | 38.5k | |
414 | 38.5k | SmallString<128> SubtargetKey(GPU); |
415 | 38.5k | SubtargetKey.append(FS); |
416 | 38.5k | |
417 | 38.5k | auto &I = SubtargetMap[SubtargetKey]; |
418 | 38.5k | if (!I38.5k ) { |
419 | 253 | // This needs to be done before we create a new subtarget since any |
420 | 253 | // creation will depend on the TM and the code generation flags on the |
421 | 253 | // function that reside in TargetOptions. |
422 | 253 | resetTargetOptions(F); |
423 | 253 | I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this); |
424 | 253 | } |
425 | 38.5k | |
426 | 38.5k | return I.get(); |
427 | 38.5k | } |
428 | | |
429 | | //===----------------------------------------------------------------------===// |
430 | | // GCN Target Machine (SI+) |
431 | | //===----------------------------------------------------------------------===// |
432 | | |
433 | | GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, |
434 | | StringRef CPU, StringRef FS, |
435 | | TargetOptions Options, |
436 | | Optional<Reloc::Model> RM, |
437 | | Optional<CodeModel::Model> CM, |
438 | | CodeGenOpt::Level OL, bool JIT) |
439 | 1.82k | : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} |
440 | | |
441 | 368k | const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { |
442 | 368k | StringRef GPU = getGPUName(F); |
443 | 368k | StringRef FS = getFeatureString(F); |
444 | 368k | |
445 | 368k | SmallString<128> SubtargetKey(GPU); |
446 | 368k | SubtargetKey.append(FS); |
447 | 368k | |
448 | 368k | auto &I = SubtargetMap[SubtargetKey]; |
449 | 368k | if (!I368k ) { |
450 | 1.81k | // This needs to be done before we create a new subtarget since any |
451 | 1.81k | // creation will depend on the TM and the code generation flags on the |
452 | 1.81k | // function that reside in TargetOptions. |
453 | 1.81k | resetTargetOptions(F); |
454 | 1.81k | I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this); |
455 | 1.81k | } |
456 | 368k | |
457 | 368k | I->setScalarizeGlobalBehavior(ScalarizeGlobal); |
458 | 368k | |
459 | 368k | return I.get(); |
460 | 368k | } |
461 | | |
462 | | //===----------------------------------------------------------------------===// |
463 | | // AMDGPU Pass Setup |
464 | | //===----------------------------------------------------------------------===// |
465 | | |
466 | | namespace { |
467 | | |
468 | | class AMDGPUPassConfig : public TargetPassConfig { |
469 | | public: |
470 | | AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) |
471 | 1.97k | : TargetPassConfig(TM, PM) { |
472 | 1.97k | // Exceptions and StackMaps are not supported, so these passes will never do |
473 | 1.97k | // anything. |
474 | 1.97k | disablePass(&StackMapLivenessID); |
475 | 1.97k | disablePass(&FuncletLayoutID); |
476 | 1.97k | } |
477 | | |
478 | 3.44k | AMDGPUTargetMachine &getAMDGPUTargetMachine() const { |
479 | 3.44k | return getTM<AMDGPUTargetMachine>(); |
480 | 3.44k | } |
481 | | |
482 | | ScheduleDAGInstrs * |
483 | 0 | createMachineScheduler(MachineSchedContext *C) const override { |
484 | 0 | ScheduleDAGMILive *DAG = createGenericSchedLive(C); |
485 | 0 | DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); |
486 | 0 | DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); |
487 | 0 | return DAG; |
488 | 0 | } |
489 | | |
490 | | void addEarlyCSEOrGVNPass(); |
491 | | void addStraightLineScalarOptimizationPasses(); |
492 | | void addIRPasses() override; |
493 | | void addCodeGenPrepare() override; |
494 | | bool addPreISel() override; |
495 | | bool addInstSelector() override; |
496 | | bool addGCPasses() override; |
497 | | }; |
498 | | |
499 | | class R600PassConfig final : public AMDGPUPassConfig { |
500 | | public: |
501 | | R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) |
502 | 248 | : AMDGPUPassConfig(TM, PM) {} |
503 | | |
504 | | ScheduleDAGInstrs *createMachineScheduler( |
505 | 2.05k | MachineSchedContext *C) const override { |
506 | 2.05k | return createR600MachineScheduler(C); |
507 | 2.05k | } |
508 | | |
509 | | bool addPreISel() override; |
510 | | bool addInstSelector() override; |
511 | | void addPreRegAlloc() override; |
512 | | void addPreSched2() override; |
513 | | void addPreEmitPass() override; |
514 | | }; |
515 | | |
516 | | class GCNPassConfig final : public AMDGPUPassConfig { |
517 | | public: |
518 | | GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) |
519 | 1.72k | : AMDGPUPassConfig(TM, PM) { |
520 | 1.72k | // It is necessary to know the register usage of the entire call graph. We |
521 | 1.72k | // allow calls without EnableAMDGPUFunctionCalls if they are marked |
522 | 1.72k | // noinline, so this is always required. |
523 | 1.72k | setRequiresCodeGenSCCOrder(true); |
524 | 1.72k | } |
525 | | |
526 | 0 | GCNTargetMachine &getGCNTargetMachine() const { |
527 | 0 | return getTM<GCNTargetMachine>(); |
528 | 0 | } |
529 | | |
530 | | ScheduleDAGInstrs * |
531 | | createMachineScheduler(MachineSchedContext *C) const override; |
532 | | |
533 | | bool addPreISel() override; |
534 | | void addMachineSSAOptimization() override; |
535 | | bool addILPOpts() override; |
536 | | bool addInstSelector() override; |
537 | | bool addIRTranslator() override; |
538 | | bool addLegalizeMachineIR() override; |
539 | | bool addRegBankSelect() override; |
540 | | bool addGlobalInstructionSelect() override; |
541 | | void addFastRegAlloc(FunctionPass *RegAllocPass) override; |
542 | | void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; |
543 | | void addPreRegAlloc() override; |
544 | | void addPostRegAlloc() override; |
545 | | void addPreSched2() override; |
546 | | void addPreEmitPass() override; |
547 | | }; |
548 | | |
549 | | } // end anonymous namespace |
550 | | |
551 | 2.16k | TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { |
552 | 290k | return TargetIRAnalysis([this](const Function &F) { |
553 | 290k | return TargetTransformInfo(AMDGPUTTIImpl(this, F)); |
554 | 290k | }); |
555 | 2.16k | } |
556 | | |
557 | 3.36k | void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { |
558 | 3.36k | if (getOptLevel() == CodeGenOpt::Aggressive) |
559 | 0 | addPass(createGVNPass()); |
560 | 3.36k | else |
561 | 3.36k | addPass(createEarlyCSEPass()); |
562 | 3.36k | } |
563 | | |
564 | 1.68k | void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { |
565 | 1.68k | addPass(createSeparateConstOffsetFromGEPPass()); |
566 | 1.68k | addPass(createSpeculativeExecutionPass()); |
567 | 1.68k | // ReassociateGEPs exposes more opportunites for SLSR. See |
568 | 1.68k | // the example in reassociate-geps-and-slsr.ll. |
569 | 1.68k | addPass(createStraightLineStrengthReducePass()); |
570 | 1.68k | // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or |
571 | 1.68k | // EarlyCSE can reuse. |
572 | 1.68k | addEarlyCSEOrGVNPass(); |
573 | 1.68k | // Run NaryReassociate after EarlyCSE/GVN to be more effective. |
574 | 1.68k | addPass(createNaryReassociatePass()); |
575 | 1.68k | // NaryReassociate on GEPs creates redundant common expressions, so run |
576 | 1.68k | // EarlyCSE after it. |
577 | 1.68k | addPass(createEarlyCSEPass()); |
578 | 1.68k | } |
579 | | |
580 | 1.72k | void AMDGPUPassConfig::addIRPasses() { |
581 | 1.72k | const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); |
582 | 1.72k | |
583 | 1.72k | // There is no reason to run these. |
584 | 1.72k | disablePass(&StackMapLivenessID); |
585 | 1.72k | disablePass(&FuncletLayoutID); |
586 | 1.72k | disablePass(&PatchableFunctionID); |
587 | 1.72k | |
588 | 1.72k | addPass(createAMDGPULowerIntrinsicsPass()); |
589 | 1.72k | |
590 | 1.72k | if (TM.getTargetTriple().getArch() == Triple::r600 || |
591 | 1.72k | !EnableAMDGPUFunctionCalls1.48k ) { |
592 | 1.72k | // Function calls are not supported, so make sure we inline everything. |
593 | 1.72k | addPass(createAMDGPUAlwaysInlinePass()); |
594 | 1.72k | addPass(createAlwaysInlinerLegacyPass()); |
595 | 1.72k | // We need to add the barrier noop pass, otherwise adding the function |
596 | 1.72k | // inlining pass will cause all of the PassConfigs passes to be run |
597 | 1.72k | // one function at a time, which means if we have a nodule with two |
598 | 1.72k | // functions, then we will generate code for the first function |
599 | 1.72k | // without ever running any passes on the second. |
600 | 1.72k | addPass(createBarrierNoopPass()); |
601 | 1.72k | } |
602 | 1.72k | |
603 | 1.72k | if (TM.getTargetTriple().getArch() == Triple::amdgcn1.72k ) { |
604 | 1.48k | // TODO: May want to move later or split into an early and late one. |
605 | 1.48k | |
606 | 1.48k | addPass(createAMDGPUCodeGenPreparePass()); |
607 | 1.48k | } |
608 | 1.72k | |
609 | 1.72k | // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. |
610 | 1.72k | addPass(createAMDGPUOpenCLImageTypeLoweringPass()); |
611 | 1.72k | |
612 | 1.72k | if (TM.getOptLevel() > CodeGenOpt::None1.72k ) { |
613 | 1.68k | addPass(createInferAddressSpacesPass()); |
614 | 1.68k | addPass(createAMDGPUPromoteAlloca()); |
615 | 1.68k | |
616 | 1.68k | if (EnableSROA) |
617 | 1.66k | addPass(createSROAPass()); |
618 | 1.68k | |
619 | 1.68k | addStraightLineScalarOptimizationPasses(); |
620 | 1.68k | |
621 | 1.68k | if (EnableAMDGPUAliasAnalysis1.68k ) { |
622 | 1.66k | addPass(createAMDGPUAAWrapperPass()); |
623 | 1.66k | addPass(createExternalAAWrapperPass([](Pass &P, Function &, |
624 | 78.5k | AAResults &AAR) { |
625 | 78.5k | if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) |
626 | 78.5k | AAR.addAAResult(WrapperPass->getResult()); |
627 | 78.5k | })); |
628 | 1.66k | } |
629 | 1.68k | } |
630 | 1.72k | |
631 | 1.72k | TargetPassConfig::addIRPasses(); |
632 | 1.72k | |
633 | 1.72k | // EarlyCSE is not always strong enough to clean up what LSR produces. For |
634 | 1.72k | // example, GVN can combine |
635 | 1.72k | // |
636 | 1.72k | // %0 = add %a, %b |
637 | 1.72k | // %1 = add %b, %a |
638 | 1.72k | // |
639 | 1.72k | // and |
640 | 1.72k | // |
641 | 1.72k | // %0 = shl nsw %a, 2 |
642 | 1.72k | // %1 = shl %a, 2 |
643 | 1.72k | // |
644 | 1.72k | // but EarlyCSE can do neither of them. |
645 | 1.72k | if (getOptLevel() != CodeGenOpt::None) |
646 | 1.68k | addEarlyCSEOrGVNPass(); |
647 | 1.72k | } |
648 | | |
649 | 1.72k | void AMDGPUPassConfig::addCodeGenPrepare() { |
650 | 1.72k | TargetPassConfig::addCodeGenPrepare(); |
651 | 1.72k | |
652 | 1.72k | if (EnableLoadStoreVectorizer) |
653 | 1.71k | addPass(createLoadStoreVectorizerPass()); |
654 | 1.72k | } |
655 | | |
656 | 1.72k | bool AMDGPUPassConfig::addPreISel() { |
657 | 1.72k | addPass(createFlattenCFGPass()); |
658 | 1.72k | return false; |
659 | 1.72k | } |
660 | | |
661 | 1.47k | bool AMDGPUPassConfig::addInstSelector() { |
662 | 1.47k | addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel())); |
663 | 1.47k | return false; |
664 | 1.47k | } |
665 | | |
666 | 1.72k | bool AMDGPUPassConfig::addGCPasses() { |
667 | 1.72k | // Do nothing. GC is not supported. |
668 | 1.72k | return false; |
669 | 1.72k | } |
670 | | |
671 | | //===----------------------------------------------------------------------===// |
672 | | // R600 Pass Setup |
673 | | //===----------------------------------------------------------------------===// |
674 | | |
675 | 244 | bool R600PassConfig::addPreISel() { |
676 | 244 | AMDGPUPassConfig::addPreISel(); |
677 | 244 | |
678 | 244 | if (EnableR600StructurizeCFG) |
679 | 242 | addPass(createStructurizeCFGPass()); |
680 | 244 | return false; |
681 | 244 | } |
682 | | |
683 | 244 | bool R600PassConfig::addInstSelector() { |
684 | 244 | addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel())); |
685 | 244 | return false; |
686 | 244 | } |
687 | | |
688 | 244 | void R600PassConfig::addPreRegAlloc() { |
689 | 244 | addPass(createR600VectorRegMerger()); |
690 | 244 | } |
691 | | |
692 | 244 | void R600PassConfig::addPreSched2() { |
693 | 244 | addPass(createR600EmitClauseMarkers(), false); |
694 | 244 | if (EnableR600IfConvert) |
695 | 243 | addPass(&IfConverterID, false); |
696 | 244 | addPass(createR600ClauseMergePass(), false); |
697 | 244 | } |
698 | | |
699 | 244 | void R600PassConfig::addPreEmitPass() { |
700 | 244 | addPass(createAMDGPUCFGStructurizerPass(), false); |
701 | 244 | addPass(createR600ExpandSpecialInstrsPass(), false); |
702 | 244 | addPass(&FinalizeMachineBundlesID, false); |
703 | 244 | addPass(createR600Packetizer(), false); |
704 | 244 | addPass(createR600ControlFlowFinalizer(), false); |
705 | 244 | } |
706 | | |
707 | 248 | TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { |
708 | 248 | return new R600PassConfig(*this, PM); |
709 | 248 | } |
710 | | |
711 | | //===----------------------------------------------------------------------===// |
712 | | // GCN Pass Setup |
713 | | //===----------------------------------------------------------------------===// |
714 | | |
715 | | ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( |
716 | 14.8k | MachineSchedContext *C) const { |
717 | 14.8k | const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>(); |
718 | 14.8k | if (ST.enableSIScheduler()) |
719 | 0 | return createSIMachineScheduler(C); |
720 | 14.8k | return createGCNMaxOccupancyMachineScheduler(C); |
721 | 14.8k | } |
722 | | |
723 | 1.48k | bool GCNPassConfig::addPreISel() { |
724 | 1.48k | AMDGPUPassConfig::addPreISel(); |
725 | 1.48k | |
726 | 1.48k | // FIXME: We need to run a pass to propagate the attributes when calls are |
727 | 1.48k | // supported. |
728 | 1.48k | addPass(createAMDGPUAnnotateKernelFeaturesPass()); |
729 | 1.48k | |
730 | 1.48k | // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit |
731 | 1.48k | // regions formed by them. |
732 | 1.48k | addPass(&AMDGPUUnifyDivergentExitNodesID); |
733 | 1.48k | if (!LateCFGStructurize1.48k ) { |
734 | 1.48k | addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions |
735 | 1.48k | } |
736 | 1.48k | addPass(createSinkingPass()); |
737 | 1.48k | addPass(createAMDGPUAnnotateUniformValues()); |
738 | 1.48k | if (!LateCFGStructurize1.48k ) { |
739 | 1.48k | addPass(createSIAnnotateControlFlowPass()); |
740 | 1.48k | } |
741 | 1.48k | |
742 | 1.48k | return false; |
743 | 1.48k | } |
744 | | |
/// GCN additions to the generic machine-SSA optimization pipeline.
void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole) {
    addPass(&SIPeepholeSDWAID);
    // SDWA conversion can expose new hoisting/CSE/folding opportunities, so
    // rerun those cleanups plus one more dead-code sweep afterwards.
    addPass(&MachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
    addPass(&DeadMachineInstructionElimID);
  }
  addPass(createSIShrinkInstructionsPass());
}
767 | | |
/// ILP-oriented optimizations; early if-conversion is opt-in via command-line
/// flag, then the generic ILP passes run.
bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}
775 | | |
/// Install the common AMDGPU instruction selector, then GCN-specific fixups
/// that must run immediately after selection: lowering of i1 copies and
/// repair of illegal VGPR-to-SGPR copies.
bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}
782 | | |
783 | 5 | bool GCNPassConfig::addIRTranslator() { |
784 | 5 | addPass(new IRTranslator()); |
785 | 5 | return false; |
786 | 5 | } |
787 | | |
788 | 5 | bool GCNPassConfig::addLegalizeMachineIR() { |
789 | 5 | addPass(new Legalizer()); |
790 | 5 | return false; |
791 | 5 | } |
792 | | |
793 | 5 | bool GCNPassConfig::addRegBankSelect() { |
794 | 5 | addPass(new RegBankSelect()); |
795 | 5 | return false; |
796 | 5 | } |
797 | | |
798 | 5 | bool GCNPassConfig::addGlobalInstructionSelect() { |
799 | 5 | addPass(new InstructionSelect()); |
800 | 5 | return false; |
801 | 5 | } |
802 | | |
803 | 1.48k | void GCNPassConfig::addPreRegAlloc() { |
804 | 1.48k | if (LateCFGStructurize1.48k ) { |
805 | 0 | addPass(createAMDGPUMachineCFGStructurizerPass()); |
806 | 0 | } |
807 | 1.48k | addPass(createSIWholeQuadModePass()); |
808 | 1.48k | } |
809 | | |
/// Fast (-O0) register-allocation pipeline hook: splice GCN control-flow
/// lowering passes into the generic pipeline at precise anchor points, then
/// delegate to the base implementation. The trailing 'false' on insertPass
/// disables the machine verifier after the inserted pass.
void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}
825 | | |
/// Optimized register-allocation pipeline hook: like addFastRegAlloc, but
/// additionally schedules SIOptimizeExecMaskingPreRA right after the machine
/// scheduler, before the allocator runs.
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
840 | | |
/// Passes run immediately after register allocation: repair illegal VGPR
/// copies and optimize exec-mask manipulation before the generic post-RA
/// passes run.
void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}
846 | | |
/// Intentionally empty: GCN adds no passes before the second scheduling pass;
/// the override keeps the hook a no-op.
void GCNPassConfig::addPreSched2() {
}
849 | | |
850 | 1.48k | void GCNPassConfig::addPreEmitPass() { |
851 | 1.48k | // The hazard recognizer that runs as part of the post-ra scheduler does not |
852 | 1.48k | // guarantee to be able handle all hazards correctly. This is because if there |
853 | 1.48k | // are multiple scheduling regions in a basic block, the regions are scheduled |
854 | 1.48k | // bottom up, so when we begin to schedule a region we don't know what |
855 | 1.48k | // instructions were emitted directly before it. |
856 | 1.48k | // |
857 | 1.48k | // Here we add a stand-alone hazard recognizer pass which can handle all |
858 | 1.48k | // cases. |
859 | 1.48k | addPass(&PostRAHazardRecognizerID); |
860 | 1.48k | |
861 | 1.48k | if (EnableSIInsertWaitcntsPass) |
862 | 1.48k | addPass(createSIInsertWaitcntsPass()); |
863 | 1.48k | else |
864 | 0 | addPass(createSIInsertWaitsPass()); |
865 | 1.48k | addPass(createSIShrinkInstructionsPass()); |
866 | 1.48k | addPass(&SIInsertSkipsPassID); |
867 | 1.48k | addPass(createSIMemoryLegalizerPass()); |
868 | 1.48k | addPass(createSIDebuggerInsertNopsPass()); |
869 | 1.48k | addPass(&BranchRelaxationPassID); |
870 | 1.48k | } |
871 | | |
872 | 1.72k | TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { |
873 | 1.72k | return new GCNPassConfig(*this, PM); |
874 | 1.72k | } |
875 | | |