/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | /// \file |
/// \brief This pass inserts branches on the 0 exec mask over divergent
/// branches when it's expected that jumping over the untaken control flow
/// will be cheaper than having every workitem no-op through it.
14 | | // |
15 | | //===----------------------------------------------------------------------===// |
16 | | |
17 | | #include "AMDGPU.h" |
18 | | #include "AMDGPUSubtarget.h" |
19 | | #include "SIInstrInfo.h" |
20 | | #include "SIMachineFunctionInfo.h" |
21 | | #include "llvm/ADT/SmallVector.h" |
22 | | #include "llvm/ADT/StringRef.h" |
23 | | #include "llvm/CodeGen/MachineBasicBlock.h" |
24 | | #include "llvm/CodeGen/MachineFunction.h" |
25 | | #include "llvm/CodeGen/MachineFunctionPass.h" |
26 | | #include "llvm/CodeGen/MachineInstr.h" |
27 | | #include "llvm/CodeGen/MachineInstrBuilder.h" |
28 | | #include "llvm/CodeGen/MachineOperand.h" |
29 | | #include "llvm/IR/CallingConv.h" |
30 | | #include "llvm/IR/DebugLoc.h" |
31 | | #include "llvm/MC/MCAsmInfo.h" |
32 | | #include "llvm/Pass.h" |
33 | | #include "llvm/Support/CommandLine.h" |
34 | | #include "llvm/Target/TargetMachine.h" |
35 | | #include <cassert> |
36 | | #include <cstdint> |
37 | | #include <iterator> |
38 | | |
39 | | using namespace llvm; |
40 | | |
41 | | #define DEBUG_TYPE "si-insert-skips" |
42 | | |
// Command-line override for the skip threshold: the number of instructions a
// divergent region may contain before it becomes worthwhile to insert a
// branch over it rather than let every inactive lane no-op through it.
// Copied into SIInsertSkips::SkipThreshold at the start of each run.
static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);
47 | | |
namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr; // Cached in runOnMachineFunction.
  const SIInstrInfo *TII = nullptr;    // Cached in runOnMachineFunction.
  unsigned SkipThreshold = 0; // Instruction budget, from SkipThresholdFlag.

  // Estimate whether the code between block \p From and block \p To (in
  // layout order) is expensive enough, relative to SkipThreshold, that
  // branching over it beats executing it with all lanes disabled.
  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  // After a kill in an AMDGPU_PS function, insert an early-exit block
  // (null export + s_endpgm) that is branched over while exec is non-zero.
  // Returns true if the skip block was inserted.
  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  // Lower a SI_KILL_TERMINATOR pseudo into the exec-mask update it implies.
  void kill(MachineInstr &MI);

  // Create a new machine basic block immediately after \p MBB and make it a
  // successor of \p MBB.
  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  // Insert an s_cbranch_execz over the region covered by the given
  // SI_MASK_BRANCH when shouldSkip deems it profitable. Returns true if a
  // branch was inserted.
  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace
85 | | |
char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

// The address of ID serves as the pass's unique identity for the registry.
char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
92 | | |
93 | 2.44k | static bool opcodeEmitsNoInsts(unsigned Opc) { |
94 | 2.44k | switch (Opc) { |
95 | 47 | case TargetOpcode::IMPLICIT_DEF: |
96 | 47 | case TargetOpcode::KILL: |
97 | 47 | case TargetOpcode::BUNDLE: |
98 | 47 | case TargetOpcode::CFI_INSTRUCTION: |
99 | 47 | case TargetOpcode::EH_LABEL: |
100 | 47 | case TargetOpcode::GC_LABEL: |
101 | 47 | case TargetOpcode::DBG_VALUE: |
102 | 47 | return true; |
103 | 2.39k | default: |
104 | 2.39k | return false; |
105 | 0 | } |
106 | 0 | } |
107 | | |
108 | | bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, |
109 | 390 | const MachineBasicBlock &To) const { |
110 | 390 | if (From.succ_empty()) |
111 | 7 | return false; |
112 | 383 | |
113 | 383 | unsigned NumInstr = 0; |
114 | 383 | const MachineFunction *MF = From.getParent(); |
115 | 383 | |
116 | 383 | for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); |
117 | 744 | MBBI != End && 744 MBBI != ToI743 ; ++MBBI361 ) { |
118 | 437 | const MachineBasicBlock &MBB = *MBBI; |
119 | 437 | |
120 | 437 | for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); |
121 | 2.80k | NumInstr < SkipThreshold && 2.80k I != E2.80k ; ++I2.36k ) { |
122 | 2.44k | if (opcodeEmitsNoInsts(I->getOpcode())) |
123 | 47 | continue; |
124 | 2.39k | |
125 | 2.39k | // FIXME: Since this is required for correctness, this should be inserted |
126 | 2.39k | // during SILowerControlFlow. |
127 | 2.39k | |
128 | 2.39k | // When a uniform loop is inside non-uniform control flow, the branch |
129 | 2.39k | // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken |
130 | 2.39k | // when EXEC = 0. We should skip the loop lest it becomes infinite. |
131 | 2.39k | if (2.39k I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ || |
132 | 2.39k | I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) |
133 | 3 | return true; |
134 | 2.39k | |
135 | 2.39k | if (2.39k I->isInlineAsm()2.39k ) { |
136 | 13 | const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); |
137 | 13 | const char *AsmStr = I->getOperand(0).getSymbolName(); |
138 | 13 | |
139 | 13 | // inlineasm length estimate is number of bytes assuming the longest |
140 | 13 | // instruction. |
141 | 13 | uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI); |
142 | 13 | NumInstr += MaxAsmSize / MAI->getMaxInstLength(); |
143 | 2.39k | } else { |
144 | 2.37k | ++NumInstr; |
145 | 2.37k | } |
146 | 2.39k | |
147 | 2.39k | if (NumInstr >= SkipThreshold) |
148 | 73 | return true; |
149 | 2.44k | } |
150 | 437 | } |
151 | 383 | |
152 | 307 | return false; |
153 | 390 | } |
154 | | |
155 | 29 | bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { |
156 | 29 | MachineBasicBlock &MBB = *MI.getParent(); |
157 | 29 | MachineFunction *MF = MBB.getParent(); |
158 | 29 | |
159 | 29 | if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS || |
160 | 22 | !shouldSkip(MBB, MBB.getParent()->back())) |
161 | 25 | return false; |
162 | 4 | |
163 | 4 | MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator()); |
164 | 4 | |
165 | 4 | const DebugLoc &DL = MI.getDebugLoc(); |
166 | 4 | |
167 | 4 | // If the exec mask is non-zero, skip the next two instructions |
168 | 4 | BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) |
169 | 4 | .addMBB(&NextBB); |
170 | 4 | |
171 | 4 | MachineBasicBlock::iterator Insert = SkipBB->begin(); |
172 | 4 | |
173 | 4 | // Exec mask is zero: Export to NULL target... |
174 | 4 | BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE)) |
175 | 4 | .addImm(0x09) // V_008DFC_SQ_EXP_NULL |
176 | 4 | .addReg(AMDGPU::VGPR0, RegState::Undef) |
177 | 4 | .addReg(AMDGPU::VGPR0, RegState::Undef) |
178 | 4 | .addReg(AMDGPU::VGPR0, RegState::Undef) |
179 | 4 | .addReg(AMDGPU::VGPR0, RegState::Undef) |
180 | 4 | .addImm(1) // vm |
181 | 4 | .addImm(0) // compr |
182 | 4 | .addImm(0); // en |
183 | 4 | |
184 | 4 | // ... and terminate wavefront. |
185 | 4 | BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); |
186 | 4 | |
187 | 4 | return true; |
188 | 4 | } |
189 | | |
190 | 34 | void SIInsertSkips::kill(MachineInstr &MI) { |
191 | 34 | MachineBasicBlock &MBB = *MI.getParent(); |
192 | 34 | DebugLoc DL = MI.getDebugLoc(); |
193 | 34 | const MachineOperand &Op = MI.getOperand(0); |
194 | 34 | |
195 | | #ifndef NDEBUG |
196 | | CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv(); |
197 | | // Kill is only allowed in pixel / geometry shaders. |
198 | | assert(CallConv == CallingConv::AMDGPU_PS || |
199 | | CallConv == CallingConv::AMDGPU_GS); |
200 | | #endif |
201 | | // Clear this thread from the exec mask if the operand is negative. |
202 | 34 | if (Op.isImm()34 ) { |
203 | 14 | // Constant operand: Set exec mask to 0 or do nothing |
204 | 14 | if (Op.getImm() & 0x8000000014 ) { |
205 | 14 | BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) |
206 | 14 | .addImm(0); |
207 | 14 | } |
208 | 34 | } else { |
209 | 20 | BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) |
210 | 20 | .addImm(0) |
211 | 20 | .add(Op); |
212 | 20 | } |
213 | 34 | } |
214 | | |
215 | | MachineBasicBlock *SIInsertSkips::insertSkipBlock( |
216 | 4 | MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { |
217 | 4 | MachineFunction *MF = MBB.getParent(); |
218 | 4 | |
219 | 4 | MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock(); |
220 | 4 | MachineFunction::iterator MBBI(MBB); |
221 | 4 | ++MBBI; |
222 | 4 | |
223 | 4 | MF->insert(MBBI, SkipBB); |
224 | 4 | MBB.addSuccessor(SkipBB); |
225 | 4 | |
226 | 4 | return SkipBB; |
227 | 4 | } |
228 | | |
229 | | // Returns true if a branch over the block was inserted. |
230 | | bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, |
231 | 368 | MachineBasicBlock &SrcMBB) { |
232 | 368 | MachineBasicBlock *DestBB = MI.getOperand(0).getMBB(); |
233 | 368 | |
234 | 368 | if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB)) |
235 | 296 | return false; |
236 | 72 | |
237 | 72 | const DebugLoc &DL = MI.getDebugLoc(); |
238 | 72 | MachineBasicBlock::iterator InsPt = std::next(MI.getIterator()); |
239 | 72 | |
240 | 72 | BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) |
241 | 72 | .addMBB(DestBB); |
242 | 72 | |
243 | 72 | return true; |
244 | 72 | } |
245 | | |
// Walk every block of the function and lower the control-flow pseudos:
// SI_MASK_BRANCH gets an optional s_cbranch_execz skip, SI_KILL_TERMINATOR
// becomes an exec-mask update (plus an early-exit block in pixel shaders),
// redundant S_BRANCHes are removed, and SI_RETURN_TO_EPILOG is moved to the
// end of the function when needed. Returns true if anything changed.
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;

  // NextBB is precomputed (and refreshed after block insertion below) so the
  // walk survives blocks being added mid-iteration.
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;
    bool HaveSkipBlock = false;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }

    // Next is captured before processing so erasing MI is safe.
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          MI.eraseFromParent();
        } else if (HaveSkipBlock) {
          // Remove the given unconditional branch when a skip block has been
          // inserted after the current one and let skip the two instructions
          // performing the kill if the exec mask is non-zero.
          MI.eraseFromParent();
        }
        break;

      case AMDGPU::SI_KILL_TERMINATOR:
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          if (skipIfDead(MI, *NextBB)) {
            HaveSkipBlock = true;
            // A block was inserted after the current one: re-point NextBB at
            // it and refresh the end iterator so the outer loop visits it.
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else {
          // Inside divergent control flow; defer any skip handling until the
          // convergence point is reached.
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
          // the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;

      default:
        break;
      }
    }
  }

  return MadeChange;
}