//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
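/// \file
/// Folds the "copy from exec, logical op, copy back to exec" sequences that
/// control flow lowering emits into single s_<op>_saveexec instructions, and
/// turns the *_term terminator pseudo-instructions back into their ordinary
/// forms once register allocation has placed any needed spill code.
//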
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"

namespace {

class SIOptimizeExecMasking : public MachineFunctionPass {
public:
  static char ID;

  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMasking::ID = 0;

char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;

/// If \p MI is a copy from exec, return the register copied to.
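/// Matches patterns such as "%sgpr0_sgpr1 = COPY %exec" (or %exec_lo in
/// wave32 mode); returns AMDGPU::NoRegister when \p MI is not such a copy.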
static unsigned isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() &&
        Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC))
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a copy to exec, return the register copied from.
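/// The *_term forms are deliberately not handled here: fixTerminators() has
/// already rewritten them by the time this is called, so hitting one is an
/// error.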
static unsigned isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() &&
        Dst.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) &&
        MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return AMDGPU::NoRegister;
}

/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
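/// Exec may appear as either source operand of the logical op, so both
/// operands are checked.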
static unsigned isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}

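/// Map a scalar logical opcode to its fused save-exec form, e.g. S_AND_B64 to
/// S_AND_SAVEEXEC_B64. Returns INSTRUCTION_LIST_END for opcodes that have no
/// such form.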
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}

// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions. Only one of
// these is expected per block.
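// e.g. S_MOV_B64_term becomes a plain COPY and S_XOR_B64_term becomes
// S_XOR_B64.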
static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    MI.setDesc(TII.get(AMDGPU::COPY));
    return true;
  case AMDGPU::S_XOR_B64_term:
    MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
    return true;
  case AMDGPU::S_XOR_B32_term:
    MI.setDesc(TII.get(AMDGPU::S_XOR_B32));
    return true;
  case AMDGPU::S_OR_B32_term:
    MI.setDesc(TII.get(AMDGPU::S_OR_B32));
    return true;
  case AMDGPU::S_ANDN2_B64_term:
    MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
    return true;
  case AMDGPU::S_ANDN2_B32_term:
    MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
    return true;
  default:
    return false;
  }
}

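// Turn the block's trailing terminator pseudos back into ordinary
// instructions and return the first instruction from the end that is not (or
// is no longer) a terminator, or MBB.rend() if there is none.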
static MachineBasicBlock::reverse_iterator fixTerminators(
  const SIInstrInfo &TII,
  MachineBasicBlock &MBB) {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return I;

    if (removeTerminatorBit(TII, *I))
      return I;
  }

  return E;
}

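/// Scan backwards from \p I for a copy from exec, giving up after a small,
/// fixed number of instructions. Returns MBB.rend() on failure.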
static MachineBasicBlock::reverse_iterator findExecCopy(
  const SIInstrInfo &TII,
  const GCNSubtarget &ST,
  MachineBasicBlock &MBB,
  MachineBasicBlock::reverse_iterator I,
  unsigned CopyToExec) {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    unsigned CopyFromExec = isCopyFromExec(*I, ST);
    if (CopyFromExec != AMDGPU::NoRegister)
      return I;
  }

  return E;
}

// XXX - LivePhysRegs seems not to work correctly here: it incorrectly reports
// the register as unavailable because a super-register with a lane mask is
// unavailable.
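/// Check whether \p Reg is live into any successor of \p MBB.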
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}

bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  // Optimize sequences emitted for control flow lowering. They are originally
  // emitted as separate operations because spill code may need to be
  // inserted for the saved copy of exec.
  //
  // x = copy exec
  // z = s_<op>_b64 x, y
  // exec = copy z
  // =>
  // x = s_<op>_saveexec_b64 y
  //
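  // With concrete (illustrative) registers, the rewrite looks like:
  //
  //   %sgpr0_sgpr1 = COPY %exec
  //   %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1, %vcc
  //   %exec = COPY %sgpr2_sgpr3
  // =>
  //   %sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 %vcc
  //
  // with any remaining uses of %sgpr2_sgpr3 rewritten to read %exec directly.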

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    unsigned CopyToExec = isCopyToExec(*I, ST);
    if (CopyToExec == AMDGPU::NoRegister)
      continue;

    // Scan backwards to find the def.
    auto CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(Exec);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

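    // Walk forward from the copy of exec to the copy back into exec, looking
    // for the single logical op that consumes the copied value. Any other def
    // of the destination, or any extra use of the copied exec value (such as
    // a spill), blocks the transformation.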
    for (MachineBasicBlock::iterator J
           = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        } else {
          LLVM_DEBUG(dbgs()
                     << "Instruction does not read exec copy: " << *J << '\n');
          break;
        }
      } else if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy, before it
        // will be rewritten by the saveexec, i.e. hasOneUse. There may have
        // been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
      .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

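    // exec now holds the result of the logical op, and the op's original
    // destination register is gone, so retarget any remaining readers of that
    // register to read exec instead.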
    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec,
                                    AMDGPU::NoSubRegister, *TRI);
    }
  }

  return true;
}