Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass inserts branches on the 0 exec mask over divergent branches when
/// it's expected that jumping over the untaken control flow will be cheaper
/// than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "si-insert-skips"

static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);

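// Because the threshold is a (hidden) cl::opt, it can be tuned without
// rebuilding. A minimal sketch of overriding it (illustrative invocation;
// kernel.ll is a hypothetical input file):
//
//   llc -mtriple=amdgcn-- -amdgpu-skip-threshold=4 kernel.ll
//
// With a threshold of 4, any skipped region reaching four countable
// instructions is considered worth branching over.
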
namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void kill(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

  bool optimizeVccBranch(MachineInstr &MI) const;

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;

static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
  if (MI.isMetaInstruction())
    return true;

  // Handle target specific opcodes.
  switch (MI.getOpcode()) {
  case AMDGPU::SI_MASK_BRANCH:
    return true;
  default:
    return false;
  }
}

bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(*I))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it become infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      // These instructions are potentially expensive even if EXEC = 0.
      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
          I->getOpcode() == AMDGPU::S_WAITCNT)
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

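// A worked instance of the heuristic above (illustrative, using the default
// SkipThreshold of 12): a hammock whose skipped side holds three plain VALU
// adds never reaches the threshold, so shouldSkip returns false and inactive
// lanes simply no-op through it; the same hammock containing a single VMEM
// load returns true immediately, because memory instructions cost latency
// even when EXEC = 0.
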
bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(MBB, MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addImm(1)  // vm
    .addImm(0)  // compr
    .addImm(0); // en

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);

  return true;
}

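// Schematically, the sequence built above renders roughly as the following
// assembly (block label is illustrative):
//
//   s_cbranch_execnz BB_NEXT              ; some lanes still live: carry on
//   exp null off, off, off, off done vm   ; all lanes dead: export to NULL...
//   s_endpgm                              ; ...and terminate the wavefront
// BB_NEXT:
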
void SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }

    const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
    if (ST.hasNoSdstCMPX())
      Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);

    assert(MI.getOperand(0).isReg());

    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
    } else {
      auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
      if (!ST.hasNoSdstCMPX())
        I.addReg(AMDGPU::VCC, RegState::Define);

      I.addImm(0)  // src0 modifiers
        .add(MI.getOperand(1))
        .addImm(0)  // src1 modifiers
        .add(MI.getOperand(0));

      I.addImm(0);  // omod
    }
    break;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);

    // Kill all threads if Op0 is an immediate and equal to the Kill value.
    if (Op.isImm()) {
      int64_t Imm = Op.getImm();
      assert(Imm == 0 || Imm == -1);

      if (Imm == KillVal)
        BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
                                                     : AMDGPU::S_MOV_B64), Exec)
          .addImm(0);
      break;
    }

    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    if (ST.isWave32())
      Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
        .addReg(Exec)
        .add(Op);
    break;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}

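// Example of the operand inversion noted above (schematic; register names are
// illustrative): a pixel-shader kill on "x < 2.0" arrives with cond = SETOLT
// and a VGPR source, so the VOP32 path emits
//
//   v_cmpx_gt_f32_e32 vcc, 2.0, v0   ; "2.0 > x" is "x < 2.0" with imm first
//
// writing the surviving lanes directly into EXEC (and VCC on subtargets where
// v_cmpx still has an SGPR destination).
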
MachineBasicBlock *SIInsertSkips::insertSkipBlock(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);
  MBB.addSuccessor(SkipBB);

  return SkipBB;
}

// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();

  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addMBB(DestBB);

  return true;
}

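// In MIR terms the transformation is purely additive (block name
// illustrative): SI_MASK_BRANCH itself emits no machine code, so when the
// region is expensive enough a real branch is planted right behind it:
//
//   SI_MASK_BRANCH %bb.flow
//   =>
//   SI_MASK_BRANCH %bb.flow
//   S_CBRANCH_EXECZ %bb.flow
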
bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1
  // vcc = S_AND_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A ; A != E ; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && Op2.getImm() != -1)
    return Changed;

  unsigned SReg = AMDGPU::NoRegister;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    for ( ; M != E ; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E ||
        !M->isMoveImmediate() ||
        !M->getOperand(1).isImm() ||
        M->getOperand(1).getImm() != -1)
      return Changed;
    // If sreg is only used by the AND instruction, fold the immediate into
    // the AND and delete the move.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(-1);
      M->eraseFromParent();
    }
  }

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
      MI.killsRegister(CondReg, TRI))
    A->eraseFromParent();

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else {
    MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
                               : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

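// A concrete instance of the pattern matched above (register names
// illustrative):
//
//   s_mov_b64        s[0:1], -1
//   s_and_b64        vcc, exec, s[0:1]
//   s_cbranch_vccnz  BB1
//   =>
//   s_cbranch_execnz BB1
//
// The mov and the and are erased as well when s[0:1] and vcc have no other
// uses. In the degenerate case where the mask is exec itself, VCCNZ degrades
// to an unconditional s_branch and VCCZ deletes the branch outright.
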
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;
    bool HaveSkipBlock = false;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }

    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          MI.eraseFromParent();
        } else if (HaveSkipBlock) {
          // A skip block has been inserted after the current block; remove
          // the now-redundant unconditional branch so that, when the exec
          // mask is non-zero, control skips past the two instructions
          // performing the kill.
          MI.eraseFromParent();
        }
        break;

      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR:
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          if (NextBB != BE && skipIfDead(MI, *NextBB)) {
            HaveSkipBlock = true;
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else {
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty
          // block at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;

      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        MadeChange |= optimizeVccBranch(MI);
        break;

      default:
        break;
      }
    }
  }

  return MadeChange;
}
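
// Pipeline note (not part of this file): the pass is registered via
// INITIALIZE_PASS above and exposed through llvm::SIInsertSkipsPassID, so the
// AMDGPU target presumably schedules it by ID late in codegen, after control
// flow has been lowered to SI_MASK_BRANCH / SI_KILL_* pseudos (see the FIXME
// in shouldSkip about SILowerControlFlow).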