Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
2
//
3
//                     The LLVM Compiler Infrastructure
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
/// \file
11
/// \brief This pass inserts branches on the 0 exec mask over divergent branches
12
/// branches when it's expected that jumping over the untaken control flow will
13
/// be cheaper than having every workitem no-op through it.
14
//
15
//===----------------------------------------------------------------------===//
16
17
#include "AMDGPU.h"
18
#include "AMDGPUSubtarget.h"
19
#include "SIInstrInfo.h"
20
#include "SIMachineFunctionInfo.h"
21
#include "llvm/ADT/SmallVector.h"
22
#include "llvm/ADT/StringRef.h"
23
#include "llvm/CodeGen/MachineBasicBlock.h"
24
#include "llvm/CodeGen/MachineFunction.h"
25
#include "llvm/CodeGen/MachineFunctionPass.h"
26
#include "llvm/CodeGen/MachineInstr.h"
27
#include "llvm/CodeGen/MachineInstrBuilder.h"
28
#include "llvm/CodeGen/MachineOperand.h"
29
#include "llvm/IR/CallingConv.h"
30
#include "llvm/IR/DebugLoc.h"
31
#include "llvm/MC/MCAsmInfo.h"
32
#include "llvm/Pass.h"
33
#include "llvm/Support/CommandLine.h"
34
#include "llvm/Target/TargetMachine.h"
35
#include <cassert>
36
#include <cstdint>
37
#include <iterator>
38
39
using namespace llvm;
40
41
#define DEBUG_TYPE "si-insert-skips"
42
43
// Command-line knob: how many (estimated) instructions a divergent region may
// contain before it is worth inserting an s_cbranch_execz to jump over it.
static cl::opt<unsigned> SkipThresholdFlag(
    "amdgpu-skip-threshold",
    cl::desc("Number of instructions before jumping over divergent control flow"),
    cl::init(12), cl::Hidden);
47
48
namespace {
49
50
class SIInsertSkips : public MachineFunctionPass {
51
private:
52
  const SIRegisterInfo *TRI = nullptr;
53
  const SIInstrInfo *TII = nullptr;
54
  unsigned SkipThreshold = 0;
55
56
  bool shouldSkip(const MachineBasicBlock &From,
57
                  const MachineBasicBlock &To) const;
58
59
  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
60
61
  void kill(MachineInstr &MI);
62
63
  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
64
                                     MachineBasicBlock::iterator I) const;
65
66
  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
67
68
public:
69
  static char ID;
70
71
1.48k
  SIInsertSkips() : MachineFunctionPass(ID) {}
72
73
  bool runOnMachineFunction(MachineFunction &MF) override;
74
75
1.47k
  StringRef getPassName() const override {
76
1.47k
    return "SI insert s_cbranch_execz instructions";
77
1.47k
  }
78
79
1.47k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
80
1.47k
    MachineFunctionPass::getAnalysisUsage(AU);
81
1.47k
  }
82
};
83
84
} // end anonymous namespace
85
86
char SIInsertSkips::ID = 0;
87
88
INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
89
                "SI insert s_cbranch_execz instructions", false, false)
90
91
char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
92
93
2.44k
static bool opcodeEmitsNoInsts(unsigned Opc) {
94
2.44k
  switch (Opc) {
95
47
  case TargetOpcode::IMPLICIT_DEF:
96
47
  case TargetOpcode::KILL:
97
47
  case TargetOpcode::BUNDLE:
98
47
  case TargetOpcode::CFI_INSTRUCTION:
99
47
  case TargetOpcode::EH_LABEL:
100
47
  case TargetOpcode::GC_LABEL:
101
47
  case TargetOpcode::DBG_VALUE:
102
47
    return true;
103
2.39k
  default:
104
2.39k
    return false;
105
0
  }
106
0
}
107
108
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
109
390
                               const MachineBasicBlock &To) const {
110
390
  if (From.succ_empty())
111
7
    return false;
112
383
113
383
  unsigned NumInstr = 0;
114
383
  const MachineFunction *MF = From.getParent();
115
383
116
383
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
117
744
       
MBBI != End && 744
MBBI != ToI743
;
++MBBI361
) {
118
437
    const MachineBasicBlock &MBB = *MBBI;
119
437
120
437
    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
121
2.80k
         
NumInstr < SkipThreshold && 2.80k
I != E2.80k
;
++I2.36k
) {
122
2.44k
      if (opcodeEmitsNoInsts(I->getOpcode()))
123
47
        continue;
124
2.39k
125
2.39k
      // FIXME: Since this is required for correctness, this should be inserted
126
2.39k
      // during SILowerControlFlow.
127
2.39k
128
2.39k
      // When a uniform loop is inside non-uniform control flow, the branch
129
2.39k
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
130
2.39k
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
131
2.39k
      
if (2.39k
I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
132
2.39k
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
133
3
        return true;
134
2.39k
135
2.39k
      
if (2.39k
I->isInlineAsm()2.39k
) {
136
13
        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
137
13
        const char *AsmStr = I->getOperand(0).getSymbolName();
138
13
139
13
        // inlineasm length estimate is number of bytes assuming the longest
140
13
        // instruction.
141
13
        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
142
13
        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
143
2.39k
      } else {
144
2.37k
        ++NumInstr;
145
2.37k
      }
146
2.39k
147
2.39k
      if (NumInstr >= SkipThreshold)
148
73
        return true;
149
2.44k
    }
150
437
  }
151
383
152
307
  return false;
153
390
}
154
155
29
bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
156
29
  MachineBasicBlock &MBB = *MI.getParent();
157
29
  MachineFunction *MF = MBB.getParent();
158
29
159
29
  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
160
22
      !shouldSkip(MBB, MBB.getParent()->back()))
161
25
    return false;
162
4
163
4
  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
164
4
165
4
  const DebugLoc &DL = MI.getDebugLoc();
166
4
167
4
  // If the exec mask is non-zero, skip the next two instructions
168
4
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
169
4
    .addMBB(&NextBB);
170
4
171
4
  MachineBasicBlock::iterator Insert = SkipBB->begin();
172
4
173
4
  // Exec mask is zero: Export to NULL target...
174
4
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
175
4
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
176
4
    .addReg(AMDGPU::VGPR0, RegState::Undef)
177
4
    .addReg(AMDGPU::VGPR0, RegState::Undef)
178
4
    .addReg(AMDGPU::VGPR0, RegState::Undef)
179
4
    .addReg(AMDGPU::VGPR0, RegState::Undef)
180
4
    .addImm(1)  // vm
181
4
    .addImm(0)  // compr
182
4
    .addImm(0); // en
183
4
184
4
  // ... and terminate wavefront.
185
4
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
186
4
187
4
  return true;
188
4
}
189
190
34
void SIInsertSkips::kill(MachineInstr &MI) {
191
34
  MachineBasicBlock &MBB = *MI.getParent();
192
34
  DebugLoc DL = MI.getDebugLoc();
193
34
  const MachineOperand &Op = MI.getOperand(0);
194
34
195
#ifndef NDEBUG
196
  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
197
  // Kill is only allowed in pixel / geometry shaders.
198
  assert(CallConv == CallingConv::AMDGPU_PS ||
199
         CallConv == CallingConv::AMDGPU_GS);
200
#endif
201
  // Clear this thread from the exec mask if the operand is negative.
202
34
  if (
Op.isImm()34
) {
203
14
    // Constant operand: Set exec mask to 0 or do nothing
204
14
    if (
Op.getImm() & 0x8000000014
) {
205
14
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
206
14
        .addImm(0);
207
14
    }
208
34
  } else {
209
20
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
210
20
        .addImm(0)
211
20
        .add(Op);
212
20
  }
213
34
}
214
215
/// Create a fresh basic block, place it immediately after MBB in layout order,
/// and record it as a successor of MBB. The iterator parameter is currently
/// unused but kept for interface stability.
MachineBasicBlock *SIInsertSkips::insertSkipBlock(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();

  // Insert position: one past MBB in the function's block list.
  MachineFunction::iterator InsertPos(MBB);
  ++InsertPos;
  MF->insert(InsertPos, SkipBB);

  MBB.addSuccessor(SkipBB);
  return SkipBB;
}
228
229
// Returns true if a branch over the block was inserted.
230
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
231
368
                                   MachineBasicBlock &SrcMBB) {
232
368
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
233
368
234
368
  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
235
296
    return false;
236
72
237
72
  const DebugLoc &DL = MI.getDebugLoc();
238
72
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
239
72
240
72
  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
241
72
    .addMBB(DestBB);
242
72
243
72
  return true;
244
72
}
245
246
15.0k
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
247
15.0k
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
248
15.0k
  TII = ST.getInstrInfo();
249
15.0k
  TRI = &TII->getRegisterInfo();
250
15.0k
  SkipThreshold = SkipThresholdFlag;
251
15.0k
252
15.0k
  bool HaveKill = false;
253
15.0k
  bool MadeChange = false;
254
15.0k
255
15.0k
  // Track depth of exec mask, divergent branches.
256
15.0k
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
257
15.0k
258
15.0k
  MachineFunction::iterator NextBB;
259
15.0k
260
15.0k
  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
261
15.0k
262
15.0k
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
263
32.0k
       
BI != BE32.0k
;
BI = NextBB16.9k
) {
264
16.9k
    NextBB = std::next(BI);
265
16.9k
    MachineBasicBlock &MBB = *BI;
266
16.9k
    bool HaveSkipBlock = false;
267
16.9k
268
16.9k
    if (
!ExecBranchStack.empty() && 16.9k
ExecBranchStack.back() == &MBB754
) {
269
364
      // Reached convergence point for last divergent branch.
270
364
      ExecBranchStack.pop_back();
271
364
    }
272
16.9k
273
16.9k
    if (
HaveKill && 16.9k
ExecBranchStack.empty()7
) {
274
5
      HaveKill = false;
275
5
276
5
      // TODO: Insert skip if exec is 0?
277
5
    }
278
16.9k
279
16.9k
    MachineBasicBlock::iterator I, Next;
280
319k
    for (I = MBB.begin(); 
I != MBB.end()319k
;
I = Next302k
) {
281
302k
      Next = std::next(I);
282
302k
283
302k
      MachineInstr &MI = *I;
284
302k
285
302k
      switch (MI.getOpcode()) {
286
368
      case AMDGPU::SI_MASK_BRANCH:
287
368
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
288
368
        MadeChange |= skipMaskBranch(MI, MBB);
289
368
        break;
290
302k
291
602
      case AMDGPU::S_BRANCH:
292
602
        // Optimize out branches to the next block.
293
602
        // FIXME: Shouldn't this be handled by BranchFolding?
294
602
        if (
MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())602
) {
295
435
          MI.eraseFromParent();
296
602
        } else 
if (167
HaveSkipBlock167
) {
297
1
          // Remove the given unconditional branch when a skip block has been
298
1
          // inserted after the current one and let skip the two instructions
299
1
          // performing the kill if the exec mask is non-zero.
300
1
          MI.eraseFromParent();
301
1
        }
302
602
        break;
303
302k
304
34
      case AMDGPU::SI_KILL_TERMINATOR:
305
34
        MadeChange = true;
306
34
        kill(MI);
307
34
308
34
        if (
ExecBranchStack.empty()34
) {
309
29
          if (
skipIfDead(MI, *NextBB)29
) {
310
4
            HaveSkipBlock = true;
311
4
            NextBB = std::next(BI);
312
4
            BE = MF.end();
313
4
          }
314
34
        } else {
315
5
          HaveKill = true;
316
5
        }
317
34
318
34
        MI.eraseFromParent();
319
34
        break;
320
302k
321
263
      case AMDGPU::SI_RETURN_TO_EPILOG:
322
263
        // FIXME: Should move somewhere else
323
263
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
324
263
325
263
        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
326
263
        // because external bytecode will be appended at the end.
327
263
        if (
BI != --MF.end() || 263
I != MBB.getFirstTerminator()259
) {
328
4
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
329
4
          // the end and jump there.
330
4
          if (
!EmptyMBBAtEnd4
) {
331
4
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
332
4
            MF.insert(MF.end(), EmptyMBBAtEnd);
333
4
          }
334
4
335
4
          MBB.addSuccessor(EmptyMBBAtEnd);
336
4
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
337
4
            .addMBB(EmptyMBBAtEnd);
338
4
          I->eraseFromParent();
339
4
        }
340
263
        break;
341
302k
342
301k
      default:
343
301k
        break;
344
302k
      }
345
302k
    }
346
16.9k
  }
347
15.0k
348
15.0k
  return MadeChange;
349
15.0k
}