Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file
10
/// This pass removes redundant S_OR_B64 instructions enabling lanes in
11
/// the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any
12
/// vector instructions between them we can only keep outer SI_END_CF, given
13
/// that CFG is structured and exec bits of the outer end statement are always
14
/// not less than the exec bits of the inner one.
15
///
16
/// This needs to be done before the RA to eliminate saved exec bits registers
17
/// but after register coalescer to have no vector register copies in between
18
/// of different end cf statements.
19
///
20
//===----------------------------------------------------------------------===//
21
22
#include "AMDGPU.h"
23
#include "AMDGPUSubtarget.h"
24
#include "SIInstrInfo.h"
25
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
26
#include "llvm/CodeGen/LiveIntervals.h"
27
#include "llvm/CodeGen/MachineFunctionPass.h"
28
29
using namespace llvm;
30
31
#define DEBUG_TYPE "si-optimize-exec-masking-pre-ra"
32
33
namespace {
34
35
class SIOptimizeExecMaskingPreRA : public MachineFunctionPass {
36
private:
37
  const SIRegisterInfo *TRI;
38
  const SIInstrInfo *TII;
39
  MachineRegisterInfo *MRI;
40
41
public:
42
  MachineBasicBlock::iterator skipIgnoreExecInsts(
43
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const;
44
45
    MachineBasicBlock::iterator skipIgnoreExecInstsTrivialSucc(
46
      MachineBasicBlock *&MBB,
47
      MachineBasicBlock::iterator It) const;
48
49
public:
50
  static char ID;
51
52
2.37k
  SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) {
53
2.37k
    initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
54
2.37k
  }
55
56
  bool runOnMachineFunction(MachineFunction &MF) override;
57
58
27.6k
  StringRef getPassName() const override {
59
27.6k
    return "SI optimize exec mask operations pre-RA";
60
27.6k
  }
61
62
2.37k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
63
2.37k
    AU.addRequired<LiveIntervals>();
64
2.37k
    AU.setPreservesAll();
65
2.37k
    MachineFunctionPass::getAnalysisUsage(AU);
66
2.37k
  }
67
};
68
69
} // End anonymous namespace.
70
71
101k
INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
72
101k
                      "SI optimize exec mask operations pre-RA", false, false)
73
101k
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
74
101k
INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
75
                    "SI optimize exec mask operations pre-RA", false, false)
76
77
char SIOptimizeExecMaskingPreRA::ID = 0;
78
79
char &llvm::SIOptimizeExecMaskingPreRAID = SIOptimizeExecMaskingPreRA::ID;
80
81
0
FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
82
0
  return new SIOptimizeExecMaskingPreRA();
83
0
}
84
85
static bool isEndCF(const MachineInstr &MI, const SIRegisterInfo *TRI,
86
2.01k
                    const GCNSubtarget &ST) {
87
2.01k
  if (ST.isWave32()) {
88
153
    return MI.getOpcode() == AMDGPU::S_OR_B32 &&
89
153
           
MI.modifiesRegister(AMDGPU::EXEC_LO, TRI)10
;
90
153
  }
91
1.86k
92
1.86k
  return MI.getOpcode() == AMDGPU::S_OR_B64 &&
93
1.86k
         
MI.modifiesRegister(AMDGPU::EXEC, TRI)154
;
94
1.86k
}
95
96
20
static bool isFullExecCopy(const MachineInstr& MI, const GCNSubtarget& ST) {
97
20
  unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO0
: AMDGPU::EXEC;
98
20
99
20
  if (MI.isCopy() && 
MI.getOperand(1).getReg() == Exec17
) {
100
17
    assert(MI.isFullCopy());
101
17
    return true;
102
17
  }
103
3
104
3
  return false;
105
3
}
106
107
static unsigned getOrNonExecReg(const MachineInstr &MI,
108
                                const SIInstrInfo &TII,
109
29
                                const GCNSubtarget& ST) {
110
29
  unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO0
: AMDGPU::EXEC;
111
29
  auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1);
112
29
  if (Op->isReg() && Op->getReg() != Exec)
113
29
     return Op->getReg();
114
0
  Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
115
0
  if (Op->isReg() && Op->getReg() != Exec)
116
0
     return Op->getReg();
117
0
  return AMDGPU::NoRegister;
118
0
}
119
120
static MachineInstr* getOrExecSource(const MachineInstr &MI,
121
                                     const SIInstrInfo &TII,
122
                                     const MachineRegisterInfo &MRI,
123
20
                                     const GCNSubtarget& ST) {
124
20
  auto SavedExec = getOrNonExecReg(MI, TII, ST);
125
20
  if (SavedExec == AMDGPU::NoRegister)
126
0
    return nullptr;
127
20
  auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec);
128
20
  if (!SaveExecInst || !isFullExecCopy(*SaveExecInst, ST))
129
3
    return nullptr;
130
17
  return SaveExecInst;
131
17
}
132
133
/// Skip over instructions that don't care about the exec mask.
134
MachineBasicBlock::iterator SIOptimizeExecMaskingPreRA::skipIgnoreExecInsts(
135
170
  MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const {
136
358
  for ( ; I != E; 
++I188
) {
137
288
    if (TII->mayReadEXEC(*MRI, *I))
138
100
      break;
139
288
  }
140
170
141
170
  return I;
142
170
}
143
144
// Skip to the next instruction, ignoring debug instructions, and trivial block
145
// boundaries (blocks that have one (typically fallthrough) successor, and the
146
// successor has one predecessor).
147
MachineBasicBlock::iterator
148
SIOptimizeExecMaskingPreRA::skipIgnoreExecInstsTrivialSucc(
149
  MachineBasicBlock *&MBB,
150
150
  MachineBasicBlock::iterator It) const {
151
150
152
170
  do {
153
170
    It = skipIgnoreExecInsts(It, MBB->end());
154
170
    if (It != MBB->end() || 
MBB->succ_size() != 170
)
155
105
      break;
156
65
157
65
    // If there is one trivial successor, advance to the next block.
158
65
    MachineBasicBlock *Succ = *MBB->succ_begin();
159
65
160
65
    // TODO: Is this really necessary?
161
65
    if (!MBB->isLayoutSuccessor(Succ))
162
45
      break;
163
20
164
20
    It = Succ->begin();
165
20
    MBB = Succ;
166
20
  } while (true);
167
150
168
150
  return It;
169
150
}
170
171
172
// Optimize sequence
173
//    %sel = V_CNDMASK_B32_e64 0, 1, %cc
174
//    %cmp = V_CMP_NE_U32 1, %1
175
//    $vcc = S_AND_B64 $exec, %cmp
176
//    S_CBRANCH_VCC[N]Z
177
// =>
178
//    $vcc = S_ANDN2_B64 $exec, %cc
179
//    S_CBRANCH_VCC[N]Z
180
//
181
// It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the
182
// rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but
183
// only 3 first instructions are really needed. S_AND_B64 with exec is a
184
// required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
185
// lanes.
186
//
187
// Returns %cc register on success.
188
static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
189
                                     const GCNSubtarget &ST,
190
                                     MachineRegisterInfo &MRI,
191
28.8k
                                     LiveIntervals *LIS) {
192
28.8k
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
193
28.8k
  const SIInstrInfo *TII = ST.getInstrInfo();
194
28.8k
  bool Wave32 = ST.isWave32();
195
28.8k
  const unsigned AndOpc = Wave32 ? 
AMDGPU::S_AND_B322.20k
:
AMDGPU::S_AND_B6426.6k
;
196
28.8k
  const unsigned Andn2Opc = Wave32 ? 
AMDGPU::S_ANDN2_B322.20k
:
AMDGPU::S_ANDN2_B6426.6k
;
197
28.8k
  const unsigned CondReg = Wave32 ? 
AMDGPU::VCC_LO2.20k
:
AMDGPU::VCC26.6k
;
198
28.8k
  const unsigned ExecReg = Wave32 ? 
AMDGPU::EXEC_LO2.20k
:
AMDGPU::EXEC26.6k
;
199
28.8k
200
29.4k
  auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
201
29.4k
                           unsigned Opc = MI.getOpcode();
202
29.4k
                           return Opc == AMDGPU::S_CBRANCH_VCCZ ||
203
29.4k
                                  
Opc == AMDGPU::S_CBRANCH_VCCNZ29.4k
; });
204
28.8k
  if (I == MBB.terminators().end())
205
28.5k
    return AMDGPU::NoRegister;
206
250
207
250
  auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister,
208
250
                                   *I, MRI, LIS);
209
250
  if (!And || 
And->getOpcode() != AndOpc249
||
210
250
      
!And->getOperand(1).isReg()123
||
!And->getOperand(2).isReg()123
)
211
127
    return AMDGPU::NoRegister;
212
123
213
123
  MachineOperand *AndCC = &And->getOperand(1);
214
123
  unsigned CmpReg = AndCC->getReg();
215
123
  unsigned CmpSubReg = AndCC->getSubReg();
216
123
  if (CmpReg == ExecReg) {
217
103
    AndCC = &And->getOperand(2);
218
103
    CmpReg = AndCC->getReg();
219
103
    CmpSubReg = AndCC->getSubReg();
220
103
  } else 
if (20
And->getOperand(2).getReg() != ExecReg20
) {
221
0
    return AMDGPU::NoRegister;
222
0
  }
223
123
224
123
  auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, MRI, LIS);
225
123
  if (!Cmp || 
!(120
Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32120
||
226
120
                
Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e6482
) ||
227
123
      
Cmp->getParent() != And->getParent()56
)
228
68
    return AMDGPU::NoRegister;
229
55
230
55
  MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0);
231
55
  MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1);
232
55
  if (Op1->isImm() && 
Op2->isReg()36
)
233
36
    std::swap(Op1, Op2);
234
55
  if (!Op1->isReg() || !Op2->isImm() || 
Op2->getImm() != 153
)
235
16
    return AMDGPU::NoRegister;
236
39
237
39
  unsigned SelReg = Op1->getReg();
238
39
  auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS);
239
39
  if (!Sel || 
Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e6437
)
240
10
    return AMDGPU::NoRegister;
241
29
242
29
  if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) ||
243
29
      TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers))
244
0
    return AMDGPU::NoRegister;
245
29
246
29
  Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
247
29
  Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
248
29
  MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
249
29
  if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
250
29
      Op1->getImm() != 0 || Op2->getImm() != 1)
251
0
    return AMDGPU::NoRegister;
252
29
253
29
  LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t'
254
29
                    << *Cmp << '\t' << *And);
255
29
256
29
  unsigned CCReg = CC->getReg();
257
29
  LIS->RemoveMachineInstrFromMaps(*And);
258
29
  MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(),
259
29
                                TII->get(Andn2Opc), And->getOperand(0).getReg())
260
29
                            .addReg(ExecReg)
261
29
                            .addReg(CCReg, 0, CC->getSubReg());
262
29
  And->eraseFromParent();
263
29
  LIS->InsertMachineInstrInMaps(*Andn2);
264
29
265
29
  LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');
266
29
267
29
  // Try to remove compare. The Cmp value should not be used between the cmp
268
29
  // and s_and_b64 if VCC or just unused if any other register.
269
29
  if ((TargetRegisterInfo::isVirtualRegister(CmpReg) &&
270
29
       
MRI.use_nodbg_empty(CmpReg)9
) ||
271
29
      
(20
CmpReg == CondReg20
&&
272
20
       std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
273
20
                    [&](const MachineInstr &MI) {
274
27
                      return MI.readsRegister(CondReg, TRI); }))) {
275
27
    LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
276
27
277
27
    LIS->RemoveMachineInstrFromMaps(*Cmp);
278
27
    Cmp->eraseFromParent();
279
27
280
27
    // Try to remove v_cndmask_b32.
281
27
    if (TargetRegisterInfo::isVirtualRegister(SelReg) &&
282
27
        MRI.use_nodbg_empty(SelReg)) {
283
25
      LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
284
25
285
25
      LIS->RemoveMachineInstrFromMaps(*Sel);
286
25
      Sel->eraseFromParent();
287
25
    }
288
27
  }
289
29
290
29
  return CCReg;
291
29
}
292
293
25.2k
bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
294
25.2k
  if (skipFunction(MF.getFunction()))
295
8
    return false;
296
25.2k
297
25.2k
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
298
25.2k
  TRI = ST.getRegisterInfo();
299
25.2k
  TII = ST.getInstrInfo();
300
25.2k
  MRI = &MF.getRegInfo();
301
25.2k
302
25.2k
  MachineRegisterInfo &MRI = MF.getRegInfo();
303
25.2k
  LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
304
25.2k
  DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
305
25.2k
  unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO1.93k
:
AMDGPU::EXEC23.3k
;
306
25.2k
  bool Changed = false;
307
25.2k
308
28.8k
  for (MachineBasicBlock &MBB : MF) {
309
28.8k
310
28.8k
    if (unsigned Reg = optimizeVcndVcmpPair(MBB, ST, MRI, LIS)) {
311
29
      RecalcRegs.insert(Reg);
312
29
      RecalcRegs.insert(AMDGPU::VCC_LO);
313
29
      RecalcRegs.insert(AMDGPU::VCC_HI);
314
29
      RecalcRegs.insert(AMDGPU::SCC);
315
29
      Changed = true;
316
29
    }
317
28.8k
318
28.8k
    // Try to remove unneeded instructions before s_endpgm.
319
28.8k
    if (MBB.succ_empty()) {
320
25.3k
      if (MBB.empty())
321
32
        continue;
322
25.2k
323
25.2k
      // Skip this if the endpgm has any implicit uses, otherwise we would need
324
25.2k
      // to be careful to update / remove them.
325
25.2k
      // S_ENDPGM always has a single imm operand that is not used other than to
326
25.2k
      // end up in the encoding
327
25.2k
      MachineInstr &Term = MBB.back();
328
25.2k
      if (Term.getOpcode() != AMDGPU::S_ENDPGM || 
Term.getNumOperands() != 121.0k
)
329
4.27k
        continue;
330
21.0k
331
21.0k
      SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});
332
21.0k
333
42.4k
      while (!Blocks.empty()) {
334
21.3k
        auto CurBB = Blocks.pop_back_val();
335
21.3k
        auto I = CurBB->rbegin(), E = CurBB->rend();
336
21.3k
        if (I != E) {
337
21.3k
          if (I->isUnconditionalBranch() || 
I->getOpcode() == AMDGPU::S_ENDPGM21.3k
)
338
21.0k
            ++I;
339
311
          else if (I->isBranch())
340
0
            continue;
341
21.3k
        }
342
21.3k
343
21.9k
        
while (21.3k
I != E) {
344
20.6k
          if (I->isDebugInstr()) {
345
1
            I = std::next(I);
346
1
            continue;
347
1
          }
348
20.6k
349
20.6k
          if (I->mayStore() || 
I->isBarrier()2.12k
||
I->isCall()2.12k
||
350
20.6k
              
I->hasUnmodeledSideEffects()2.12k
||
I->hasOrderedMemoryRef()589
)
351
20.1k
            break;
352
519
353
519
          LLVM_DEBUG(dbgs()
354
519
                     << "Removing no effect instruction: " << *I << '\n');
355
519
356
1.02k
          for (auto &Op : I->operands()) {
357
1.02k
            if (Op.isReg())
358
992
              RecalcRegs.insert(Op.getReg());
359
1.02k
          }
360
519
361
519
          auto Next = std::next(I);
362
519
          LIS->RemoveMachineInstrFromMaps(*I);
363
519
          I->eraseFromParent();
364
519
          I = Next;
365
519
366
519
          Changed = true;
367
519
        }
368
21.3k
369
21.3k
        if (I != E)
370
20.1k
          continue;
371
1.26k
372
1.26k
        // Try to ascend predecessors.
373
1.26k
        for (auto *Pred : CurBB->predecessors()) {
374
786
          if (Pred->succ_size() == 1)
375
375
            Blocks.push_back(Pred);
376
786
        }
377
1.26k
      }
378
21.0k
      continue;
379
21.0k
    }
380
3.52k
381
3.52k
    // Try to collapse adjacent endifs.
382
3.52k
    auto E = MBB.end();
383
3.52k
    auto Lead = skipDebugInstructionsForward(MBB.begin(), E);
384
3.52k
    if (MBB.succ_size() != 1 || 
Lead == E1.94k
||
!isEndCF(*Lead, TRI, ST)1.91k
)
385
3.37k
      continue;
386
150
387
150
    MachineBasicBlock *TmpMBB = &MBB;
388
150
    auto NextLead = skipIgnoreExecInstsTrivialSucc(TmpMBB, std::next(Lead));
389
150
    if (NextLead == TmpMBB->end() || 
!isEndCF(*NextLead, TRI, ST)100
||
390
150
        
!getOrExecSource(*NextLead, *TII, MRI, ST)11
)
391
141
      continue;
392
9
393
9
    LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
394
9
395
9
    auto SaveExec = getOrExecSource(*Lead, *TII, MRI, ST);
396
9
    unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII, ST);
397
36
    for (auto &Op : Lead->operands()) {
398
36
      if (Op.isReg())
399
36
        RecalcRegs.insert(Op.getReg());
400
36
    }
401
9
402
9
    LIS->RemoveMachineInstrFromMaps(*Lead);
403
9
    Lead->eraseFromParent();
404
9
    if (SaveExecReg) {
405
9
      LIS->removeInterval(SaveExecReg);
406
9
      LIS->createAndComputeVirtRegInterval(SaveExecReg);
407
9
    }
408
9
409
9
    Changed = true;
410
9
411
9
    // If the only use of saved exec in the removed instruction is S_AND_B64
412
9
    // fold the copy now.
413
9
    if (!SaveExec || 
!SaveExec->isFullCopy()8
)
414
1
      continue;
415
8
416
8
    unsigned SavedExec = SaveExec->getOperand(0).getReg();
417
8
    bool SafeToReplace = true;
418
8
    for (auto& U : MRI.use_nodbg_instructions(SavedExec)) {
419
7
      if (U.getParent() != SaveExec->getParent()) {
420
0
        SafeToReplace = false;
421
0
        break;
422
0
      }
423
7
424
7
      LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
425
7
    }
426
8
427
8
    if (SafeToReplace) {
428
8
      LIS->RemoveMachineInstrFromMaps(*SaveExec);
429
8
      SaveExec->eraseFromParent();
430
8
      MRI.replaceRegWith(SavedExec, Exec);
431
8
      LIS->removeInterval(SavedExec);
432
8
    }
433
8
  }
434
25.2k
435
25.2k
  if (Changed) {
436
1.97k
    for (auto Reg : RecalcRegs) {
437
1.97k
      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
438
199
        LIS->removeInterval(Reg);
439
199
        if (!MRI.reg_empty(Reg))
440
175
          LIS->createAndComputeVirtRegInterval(Reg);
441
1.77k
      } else {
442
1.77k
        LIS->removeAllRegUnitsForPhysReg(Reg);
443
1.77k
      }
444
1.97k
    }
445
523
  }
446
25.2k
447
25.2k
  return Changed;
448
25.2k
}