Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//
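
// Illustrative sketch, not part of the original file: a semantically i1 value
// that crosses blocks is carried in a vreg_1 virtual register, e.g.
//
//   bb.0:
//     %cond:vreg_1 = COPY %vcc      ; condition defined in bb.0
//     ...
//   bb.1:
//     %val = COPY %cond             ; used in bb.1
//
// After lowering, cross-block merges are expressed with exec-masked bitwise
// operations (see buildMergeLaneMasks below); the general wave64 pattern is:
//
//   %prev_masked = S_ANDN2_B64 %prev, $exec  ; keep lanes from old iterations
//   %cur_masked  = S_AND_B64 %cur, $exec     ; take currently active lanes
//   %merged      = S_OR_B64 %prev_masked, %cur_masked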

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static unsigned createLaneMaskReg(MachineFunction &MF);
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);

namespace {

class SILowerI1Copies : public MachineFunctionPass {
public:
  static char ID;

private:
  bool IsWave32 = false;
  MachineFunction *MF = nullptr;
  MachineDominatorTree *DT = nullptr;
  MachinePostDominatorTree *PDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;

  unsigned ExecReg;
  unsigned MovOp;
  unsigned AndOp;
  unsigned OrOp;
  unsigned XorOp;
  unsigned AndN2Op;
  unsigned OrN2Op;

  DenseSet<unsigned> ConstrainRegs;

public:
  SILowerI1Copies() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  void lowerCopiesFromI1();
  void lowerPhis();
  void lowerCopiesToI1();
  bool isConstantLaneMask(unsigned Reg, bool &Val) const;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
  MachineBasicBlock::iterator
  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;

  bool isVreg1(unsigned Reg) const {
    return TargetRegisterInfo::isVirtualRegister(Reg) &&
           MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }

  bool isLaneMaskReg(unsigned Reg) const {
    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
               ST->getWavefrontSize();
  }
};

/// Helper class that examines how the incoming values of a phi relate in the
/// control flow graph, to determine where an incoming value can simply be
/// taken as a scalar lane mask as-is, and where it needs to be merged with
/// another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
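/// Illustrative example (not in the original source): consider a phi in def
/// block D whose incoming blocks are A and C, where A branches to both C
/// and D:
///
///      A
///     / \
///    |   C
///     \ /
///      D
///
/// If A's branch is divergent and D post-dominates A, part of the wave may
/// execute C before rejoining in D. C then has a predecessor (A) in the
/// reachable set, so its incoming value must be merged, while A has none and
/// its incoming value can be taken as-is.
///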
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock,
               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (MachineBasicBlock *MBB : IncomingBlocks) {
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      bool Divergent = false;
      for (MachineInstr &MI : MBB->terminators()) {
        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
            MI.getOpcode() == AMDGPU::SI_IF ||
            MI.getOpcode() == AMDGPU::SI_ELSE ||
            MI.getOpcode() == AMDGPU::SI_LOOP) {
          Divergent = true;
          break;
        }
      }

      if (Divergent && PDT.dominates(&DefBlock, MBB)) {
        for (MachineBasicBlock *Succ : MBB->successors())
          Stack.push_back(Succ);
      }
    }

    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      for (MachineBasicBlock *Succ : MBB->successors())
        Stack.push_back(Succ);
    }

    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        ReachableMap[MBB] = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (MachineBasicBlock *MBB : Blocks)
      Dom = DT.findNearestCommonDominator(Dom, MBB);

    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<MachineBasicBlock *> Blocks) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    if (llvm::find(Blocks, &MBB) != Blocks.end())
      return true;

    return false;
  }

  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                    false)

char SILowerI1Copies::ID = 0;

char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

FunctionPass *llvm::createSILowerI1CopiesPass() {
  return new SILowerI1Copies();
}

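/// Return a new virtual register suitable for use as a lane mask: SReg_32 on
/// wave32 targets and SReg_64 on wave64 targets.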
static unsigned createLaneMaskReg(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
                                                 : &AMDGPU::SReg_64RegClass);
}

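/// Insert an IMPLICIT_DEF of a new lane-mask register before the terminators
/// of \p MBB and return the defined register.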
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned UndefReg = createLaneMaskReg(MF);
  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
  MF = &TheMF;
  MRI = &MF->getRegInfo();
  DT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }

  lowerCopiesFromI1();
  lowerPhis();
  lowerCopiesToI1();

  for (unsigned Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return true;
}

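/// Lower COPYs from a vreg_1 into 32-bit vector registers: each such copy is
/// replaced by a V_CNDMASK_B32 that writes -1 or 0 per lane according to the
/// source lane mask.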
void SILowerI1Copies::lowerCopiesFromI1() {
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      unsigned DstReg = MI.getOperand(0).getReg();
      unsigned SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

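/// Lower PHIs of vreg_1 registers: retype the result as a lane-mask register
/// and, wherever a wave can reenter an incoming block (loops, divergent
/// branches), merge the incoming value into the previously defined lane mask
/// with buildMergeLaneMasks, using MachineSSAUpdater to provide the values.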
void SILowerI1Copies::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT);
  SmallVector<MachineInstr *, 4> DeadPhis;
  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
  SmallVector<unsigned, 4> IncomingRegs;
  SmallVector<unsigned, 4> IncomingUpdated;
#ifndef NDEBUG
  DenseSet<unsigned> PhiRegisters;
#endif

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB.phis()) {
      unsigned DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);

      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                        : &AMDGPU::SReg_64RegClass);

      // Collect incoming values.
      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
        assert(i + 1 < MI.getNumOperands());
        unsigned IncomingReg = MI.getOperand(i).getReg();
        MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
        MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

        if (IncomingDef->getOpcode() == AMDGPU::COPY) {
          IncomingReg = IncomingDef->getOperand(1).getReg();
          assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
          assert(!IncomingDef->getOperand(1).getSubReg());
        } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
          continue;
        } else {
          assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
        }

        IncomingBlocks.push_back(IncomingMBB);
        IncomingRegs.push_back(IncomingReg);
      }

#ifndef NDEBUG
      PhiRegisters.insert(DstReg);
#endif

      // Phis in a loop that are observed outside the loop receive a simple but
      // conservatively correct treatment.
      MachineBasicBlock *PostDomBound = &MBB;
      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
        PostDomBound =
            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
      }

      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

      SSAUpdater.Initialize(DstReg);

      if (FoundLoopLevel) {
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          IncomingUpdated.push_back(createLaneMaskReg(*MF));
          SSAUpdater.AddAvailableValue(IncomingBlocks[i],
                                       IncomingUpdated.back());
        }

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          buildMergeLaneMasks(
              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
        }
      } else {
        // The phi is not observed from outside a loop. Use a more accurate
        // lowering.
        PIA.analyze(MBB, IncomingBlocks);

        for (MachineBasicBlock *MBB : PIA.predecessors())
          SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          if (PIA.isSource(IMBB)) {
            IncomingUpdated.push_back(0);
            SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
          } else {
            IncomingUpdated.push_back(createLaneMaskReg(*MF));
            SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
          }
        }

        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
          if (!IncomingUpdated[i])
            continue;

          MachineBasicBlock &IMBB = *IncomingBlocks[i];
          buildMergeLaneMasks(
              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
        }
      }

      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
      if (NewReg != DstReg) {
        MRI->replaceRegWith(NewReg, DstReg);

        // Ensure that DstReg has a single def and mark the old PHI node for
        // deletion.
        MI.getOperand(0).setReg(NewReg);
        DeadPhis.push_back(&MI);
      }

      IncomingBlocks.clear();
      IncomingRegs.clear();
      IncomingUpdated.clear();
    }

    for (MachineInstr *MI : DeadPhis)
      MI->eraseFromParent();
    DeadPhis.clear();
  }
}

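/// Lower IMPLICIT_DEFs of and COPYs into vreg_1 registers: retype the
/// destination as a lane-mask register, lower plain 32-bit sources with a
/// V_CMP_NE_U32 against zero, and merge defs that are observed outside a loop
/// with the lane mask of previous iterations.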
void SILowerI1Copies::lowerCopiesToI1() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      unsigned DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
                                        : &AMDGPU::SReg_64RegClass);
      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      unsigned SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
          (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        unsigned TmpReg = createLaneMaskReg(*MF);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
      MachineBasicBlock *PostDomBound = &MBB;
      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
        PostDomBound =
            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
      }

      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
}

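/// Return true if \p Reg, looking through copies of lane-mask registers, is
/// defined by an S_MOV of the constant 0 or -1; \p Val receives the constant.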
bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
  const MachineInstr *MI;
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

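/// Set \p Def and \p Use according to whether \p MI defines and/or uses the
/// SCC status register.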
static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}

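/// Merge the previous and current lane masks into \p DstReg at insertion
/// point \p I, computing
///
///   DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC)
///
/// and folding the bitwise operations away where either input is known to be
/// the constant 0 or -1.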
void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          const DebugLoc &DL, unsigned DstReg,
                                          unsigned PrevReg, unsigned CurReg) {
  bool PrevVal;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  unsigned PrevMaskedReg = 0;
  unsigned CurMaskedReg = 0;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}