Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file
10
/// This pass adds instructions to enable whole quad mode for pixel
11
/// shaders, and whole wavefront mode for all programs.
12
///
13
/// Whole quad mode is required for derivative computations, but it interferes
14
/// with shader side effects (stores and atomics). This pass is run on the
15
/// scheduled machine IR but before register coalescing, so that machine SSA is
16
/// available for analysis. It ensures that WQM is enabled when necessary, but
17
/// disabled around stores and atomics.
18
///
19
/// When necessary, this pass creates a function prolog
20
///
21
///   S_MOV_B64 LiveMask, EXEC
22
///   S_WQM_B64 EXEC, EXEC
23
///
24
/// to enter WQM at the top of the function and surrounds blocks of Exact
25
/// instructions by
26
///
27
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
28
///   ...
29
///   S_MOV_B64 EXEC, Tmp
30
///
31
/// We also compute when a sequence of instructions requires Whole Wavefront
32
/// Mode (WWM) and insert instructions to save and restore it:
33
///
34
/// S_OR_SAVEEXEC_B64 Tmp, -1
35
/// ...
36
/// S_MOV_B64 EXEC, Tmp
37
///
38
/// In order to avoid excessive switching during sequences of Exact
39
/// instructions, the pass first analyzes which instructions must be run in WQM
40
/// (aka which instructions produce values that lead to derivative
41
/// computations).
42
///
43
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
44
///
45
/// There is room for improvement given better control flow analysis:
46
///
47
///  (1) at the top level (outside of control flow statements, and as long as
48
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
49
///      the LiveMask (this is implemented for the entry block).
50
///
51
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
52
///      consist of exact and don't-care instructions, the switch only has to
53
///      be done at the entry and exit points rather than potentially in each
54
///      block of the region.
55
///
56
//===----------------------------------------------------------------------===//
57
58
#include "AMDGPU.h"
59
#include "AMDGPUSubtarget.h"
60
#include "SIInstrInfo.h"
61
#include "SIMachineFunctionInfo.h"
62
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63
#include "llvm/ADT/DenseMap.h"
64
#include "llvm/ADT/PostOrderIterator.h"
65
#include "llvm/ADT/SmallVector.h"
66
#include "llvm/ADT/StringRef.h"
67
#include "llvm/CodeGen/LiveInterval.h"
68
#include "llvm/CodeGen/LiveIntervals.h"
69
#include "llvm/CodeGen/MachineBasicBlock.h"
70
#include "llvm/CodeGen/MachineFunction.h"
71
#include "llvm/CodeGen/MachineFunctionPass.h"
72
#include "llvm/CodeGen/MachineInstr.h"
73
#include "llvm/CodeGen/MachineInstrBuilder.h"
74
#include "llvm/CodeGen/MachineOperand.h"
75
#include "llvm/CodeGen/MachineRegisterInfo.h"
76
#include "llvm/CodeGen/SlotIndexes.h"
77
#include "llvm/CodeGen/TargetRegisterInfo.h"
78
#include "llvm/IR/CallingConv.h"
79
#include "llvm/IR/DebugLoc.h"
80
#include "llvm/MC/MCRegisterInfo.h"
81
#include "llvm/Pass.h"
82
#include "llvm/Support/Debug.h"
83
#include "llvm/Support/raw_ostream.h"
84
#include <cassert>
85
#include <vector>
86
87
using namespace llvm;
88
89
#define DEBUG_TYPE "si-wqm"
90
91
namespace {
92
93
enum {
94
  StateWQM = 0x1,
95
  StateWWM = 0x2,
96
  StateExact = 0x4,
97
};
98
99
struct PrintState {
100
public:
101
  int State;
102
103
0
  explicit PrintState(int State) : State(State) {}
104
};
105
106
#ifndef NDEBUG
107
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108
  if (PS.State & StateWQM)
109
    OS << "WQM";
110
  if (PS.State & StateWWM) {
111
    if (PS.State & StateWQM)
112
      OS << '|';
113
    OS << "WWM";
114
  }
115
  if (PS.State & StateExact) {
116
    if (PS.State & (StateWQM | StateWWM))
117
      OS << '|';
118
    OS << "Exact";
119
  }
120
121
  return OS;
122
}
123
#endif
124
125
struct InstrInfo {
126
  char Needs = 0;
127
  char Disabled = 0;
128
  char OutNeeds = 0;
129
};
130
131
struct BlockInfo {
132
  char Needs = 0;
133
  char InNeeds = 0;
134
  char OutNeeds = 0;
135
};
136
137
struct WorkItem {
138
  MachineBasicBlock *MBB = nullptr;
139
  MachineInstr *MI = nullptr;
140
141
  WorkItem() = default;
142
4.65k
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
143
22.6k
  WorkItem(MachineInstr *MI) : MI(MI) {}
144
};
145
146
class SIWholeQuadMode : public MachineFunctionPass {
147
private:
148
  CallingConv::ID CallingConv;
149
  const SIInstrInfo *TII;
150
  const SIRegisterInfo *TRI;
151
  const GCNSubtarget *ST;
152
  MachineRegisterInfo *MRI;
153
  LiveIntervals *LIS;
154
155
  DenseMap<const MachineInstr *, InstrInfo> Instructions;
156
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
157
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
158
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
159
160
  void printInfo();
161
162
  void markInstruction(MachineInstr &MI, char Flag,
163
                       std::vector<WorkItem> &Worklist);
164
  void markInstructionUses(const MachineInstr &MI, char Flag,
165
                           std::vector<WorkItem> &Worklist);
166
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
167
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
168
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
169
  char analyzeFunction(MachineFunction &MF);
170
171
  bool requiresCorrectState(const MachineInstr &MI) const;
172
173
  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
174
                                      MachineBasicBlock::iterator Before);
175
  MachineBasicBlock::iterator
176
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
177
                   MachineBasicBlock::iterator Last, bool PreferLast,
178
                   bool SaveSCC);
179
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
180
               unsigned SaveWQM, unsigned LiveMaskReg);
181
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
182
             unsigned SavedWQM);
183
  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
184
             unsigned SaveOrig);
185
  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
186
               unsigned SavedOrig);
187
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
188
189
  void lowerLiveMaskQueries(unsigned LiveMaskReg);
190
  void lowerCopyInstrs();
191
192
public:
193
  static char ID;
194
195
  SIWholeQuadMode() :
196
2.44k
    MachineFunctionPass(ID) { }
197
198
  bool runOnMachineFunction(MachineFunction &MF) override;
199
200
27.9k
  StringRef getPassName() const override { return "SI Whole Quad Mode"; }
201
202
2.42k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
203
2.42k
    AU.addRequired<LiveIntervals>();
204
2.42k
    AU.addPreserved<SlotIndexes>();
205
2.42k
    AU.addPreserved<LiveIntervals>();
206
2.42k
    AU.setPreservesCFG();
207
2.42k
    MachineFunctionPass::getAnalysisUsage(AU);
208
2.42k
  }
209
};
210
211
} // end anonymous namespace
212
213
char SIWholeQuadMode::ID = 0;
214
215
101k
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
216
101k
                      false)
217
101k
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
218
101k
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
219
                    false)
220
221
char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
222
223
2.44k
FunctionPass *llvm::createSIWholeQuadModePass() {
224
2.44k
  return new SIWholeQuadMode;
225
2.44k
}
226
227
#ifndef NDEBUG
228
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
229
  for (const auto &BII : Blocks) {
230
    dbgs() << "\n"
231
           << printMBBReference(*BII.first) << ":\n"
232
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
233
           << ", Needs = " << PrintState(BII.second.Needs)
234
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
235
236
    for (const MachineInstr &MI : *BII.first) {
237
      auto III = Instructions.find(&MI);
238
      if (III == Instructions.end())
239
        continue;
240
241
      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
242
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
243
    }
244
  }
245
}
246
#endif
247
248
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
249
15.7k
                                      std::vector<WorkItem> &Worklist) {
250
15.7k
  InstrInfo &II = Instructions[&MI];
251
15.7k
252
15.7k
  assert(!(Flag & StateExact) && Flag != 0);
253
15.7k
254
15.7k
  // Remove any disabled states from the flag. The user that required it gets
255
15.7k
  // an undefined value in the helper lanes. For example, this can happen if
256
15.7k
  // the result of an atomic is used by instruction that requires WQM, where
257
15.7k
  // ignoring the request for WQM is correct as per the relevant specs.
258
15.7k
  Flag &= ~II.Disabled;
259
15.7k
260
15.7k
  // Ignore if the flag is already encompassed by the existing needs, or we
261
15.7k
  // just disabled everything.
262
15.7k
  if ((II.Needs & Flag) == Flag)
263
7.71k
    return;
264
8.06k
265
8.06k
  II.Needs |= Flag;
266
8.06k
  Worklist.push_back(&MI);
267
8.06k
}
268
269
/// Mark all instructions defining the uses in \p MI with \p Flag.
270
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
271
15.2k
                                          std::vector<WorkItem> &Worklist) {
272
47.3k
  for (const MachineOperand &Use : MI.uses()) {
273
47.3k
    if (!Use.isReg() || 
!Use.isUse()28.3k
)
274
19.2k
      continue;
275
28.0k
276
28.0k
    unsigned Reg = Use.getReg();
277
28.0k
278
28.0k
    // Handle physical registers that we need to track; this is mostly relevant
279
28.0k
    // for VCC, which can appear as the (implicit) input of a uniform branch,
280
28.0k
    // e.g. when a loop counter is stored in a VGPR.
281
28.0k
    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
282
12.4k
      if (Reg == AMDGPU::EXEC || 
Reg == AMDGPU::EXEC_LO9.47k
)
283
2.93k
        continue;
284
9.46k
285
18.9k
      
for (MCRegUnitIterator RegUnit(Reg, TRI); 9.46k
RegUnit.isValid();
++RegUnit9.52k
) {
286
9.52k
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
287
9.52k
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
288
9.52k
        if (!Value)
289
430
          continue;
290
9.09k
291
9.09k
        // Since we're in machine SSA, we do not need to track physical
292
9.09k
        // registers across basic blocks.
293
9.09k
        if (Value->isPHIDef())
294
9.04k
          continue;
295
50
296
50
        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
297
50
                        Worklist);
298
50
      }
299
9.46k
300
9.46k
      continue;
301
9.46k
    }
302
15.6k
303
15.6k
    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
304
15.6k
      markInstruction(DefMI, Flag, Worklist);
305
15.6k
  }
306
15.2k
}
307
308
// Scan instructions to determine which ones require an Exact execmask and
309
// which ones seed WQM requirements.
310
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
311
25.4k
                                       std::vector<WorkItem> &Worklist) {
312
25.4k
  char GlobalFlags = 0;
313
25.4k
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
314
25.4k
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
315
25.4k
316
25.4k
  // We need to visit the basic blocks in reverse post-order so that we visit
317
25.4k
  // defs before uses, in particular so that we don't accidentally mark an
318
25.4k
  // instruction as needing e.g. WQM before visiting it and realizing it needs
319
25.4k
  // WQM disabled.
320
25.4k
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
321
54.6k
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; 
++BI29.1k
) {
322
29.1k
    MachineBasicBlock &MBB = **BI;
323
29.1k
    BlockInfo &BBI = Blocks[&MBB];
324
29.1k
325
553k
    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; 
++II523k
) {
326
523k
      MachineInstr &MI = *II;
327
523k
      InstrInfo &III = Instructions[&MI];
328
523k
      unsigned Opcode = MI.getOpcode();
329
523k
      char Flags = 0;
330
523k
331
523k
      if (TII->isWQM(Opcode)) {
332
413
        // Sampling instructions don't need to produce results for all pixels
333
413
        // in a quad, they just require all inputs of a quad to have been
334
413
        // computed for derivatives.
335
413
        markInstructionUses(MI, StateWQM, Worklist);
336
413
        GlobalFlags |= StateWQM;
337
413
        continue;
338
523k
      } else if (Opcode == AMDGPU::WQM) {
339
22
        // The WQM intrinsic requires its output to have all the helper lanes
340
22
        // correct, so we need it to be in WQM.
341
22
        Flags = StateWQM;
342
22
        LowerToCopyInstrs.push_back(&MI);
343
523k
      } else if (Opcode == AMDGPU::WWM) {
344
120
        // The WWM intrinsic doesn't make the same guarantee, and plus it needs
345
120
        // to be executed in WQM or Exact so that its copy doesn't clobber
346
120
        // inactive lanes.
347
120
        markInstructionUses(MI, StateWWM, Worklist);
348
120
        GlobalFlags |= StateWWM;
349
120
        LowerToCopyInstrs.push_back(&MI);
350
120
        continue;
351
523k
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
352
523k
                 
Opcode == AMDGPU::V_SET_INACTIVE_B64523k
) {
353
75
        III.Disabled = StateWWM;
354
75
        MachineOperand &Inactive = MI.getOperand(2);
355
75
        if (Inactive.isReg()) {
356
18
          if (Inactive.isUndef()) {
357
0
            LowerToCopyInstrs.push_back(&MI);
358
18
          } else {
359
18
            unsigned Reg = Inactive.getReg();
360
18
            if (TargetRegisterInfo::isVirtualRegister(Reg)) {
361
18
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
362
18
                markInstruction(DefMI, StateWWM, Worklist);
363
18
            }
364
18
          }
365
18
        }
366
75
        SetInactiveInstrs.push_back(&MI);
367
75
        continue;
368
523k
      } else if (TII->isDisableWQM(MI)) {
369
3.90k
        BBI.Needs |= StateExact;
370
3.90k
        if (!(BBI.InNeeds & StateExact)) {
371
3.33k
          BBI.InNeeds |= StateExact;
372
3.33k
          Worklist.push_back(&MBB);
373
3.33k
        }
374
3.90k
        GlobalFlags |= StateExact;
375
3.90k
        III.Disabled = StateWQM | StateWWM;
376
3.90k
        continue;
377
519k
      } else {
378
519k
        if (Opcode == AMDGPU::SI_PS_LIVE) {
379
13
          LiveMaskQueries.push_back(&MI);
380
519k
        } else if (WQMOutputs) {
381
8
          // The function is in machine SSA form, which means that physical
382
8
          // VGPRs correspond to shader inputs and outputs. Inputs are
383
8
          // only used, outputs are only defined.
384
8
          for (const MachineOperand &MO : MI.defs()) {
385
8
            if (!MO.isReg())
386
0
              continue;
387
8
388
8
            unsigned Reg = MO.getReg();
389
8
390
8
            if (!TRI->isVirtualRegister(Reg) &&
391
8
                
TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))2
) {
392
2
              Flags = StateWQM;
393
2
              break;
394
2
            }
395
8
          }
396
8
        }
397
519k
398
519k
        if (!Flags)
399
519k
          continue;
400
24
      }
401
24
402
24
      markInstruction(MI, Flags, Worklist);
403
24
      GlobalFlags |= Flags;
404
24
    }
405
29.1k
  }
406
25.4k
407
25.4k
  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
408
25.4k
  // ever used anywhere in the function. This implements the corresponding
409
25.4k
  // semantics of @llvm.amdgcn.set.inactive.
410
25.4k
  if (GlobalFlags & StateWQM) {
411
390
    for (MachineInstr *MI : SetInactiveInstrs)
412
4
      markInstruction(*MI, StateWQM, Worklist);
413
390
  }
414
25.4k
415
25.4k
  return GlobalFlags;
416
25.4k
}
417
418
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
419
22.6k
                                           std::vector<WorkItem>& Worklist) {
420
22.6k
  MachineBasicBlock *MBB = MI.getParent();
421
22.6k
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
422
22.6k
  BlockInfo &BI = Blocks[MBB];
423
22.6k
424
22.6k
  // Control flow-type instructions and stores to temporary memory that are
425
22.6k
  // followed by WQM computations must themselves be in WQM.
426
22.6k
  if ((II.OutNeeds & StateWQM) && 
!(II.Disabled & StateWQM)12.1k
&&
427
22.6k
      
(12.0k
MI.isTerminator()12.0k
||
(11.9k
TII->usesVM_CNT(MI)11.9k
&&
MI.mayStore()168
))) {
428
163
    Instructions[&MI].Needs = StateWQM;
429
163
    II.Needs = StateWQM;
430
163
  }
431
22.6k
432
22.6k
  // Propagate to block level
433
22.6k
  if (II.Needs & StateWQM) {
434
13.7k
    BI.Needs |= StateWQM;
435
13.7k
    if (!(BI.InNeeds & StateWQM)) {
436
433
      BI.InNeeds |= StateWQM;
437
433
      Worklist.push_back(MBB);
438
433
    }
439
13.7k
  }
440
22.6k
441
22.6k
  // Propagate backwards within block
442
22.6k
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
443
21.3k
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
444
21.3k
    if (!PrevMI->isPHI()) {
445
21.1k
      InstrInfo &PrevII = Instructions[PrevMI];
446
21.1k
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
447
13.8k
        PrevII.OutNeeds |= InNeeds;
448
13.8k
        Worklist.push_back(PrevMI);
449
13.8k
      }
450
21.1k
    }
451
21.3k
  }
452
22.6k
453
22.6k
  // Propagate WQM flag to instruction inputs
454
22.6k
  assert(!(II.Needs & StateExact));
455
22.6k
456
22.6k
  if (II.Needs != 0)
457
14.7k
    markInstructionUses(MI, II.Needs, Worklist);
458
22.6k
459
22.6k
  // Ensure we process a block containing WWM, even if it does not require any
460
22.6k
  // WQM transitions.
461
22.6k
  if (II.Needs & StateWWM)
462
1.01k
    BI.Needs |= StateWWM;
463
22.6k
}
464
465
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
466
4.65k
                                     std::vector<WorkItem>& Worklist) {
467
4.65k
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
468
4.65k
469
4.65k
  // Propagate through instructions
470
4.65k
  if (!MBB.empty()) {
471
4.64k
    MachineInstr *LastMI = &*MBB.rbegin();
472
4.64k
    InstrInfo &LastII = Instructions[LastMI];
473
4.64k
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
474
737
      LastII.OutNeeds |= BI.OutNeeds;
475
737
      Worklist.push_back(LastMI);
476
737
    }
477
4.64k
  }
478
4.65k
479
4.65k
  // Predecessor blocks must provide for our WQM/Exact needs.
480
4.65k
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
481
1.28k
    BlockInfo &PredBI = Blocks[Pred];
482
1.28k
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
483
536
      continue;
484
751
485
751
    PredBI.OutNeeds |= BI.InNeeds;
486
751
    PredBI.InNeeds |= BI.InNeeds;
487
751
    Worklist.push_back(Pred);
488
751
  }
489
4.65k
490
4.65k
  // All successors must be prepared to accept the same set of WQM/Exact data.
491
4.65k
  for (MachineBasicBlock *Succ : MBB.successors()) {
492
1.39k
    BlockInfo &SuccBI = Blocks[Succ];
493
1.39k
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
494
1.25k
      continue;
495
136
496
136
    SuccBI.InNeeds |= BI.OutNeeds;
497
136
    Worklist.push_back(Succ);
498
136
  }
499
4.65k
}
500
501
25.4k
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
502
25.4k
  std::vector<WorkItem> Worklist;
503
25.4k
  char GlobalFlags = scanInstructions(MF, Worklist);
504
25.4k
505
52.8k
  while (!Worklist.empty()) {
506
27.3k
    WorkItem WI = Worklist.back();
507
27.3k
    Worklist.pop_back();
508
27.3k
509
27.3k
    if (WI.MI)
510
22.6k
      propagateInstruction(*WI.MI, Worklist);
511
4.65k
    else
512
4.65k
      propagateBlock(*WI.MBB, Worklist);
513
27.3k
  }
514
25.4k
515
25.4k
  return GlobalFlags;
516
25.4k
}
517
518
/// Whether \p MI really requires the exec state computed during analysis.
519
///
520
/// Scalar instructions must occasionally be marked WQM for correct propagation
521
/// (e.g. thread masks leading up to branches), but when it comes to actual
522
/// execution, they don't care about EXEC.
523
14.4k
bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
524
14.4k
  if (MI.isTerminator())
525
719
    return true;
526
13.7k
527
13.7k
  // Skip instructions that are not affected by EXEC
528
13.7k
  if (TII->isScalarUnit(MI))
529
660
    return false;
530
13.0k
531
13.0k
  // Generic instructions such as COPY will either disappear by register
532
13.0k
  // coalescing or be lowered to SALU or VALU instructions.
533
13.0k
  if (MI.isTransient()) {
534
10.0k
    if (MI.getNumExplicitOperands() >= 1) {
535
10.0k
      const MachineOperand &Op = MI.getOperand(0);
536
10.0k
      if (Op.isReg()) {
537
10.0k
        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
538
5.52k
          // SGPR instructions are not affected by EXEC
539
5.52k
          return false;
540
5.52k
        }
541
7.57k
      }
542
10.0k
    }
543
10.0k
  }
544
7.57k
545
7.57k
  return true;
546
7.57k
}
547
548
MachineBasicBlock::iterator
549
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
550
0
                         MachineBasicBlock::iterator Before) {
551
0
  unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
552
0
553
0
  MachineInstr *Save =
554
0
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
555
0
          .addReg(AMDGPU::SCC);
556
0
  MachineInstr *Restore =
557
0
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
558
0
          .addReg(SaveReg);
559
0
560
0
  LIS->InsertMachineInstrInMaps(*Save);
561
0
  LIS->InsertMachineInstrInMaps(*Restore);
562
0
  LIS->createAndComputeVirtRegInterval(SaveReg);
563
0
564
0
  return Restore;
565
0
}
566
567
// Return an iterator in the (inclusive) range [First, Last] at which
568
// instructions can be safely inserted, keeping in mind that some of the
569
// instructions we want to add necessarily clobber SCC.
570
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
571
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
572
1.05k
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
573
1.05k
  if (!SaveSCC)
574
58
    return PreferLast ? 
Last6
:
First52
;
575
994
576
994
  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
577
994
  auto MBBE = MBB.end();
578
994
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
579
994
                                     : 
LIS->getMBBEndIdx(&MBB)0
;
580
994
  SlotIndex LastIdx =
581
994
      Last != MBBE ? 
LIS->getInstructionIndex(*Last)988
:
LIS->getMBBEndIdx(&MBB)6
;
582
994
  SlotIndex Idx = PreferLast ? 
LastIdx395
:
FirstIdx599
;
583
994
  const LiveRange::Segment *S;
584
994
585
996
  for (;;) {
586
996
    S = LR.getSegmentContaining(Idx);
587
996
    if (!S)
588
994
      break;
589
2
590
2
    if (PreferLast) {
591
2
      SlotIndex Next = S->start.getBaseIndex();
592
2
      if (Next < FirstIdx)
593
0
        break;
594
2
      Idx = Next;
595
2
    } else {
596
0
      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
597
0
      if (Next > LastIdx)
598
0
        break;
599
0
      Idx = Next;
600
0
    }
601
2
  }
602
994
603
994
  MachineBasicBlock::iterator MBBI;
604
994
605
994
  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
606
994
    MBBI = MI;
607
0
  else {
608
0
    assert(Idx == LIS->getMBBEndIdx(&MBB));
609
0
    MBBI = MBB.end();
610
0
  }
611
994
612
994
  if (S)
613
0
    MBBI = saveSCC(MBB, MBBI);
614
994
615
994
  return MBBI;
616
994
}
617
618
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
619
                              MachineBasicBlock::iterator Before,
620
409
                              unsigned SaveWQM, unsigned LiveMaskReg) {
621
409
  MachineInstr *MI;
622
409
623
409
  if (SaveWQM) {
624
6
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
625
6
                   
AMDGPU::S_AND_SAVEEXEC_B320
: AMDGPU::S_AND_SAVEEXEC_B64),
626
6
                 SaveWQM)
627
6
             .addReg(LiveMaskReg);
628
403
  } else {
629
403
    unsigned Exec = ST->isWave32() ? 
AMDGPU::EXEC_LO89
:
AMDGPU::EXEC314
;
630
403
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
631
314
                   
AMDGPU::S_AND_B3289
: AMDGPU::S_AND_B64),
632
403
                 Exec)
633
403
             .addReg(Exec)
634
403
             .addReg(LiveMaskReg);
635
403
  }
636
409
637
409
  LIS->InsertMachineInstrInMaps(*MI);
638
409
}
639
640
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
641
                            MachineBasicBlock::iterator Before,
642
401
                            unsigned SavedWQM) {
643
401
  MachineInstr *MI;
644
401
645
401
  unsigned Exec = ST->isWave32() ? 
AMDGPU::EXEC_LO89
:
AMDGPU::EXEC312
;
646
401
  if (SavedWQM) {
647
6
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
648
6
             .addReg(SavedWQM);
649
395
  } else {
650
395
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
651
306
                   
AMDGPU::S_WQM_B3289
: AMDGPU::S_WQM_B64),
652
395
                 Exec)
653
395
             .addReg(Exec);
654
395
  }
655
401
656
401
  LIS->InsertMachineInstrInMaps(*MI);
657
401
}
658
659
void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
660
                            MachineBasicBlock::iterator Before,
661
122
                            unsigned SaveOrig) {
662
122
  MachineInstr *MI;
663
122
664
122
  assert(SaveOrig);
665
122
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
666
122
           .addImm(-1);
667
122
  LIS->InsertMachineInstrInMaps(*MI);
668
122
}
669
670
void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
671
                              MachineBasicBlock::iterator Before,
672
122
                              unsigned SavedOrig) {
673
122
  MachineInstr *MI;
674
122
675
122
  assert(SavedOrig);
676
122
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
677
122
               ST->isWave32() ? 
AMDGPU::EXEC_LO6
:
AMDGPU::EXEC116
)
678
122
           .addReg(SavedOrig);
679
122
  LIS->InsertMachineInstrInMaps(*MI);
680
122
}
681
682
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
683
761
                                   bool isEntry) {
684
761
  auto BII = Blocks.find(&MBB);
685
761
  if (BII == Blocks.end())
686
0
    return;
687
761
688
761
  const BlockInfo &BI = BII->second;
689
761
690
761
  // This is a non-entry block that is WQM throughout, so no need to do
691
761
  // anything.
692
761
  if (!isEntry && 
BI.Needs == StateWQM306
&&
BI.OutNeeds != StateExact80
)
693
64
    return;
694
697
695
697
  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
696
697
                    << ":\n");
697
697
698
697
  unsigned SavedWQMReg = 0;
699
697
  unsigned SavedNonWWMReg = 0;
700
697
  bool WQMFromExec = isEntry;
701
697
  char State = (isEntry || 
!(BI.InNeeds & StateWQM)242
) ?
StateExact638
:
StateWQM59
;
702
697
  char NonWWMState = 0;
703
697
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
704
697
705
697
  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
706
697
  if (isEntry)
707
455
    ++II; // Skip the instruction that saves LiveMask
708
697
709
697
  // This stores the first instruction where it's safe to switch from WQM to
710
697
  // Exact or vice versa.
711
697
  MachineBasicBlock::iterator FirstWQM = IE;
712
697
713
697
  // This stores the first instruction where it's safe to switch from WWM to
714
697
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
715
697
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
716
697
  // switch to/from WQM as well.
717
697
  MachineBasicBlock::iterator FirstWWM = IE;
718
15.1k
  for (;;) {
719
15.1k
    MachineBasicBlock::iterator Next = II;
720
15.1k
    char Needs = StateExact | StateWQM; // WWM is disabled by default
721
15.1k
    char OutNeeds = 0;
722
15.1k
723
15.1k
    if (FirstWQM == IE)
724
4.16k
      FirstWQM = II;
725
15.1k
726
15.1k
    if (FirstWWM == IE)
727
8.99k
      FirstWWM = II;
728
15.1k
729
15.1k
    // First, figure out the allowed states (Needs) based on the propagated
730
15.1k
    // flags.
731
15.1k
    if (II != IE) {
732
14.4k
      MachineInstr &MI = *II;
733
14.4k
734
14.4k
      if (requiresCorrectState(MI)) {
735
8.29k
        auto III = Instructions.find(&MI);
736
8.29k
        if (III != Instructions.end()) {
737
8.29k
          if (III->second.Needs & StateWWM)
738
576
            Needs = StateWWM;
739
7.72k
          else if (III->second.Needs & StateWQM)
740
2.25k
            Needs = StateWQM;
741
5.46k
          else
742
5.46k
            Needs &= ~III->second.Disabled;
743
8.29k
          OutNeeds = III->second.OutNeeds;
744
8.29k
        }
745
8.29k
      } else {
746
6.18k
        // If the instruction doesn't actually need a correct EXEC, then we can
747
6.18k
        // safely leave WWM enabled.
748
6.18k
        Needs = StateExact | StateWQM | StateWWM;
749
6.18k
      }
750
14.4k
751
14.4k
      if (MI.isTerminator() && 
OutNeeds == StateExact719
)
752
169
        Needs = StateExact;
753
14.4k
754
14.4k
      if (MI.getOpcode() == AMDGPU::SI_ELSE && 
BI.OutNeeds == StateExact7
)
755
7
        MI.getOperand(3).setImm(1);
756
14.4k
757
14.4k
      ++Next;
758
14.4k
    } else {
759
697
      // End of basic block
760
697
      if (BI.OutNeeds & StateWQM)
761
50
        Needs = StateWQM;
762
647
      else if (BI.OutNeeds == StateExact)
763
152
        Needs = StateExact;
764
495
      else
765
495
        Needs = StateWQM | StateExact;
766
697
    }
767
15.1k
768
15.1k
    // Now, transition if necessary.
769
15.1k
    if (!(Needs & State)) {
770
1.05k
      MachineBasicBlock::iterator First;
771
1.05k
      if (State == StateWWM || 
Needs == StateWWM930
) {
772
244
        // We must switch to or from WWM
773
244
        First = FirstWWM;
774
808
      } else {
775
808
        // We only need to switch to/from WQM, so we can use FirstWQM
776
808
        First = FirstWQM;
777
808
      }
778
1.05k
779
1.05k
      MachineBasicBlock::iterator Before =
780
1.05k
          prepareInsertion(MBB, First, II, Needs == StateWQM,
781
1.05k
                           Needs == StateExact || 
WQMFromExec643
);
782
1.05k
783
1.05k
      if (State == StateWWM) {
784
122
        assert(SavedNonWWMReg);
785
122
        fromWWM(MBB, Before, SavedNonWWMReg);
786
122
        State = NonWWMState;
787
122
      }
788
1.05k
789
1.05k
      if (Needs == StateWWM) {
790
122
        NonWWMState = State;
791
122
        SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
792
122
        toWWM(MBB, Before, SavedNonWWMReg);
793
122
        State = StateWWM;
794
930
      } else {
795
930
        if (State == StateWQM && 
(Needs & StateExact)411
&&
!(Needs & StateWQM)411
) {
796
409
          if (!WQMFromExec && 
(OutNeeds & StateWQM)54
)
797
6
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
798
409
799
409
          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
800
409
          State = StateExact;
801
521
        } else if (State == StateExact && 
(Needs & StateWQM)519
&&
802
521
                   
!(Needs & StateExact)519
) {
803
401
          assert(WQMFromExec == (SavedWQMReg == 0));
804
401
805
401
          toWQM(MBB, Before, SavedWQMReg);
806
401
807
401
          if (SavedWQMReg) {
808
6
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
809
6
            SavedWQMReg = 0;
810
6
          }
811
401
          State = StateWQM;
812
401
        } else {
813
120
          // We can get here if we transitioned from WWM to a non-WWM state that
814
120
          // already matches our needs, but we shouldn't need to do anything.
815
120
          assert(Needs & State);
816
120
        }
817
930
      }
818
1.05k
    }
819
15.1k
820
15.1k
    if (Needs != (StateExact | StateWQM | StateWWM)) {
821
8.99k
      if (Needs != (StateExact | StateWQM))
822
3.67k
        FirstWQM = IE;
823
8.99k
      FirstWWM = IE;
824
8.99k
    }
825
15.1k
826
15.1k
    if (II == IE)
827
697
      break;
828
14.4k
    II = Next;
829
14.4k
  }
830
697
}
831
832
25.4k
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
833
25.4k
  for (MachineInstr *MI : LiveMaskQueries) {
834
13
    const DebugLoc &DL = MI->getDebugLoc();
835
13
    unsigned Dest = MI->getOperand(0).getReg();
836
13
    MachineInstr *Copy =
837
13
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
838
13
            .addReg(LiveMaskReg);
839
13
840
13
    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
841
13
    MI->eraseFromParent();
842
13
  }
843
25.4k
}
844
845
462
void SIWholeQuadMode::lowerCopyInstrs() {
846
462
  for (MachineInstr *MI : LowerToCopyInstrs) {
847
142
    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; 
i--0
)
848
0
      MI->RemoveOperand(i);
849
142
850
142
    const unsigned Reg = MI->getOperand(0).getReg();
851
142
852
142
    if (TRI->isVGPR(*MRI, Reg)) {
853
102
      const TargetRegisterClass *regClass =
854
102
          TargetRegisterInfo::isVirtualRegister(Reg)
855
102
              ? 
MRI->getRegClass(Reg)101
856
102
              : 
TRI->getPhysRegClass(Reg)1
;
857
102
858
102
      const unsigned MovOp = TII->getMovOpcode(regClass);
859
102
      MI->setDesc(TII->get(MovOp));
860
102
861
102
      // And make it implicitly depend on exec (like all VALU movs should do).
862
102
      MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
863
102
    } else {
864
40
      MI->setDesc(TII->get(AMDGPU::COPY));
865
40
    }
866
142
  }
867
462
}
868
869
25.4k
// Pass entry point: analyze which instructions need WQM/WWM/Exact, then insert
// the mode switches. Returns true iff the function was modified.
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  // Reset per-function state; the pass object is reused across functions.
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  CallingConv = MF.getFunction().getCallingConv();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  char GlobalFlags = analyzeFunction(MF);
  unsigned LiveMaskReg = 0;
  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  if (!(GlobalFlags & StateWQM)) {
    // No WQM anywhere: the live mask is just exec, so queries can read exec
    // directly.
    lowerLiveMaskQueries(Exec);
    if (!(GlobalFlags & StateWWM))
      // Nothing else to do; we changed the function only if there were
      // live-mask queries to lower.
      return !LiveMaskQueries.empty();
  } else {
    // Store a copy of the original live mask when required
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      // Snapshot exec into a virtual register before any mode switching.
      LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(Exec);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    lowerLiveMaskQueries(LiveMaskReg);

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ?
                AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
              Exec)
          .addReg(Exec);

      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  LLVM_DEBUG(printInfo());

  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}