Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
//===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Copies from VGPR to SGPR registers are illegal and the register coalescer
/// will sometimes generate these illegal copies in situations like this:
///
///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   %1 <vsrc> = COPY %0 <sgpr>
///   ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <vsrc>
///
/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
/// code will look like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <vsrc> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is forced to constrain the register class of %3 to <sgpr>,
/// so we end up with final code like this:
///
/// BB0:
///   %0 <sgpr> = SCALAR_INST
///   ...
///   BRANCH %cond BB1, BB2
/// BB1:
///   %2 <vgpr> = VECTOR_INST
///   %3 <sgpr> = COPY %2 <vgpr>
/// BB2:
///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
///
/// Now this code contains an illegal copy from a VGPR to an SGPR.
///
/// In order to avoid this problem, this pass searches for PHI instructions
/// which define a <vsrc> register and constrains their definition class to
/// <vgpr> if any user of the PHI's definition register is a vector
/// instruction. If the PHI's definition class is constrained to <vgpr>, the
/// coalescer will be unable to perform the COPY removal from the above
/// example, which ultimately led to the creation of an illegal COPY.
//===----------------------------------------------------------------------===//
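
// For contrast, a sketch in the same notation (illustrative only, not
// compiler output) of the state this pass aims for, with the PHI result
// constrained to <vgpr>:
//
// BB0:
//   %0 <sgpr> = SCALAR_INST
//   %1 <vgpr> = COPY %0 <sgpr>
//   ...
//   BRANCH %cond BB1, BB2
// BB1:
//   %2 <vgpr> = VECTOR_INST
//   %3 <vgpr> = COPY %2 <vgpr>
// BB2:
//   %4 <vgpr> = PHI %1 <vgpr>, <%bb.0>, %3 <vgpr>, <%bb.1>
//   %5 <vgpr> = VECTOR_INST %4 <vgpr>
//
// Every copy here is SGPR-to-VGPR or VGPR-to-VGPR, both of which are legal,
// so the coalescer can no longer produce the illegal pattern.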

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <list>
#include <map>
#include <tuple>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-fix-sgpr-copies"

static cl::opt<bool> EnableM0Merge(
  "amdgpu-enable-merge-m0",
  cl::desc("Merge and hoist M0 initializations"),
  cl::init(true));
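
// As a cl::opt<bool>, the flag takes the usual boolean forms on the command
// line (a sketch of an assumed llc invocation, not from the source):
//
//   llc -march=amdgcn ... -amdgpu-enable-merge-m0=0   (disable merging)
//   llc -march=amdgcn ... -amdgpu-enable-merge-m0=1   (the default)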

namespace {

class SIFixSGPRCopies : public MachineFunctionPass {
  MachineDominatorTree *MDT;

public:
  static char ID;

  SIFixSGPRCopies() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fix SGPR copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
                      "SI Fix SGPR copies", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
                    "SI Fix SGPR copies", false, false)

char SIFixSGPRCopies::ID = 0;

char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;

FunctionPass *llvm::createSIFixSGPRCopiesPass() {
  return new SIFixSGPRCopies();
}

static bool hasVectorOperands(const MachineInstr &MI,
                              const SIRegisterInfo *TRI) {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    if (!MI.getOperand(i).isReg() ||
        !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
      continue;

    if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
      return true;
  }
  return false;
}

static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
                  const SIRegisterInfo &TRI,
                  const MachineRegisterInfo &MRI) {
  unsigned DstReg = Copy.getOperand(0).getReg();
  unsigned SrcReg = Copy.getOperand(1).getReg();

  const TargetRegisterClass *SrcRC =
    TargetRegisterInfo::isVirtualRegister(SrcReg) ?
    MRI.getRegClass(SrcReg) :
    TRI.getPhysRegClass(SrcReg);

  // We don't really care about the subregister here.
  // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());

  const TargetRegisterClass *DstRC =
    TargetRegisterInfo::isVirtualRegister(DstReg) ?
    MRI.getRegClass(DstReg) :
    TRI.getPhysRegClass(DstReg);

  return std::make_pair(SrcRC, DstRC);
}

static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
         TRI.hasVectorRegisters(SrcRC);
}

static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
         TRI.hasVectorRegisters(DstRC);
}

static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
                                      const SIRegisterInfo *TRI,
                                      const SIInstrInfo *TII) {
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  auto &Src = MI.getOperand(1);
  unsigned DstReg = MI.getOperand(0).getReg();
  unsigned SrcReg = Src.getReg();
  if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
      !TargetRegisterInfo::isVirtualRegister(DstReg))
    return false;

  for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
    const auto *UseMI = MO.getParent();
    if (UseMI == &MI)
      continue;
    if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
        UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END ||
        !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src))
      return false;
  }
  // Change VGPR to SGPR destination.
  MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
  return true;
}
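
// Illustrative pseudo-MIR for the rewrite above (hypothetical vreg numbers,
// not taken from a real dump):
//
//   %0:sgpr_32 = ...
//   %1:vgpr_32 = COPY %0
//   ... all same-block users of %1 would also accept %0 (an SGPR) ...
//
// =>
//
//   %0:sgpr_32 = ...
//   %1:sgpr_32 = COPY %0
//
// With the destination class flipped to SGPR, the copy becomes SGPR-to-SGPR
// and is trivially coalescable.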

// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
// SGPRx = ...
// SGPRy = REG_SEQUENCE SGPRx, sub0 ...
// VGPRz = COPY SGPRy
//
// ==>
//
// VGPRx = COPY SGPRx
// VGPRz = REG_SEQUENCE VGPRx, sub0
//
// This exposes immediate folding opportunities when materializing 64-bit
// immediates.
static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
                                        const SIRegisterInfo *TRI,
                                        const SIInstrInfo *TII,
                                        MachineRegisterInfo &MRI) {
  assert(MI.isRegSequence());

  unsigned DstReg = MI.getOperand(0).getReg();
  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
    return false;

  if (!MRI.hasOneUse(DstReg))
    return false;

  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
  if (!CopyUse.isCopy())
    return false;

  // It is illegal to have vreg inputs to a physreg defining reg_sequence.
  if (TargetRegisterInfo::isPhysicalRegister(CopyUse.getOperand(0).getReg()))
    return false;

  const TargetRegisterClass *SrcRC, *DstRC;
  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);

  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
    return false;

  if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
    return true;

  // TODO: Could have multiple extracts?
  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
  if (SubReg != AMDGPU::NoSubRegister)
    return false;

  MRI.setRegClass(DstReg, DstRC);

  // SGPRx = ...
  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
  // VGPRz = COPY SGPRy

  // =>
  // VGPRx = COPY SGPRx
  // VGPRz = REG_SEQUENCE VGPRx, sub0

  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
  bool IsAGPR = TRI->hasAGPRs(DstRC);

  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
    unsigned SrcReg = MI.getOperand(I).getReg();
    unsigned SrcSubReg = MI.getOperand(I).getSubReg();

    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
    assert(TRI->isSGPRClass(SrcRC) &&
           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");

    SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);

    unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);

    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            TmpReg)
        .add(MI.getOperand(I));

    if (IsAGPR) {
      const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
      unsigned TmpAReg = MRI.createVirtualRegister(NewSrcRC);
      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
        AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc), TmpAReg)
          .addReg(TmpReg, RegState::Kill);
      TmpReg = TmpAReg;
    }

    MI.getOperand(I).setReg(TmpReg);
  }

  CopyUse.eraseFromParent();
  return true;
}
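
// A concrete instance of the distribution above (illustrative pseudo-MIR,
// hypothetical vreg numbers): materializing a 64-bit immediate that is then
// used as a vector operand.
//
//   %0:sgpr_32 = S_MOV_B32 0
//   %1:sgpr_32 = S_MOV_B32 1
//   %2:sreg_64 = REG_SEQUENCE %0, sub0, %1, sub1
//   %3:vreg_64 = COPY %2
//
// becomes
//
//   %4:vgpr_32 = COPY %0
//   %5:vgpr_32 = COPY %1
//   %3:vreg_64 = REG_SEQUENCE %4, sub0, %5, sub1
//
// after which each 32-bit immediate feeds a single COPY and can be folded
// into a V_MOV_B32 by later passes (e.g. SIFoldOperands).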

static bool phiHasVGPROperands(const MachineInstr &PHI,
                               const MachineRegisterInfo &MRI,
                               const SIRegisterInfo *TRI,
                               const SIInstrInfo *TII) {
  for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
    unsigned Reg = PHI.getOperand(i).getReg();
    if (TRI->hasVGPRs(MRI.getRegClass(Reg)))
      return true;
  }
  return false;
}

static bool phiHasBreakDef(const MachineInstr &PHI,
                           const MachineRegisterInfo &MRI,
                           SmallSet<unsigned, 8> &Visited) {
  for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
    unsigned Reg = PHI.getOperand(i).getReg();
    if (Visited.count(Reg))
      continue;

    Visited.insert(Reg);

    MachineInstr *DefInstr = MRI.getVRegDef(Reg);
    switch (DefInstr->getOpcode()) {
    default:
      break;
    case AMDGPU::SI_IF_BREAK:
      return true;
    case AMDGPU::PHI:
      if (phiHasBreakDef(*DefInstr, MRI, Visited))
        return true;
    }
  }
  return false;
}

static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
                                          const TargetRegisterInfo &TRI) {
  for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
       E = MBB.end(); I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
      return true;
  }
  return false;
}

static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                    const MachineInstr *MoveImm,
                                    const SIInstrInfo *TII,
                                    unsigned &SMovOp,
                                    int64_t &Imm) {
  if (Copy->getOpcode() != AMDGPU::COPY)
    return false;

  if (!MoveImm->isMoveImmediate())
    return false;

  const MachineOperand *ImmOp =
      TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
  if (!ImmOp->isImm())
    return false;

  // FIXME: Handle copies with sub-regs.
  if (Copy->getOperand(0).getSubReg())
    return false;

  switch (MoveImm->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_MOV_B32_e32:
    SMovOp = AMDGPU::S_MOV_B32;
    break;
  case AMDGPU::V_MOV_B64_PSEUDO:
    SMovOp = AMDGPU::S_MOV_B64;
    break;
  }
  Imm = ImmOp->getImm();
  return true;
}
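
// The transform this check guards (illustrative pseudo-MIR, hypothetical
// vreg numbers):
//
//   %0:vgpr_32 = V_MOV_B32_e32 42
//   %1:sgpr_32 = COPY %0
//
// is rewritten by the caller into
//
//   %1:sgpr_32 = S_MOV_B32 42
//
// replacing an illegal VGPR-to-SGPR copy with a scalar move of the same
// immediate.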

template <class UnaryPredicate>
bool searchPredecessors(const MachineBasicBlock *MBB,
                        const MachineBasicBlock *CutOff,
                        UnaryPredicate Predicate) {
  if (MBB == CutOff)
    return false;

  DenseSet<const MachineBasicBlock *> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(MBB->pred_begin(),
                                               MBB->pred_end());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (!Visited.insert(MBB).second)
      continue;
    if (MBB == CutOff)
      continue;
    if (Predicate(MBB))
      return true;

    Worklist.append(MBB->pred_begin(), MBB->pred_end());
  }

  return false;
}
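
// searchPredecessors is used just below by predsHasDivergentTerminator and
// isReachable. A minimal standalone usage sketch (hypothetical, not part of
// this pass): does any block backwards-reachable from MBB, stopping at
// Entry, contain a call?
//
//   bool HasCall = searchPredecessors(MBB, Entry,
//       [](const MachineBasicBlock *B) {
//         return llvm::any_of(*B, [](const MachineInstr &MI) {
//           return MI.isCall();
//         });
//       });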

static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
                                        const TargetRegisterInfo *TRI) {
  return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
           return hasTerminatorThatModifiesExec(*MBB, *TRI); });
}

// Checks if there is a potential path from instruction From to instruction
// To. If CutOff is specified and sits on that path, we ignore the portion of
// the path above CutOff and report the destination as not reachable.
static bool isReachable(const MachineInstr *From,
                        const MachineInstr *To,
                        const MachineBasicBlock *CutOff,
                        MachineDominatorTree &MDT) {
  // If either From block dominates To block or instructions are in the same
  // block and From is higher.
  if (MDT.dominates(From, To))
    return true;

  const MachineBasicBlock *MBBFrom = From->getParent();
  const MachineBasicBlock *MBBTo = To->getParent();
  if (MBBFrom == MBBTo)
    return false;

  // Instructions are in different blocks, do predecessor search.
  // We should almost never get here since we do not usually produce M0 stores
  // other than -1.
  return searchPredecessors(MBBTo, CutOff, [MBBFrom]
           (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
}

// Return the first non-prologue instruction in the block.
static MachineBasicBlock::iterator
getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
  MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
  while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
    ++I;

  return I;
}

// Hoist and merge identical SGPR initializations into a common predecessor.
// This is intended to combine M0 initializations, but can work with any
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
// execution.
static bool hoistAndMergeSGPRInits(unsigned Reg,
                                   const MachineRegisterInfo &MRI,
                                   MachineDominatorTree &MDT,
                                   const TargetInstrInfo *TII) {
  // List of inits by immediate value.
  using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
  InitListMap Inits;
  // List of clobbering instructions.
  SmallVector<MachineInstr*, 8> Clobbers;
  // List of instructions marked for deletion.
  SmallSet<MachineInstr*, 8> MergedInstrs;

  bool Changed = false;

  for (auto &MI : MRI.def_instructions(Reg)) {
    MachineOperand *Imm = nullptr;
    for (auto &MO: MI.operands()) {
      if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
          (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
        Imm = nullptr;
        break;
      } else if (MO.isImm())
        Imm = &MO;
    }
    if (Imm)
      Inits[Imm->getImm()].push_front(&MI);
    else
      Clobbers.push_back(&MI);
  }

  for (auto &Init : Inits) {
    auto &Defs = Init.second;

    for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
      MachineInstr *MI1 = *I1;

      for (auto I2 = std::next(I1); I2 != E; ) {
        MachineInstr *MI2 = *I2;

        // Check any possible interference.
        auto interferes = [&](MachineBasicBlock::iterator From,
                              MachineBasicBlock::iterator To) -> bool {

          assert(MDT.dominates(&*To, &*From));

          auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
            const MachineBasicBlock *MBBFrom = From->getParent();
            const MachineBasicBlock *MBBTo = To->getParent();
            bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
            bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
            if (!MayClobberFrom && !MayClobberTo)
              return false;
            if ((MayClobberFrom && !MayClobberTo) ||
                (!MayClobberFrom && MayClobberTo))
              return true;
            // Both can clobber, so this is not an interference only if both
            // are dominated by Clobber and belong to the same block, or if
            // Clobber properly dominates To. Given that To >> From, Clobber
            // then dominates both and is located in a common dominator.
            return !((MBBFrom == MBBTo &&
                      MDT.dominates(Clobber, &*From) &&
                      MDT.dominates(Clobber, &*To)) ||
                     MDT.properlyDominates(Clobber->getParent(), MBBTo));
          };

          return (llvm::any_of(Clobbers, interferes)) ||
                 (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
                    return C.first != Init.first &&
                           llvm::any_of(C.second, interferes);
                  }));
        };

        if (MDT.dominates(MI1, MI2)) {
          if (!interferes(MI2, MI1)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI2->getParent()) << " " << *MI2);
            MergedInstrs.insert(MI2);
            Changed = true;
            ++I2;
            continue;
          }
        } else if (MDT.dominates(MI2, MI1)) {
          if (!interferes(MI1, MI2)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        } else {
          auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
                                                     MI2->getParent());
          if (!MBB) {
            ++I2;
            continue;
          }

          MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
          if (!interferes(MI1, I) && !interferes(MI2, I)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1
                       << " and moving from "
                       << printMBBReference(*MI2->getParent()) << " to "
                       << printMBBReference(*I->getParent()) << " " << *MI2);
            I->getParent()->splice(I, MI2->getParent(), MI2);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        }
        ++I2;
      }
      ++I1;
    }
  }

  for (auto MI : MergedInstrs)
    MI->removeFromParent();

  if (Changed)
    MRI.clearKillFlags(Reg);

  return Changed;
}
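
// The M0 case this was built for (an illustrative pseudo-MIR sketch, not a
// real dump): two sibling blocks each initialize M0 with the same immediate,
//
//   bb.1:
//     $m0 = S_MOV_B32 -1
//     ...
//   bb.2:
//     $m0 = S_MOV_B32 -1
//     ...
//
// With no interfering clobber of M0, one init is erased when it is dominated
// by the other; otherwise the surviving init is hoisted to the first
// non-prologue instruction of the nearest common dominator block.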

bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MDT = &getAnalysis<MachineDominatorTree>();

  SmallVector<MachineInstr *, 16> Worklist;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      default:
        continue;
      case AMDGPU::COPY:
      case AMDGPU::WQM:
      case AMDGPU::WWM: {
        // If the destination register is a physical register there isn't
        // really much we can do to fix this.
        if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
          continue;

        const TargetRegisterClass *SrcRC, *DstRC;
        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
        if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
          unsigned SrcReg = MI.getOperand(1).getReg();
          if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
            TII->moveToVALU(MI, MDT);
            break;
          }

          MachineInstr *DefMI = MRI.getVRegDef(SrcReg);
          unsigned SMovOp;
          int64_t Imm;
          // If we are just copying an immediate, we can replace the copy
          // with s_mov_b32.
          if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
            MI.getOperand(1).ChangeToImmediate(Imm);
            MI.addImplicitDefUseOperands(MF);
            MI.setDesc(TII->get(SMovOp));
            break;
          }
          TII->moveToVALU(MI, MDT);
        } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
          tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
        }

        break;
      }
      case AMDGPU::PHI: {
        unsigned Reg = MI.getOperand(0).getReg();
        if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
          break;

        // We don't need to fix the PHI if the common dominator of the
        // two incoming blocks terminates with a uniform branch.
        bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
        if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
          MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
          MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();

          if (!predsHasDivergentTerminator(MBB0, TRI) &&
              !predsHasDivergentTerminator(MBB1, TRI)) {
            LLVM_DEBUG(dbgs()
                       << "Not fixing PHI for uniform branch: " << MI << '\n');
            break;
          }
        }

        // If a PHI node defines an SGPR and any of its operands are VGPRs,
        // then we need to move it to the VALU.
        //
        // Also, if a PHI node defines an SGPR and has all SGPR operands
        // we must move it to the VALU, because the SGPR operands will
        // all end up being assigned the same register, which means
        // there is a potential for a conflict if different threads take
        // different control flow paths.
        //
        // For example:
        //
        // sgpr0 = def;
        // ...
        // sgpr1 = def;
        // ...
        // sgpr2 = PHI sgpr0, sgpr1
        // use sgpr2;
        //
        // Will become:
        //
        // sgpr2 = def;
        // ...
        // sgpr2 = def;
        // ...
        // use sgpr2
        //
        // The one exception to this rule is when one of the operands
        // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
        // instruction. In this case, we know the program will never enter
        // the second block (the loop) without entering the first block
        // (where the condition is computed), so there is no chance for
        // values to be overwritten.

        SmallSet<unsigned, 8> Visited;
        if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
          LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
          TII->moveToVALU(MI, MDT);
        }

        break;
      }
      case AMDGPU::REG_SEQUENCE:
        if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
            !hasVectorOperands(MI, TRI)) {
          foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
          continue;
        }

        LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);

        TII->moveToVALU(MI, MDT);
        break;
      case AMDGPU::INSERT_SUBREG: {
        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
        DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
        Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
        Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
        if (TRI->isSGPRClass(DstRC) &&
            (TRI->hasVectorRegisters(Src0RC) ||
             TRI->hasVectorRegisters(Src1RC))) {
          LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
          TII->moveToVALU(MI, MDT);
        }
        break;
      }
      }
    }
  }

  if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
    hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT, TII);

  return true;
}