Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
 Line|  Count|Source
    1|       |//===- SIInstrInfo.cpp - SI Instruction Information  ----------------------===//
    2|       |//
    3|       |//                     The LLVM Compiler Infrastructure
    4|       |//
    5|       |// This file is distributed under the University of Illinois Open Source
    6|       |// License. See LICENSE.TXT for details.
    7|       |//
    8|       |//===----------------------------------------------------------------------===//
    9|       |//
   10|       |/// \file
   11|       |/// \brief SI Implementation of TargetInstrInfo.
   12|       |//
   13|       |//===----------------------------------------------------------------------===//
   14|       |
   15|       |#include "SIInstrInfo.h"
   16|       |#include "AMDGPU.h"
   17|       |#include "AMDGPUSubtarget.h"
   18|       |#include "GCNHazardRecognizer.h"
   19|       |#include "SIDefines.h"
   20|       |#include "SIMachineFunctionInfo.h"
   21|       |#include "SIRegisterInfo.h"
   22|       |#include "Utils/AMDGPUBaseInfo.h"
   23|       |#include "llvm/ADT/APInt.h"
   24|       |#include "llvm/ADT/ArrayRef.h"
   25|       |#include "llvm/ADT/SmallVector.h"
   26|       |#include "llvm/ADT/StringRef.h"
   27|       |#include "llvm/ADT/iterator_range.h"
   28|       |#include "llvm/Analysis/AliasAnalysis.h"
   29|       |#include "llvm/Analysis/MemoryLocation.h"
   30|       |#include "llvm/Analysis/ValueTracking.h"
   31|       |#include "llvm/CodeGen/MachineBasicBlock.h"
   32|       |#include "llvm/CodeGen/MachineFrameInfo.h"
   33|       |#include "llvm/CodeGen/MachineFunction.h"
   34|       |#include "llvm/CodeGen/MachineInstr.h"
   35|       |#include "llvm/CodeGen/MachineInstrBuilder.h"
   36|       |#include "llvm/CodeGen/MachineInstrBundle.h"
   37|       |#include "llvm/CodeGen/MachineMemOperand.h"
   38|       |#include "llvm/CodeGen/MachineOperand.h"
   39|       |#include "llvm/CodeGen/MachineRegisterInfo.h"
   40|       |#include "llvm/CodeGen/MachineValueType.h"
   41|       |#include "llvm/CodeGen/RegisterScavenging.h"
   42|       |#include "llvm/CodeGen/ScheduleDAG.h"
   43|       |#include "llvm/CodeGen/SelectionDAGNodes.h"
   44|       |#include "llvm/IR/DebugLoc.h"
   45|       |#include "llvm/IR/DiagnosticInfo.h"
   46|       |#include "llvm/IR/Function.h"
   47|       |#include "llvm/IR/InlineAsm.h"
   48|       |#include "llvm/IR/LLVMContext.h"
   49|       |#include "llvm/MC/MCInstrDesc.h"
   50|       |#include "llvm/Support/Casting.h"
   51|       |#include "llvm/Support/CommandLine.h"
   52|       |#include "llvm/Support/Compiler.h"
   53|       |#include "llvm/Support/ErrorHandling.h"
   54|       |#include "llvm/Support/MathExtras.h"
   55|       |#include "llvm/Target/TargetMachine.h"
   56|       |#include "llvm/Target/TargetOpcodes.h"
   57|       |#include "llvm/Target/TargetRegisterInfo.h"
   58|       |#include <cassert>
   59|       |#include <cstdint>
   60|       |#include <iterator>
   61|       |#include <utility>
   62|       |
   63|       |using namespace llvm;
   64|       |
   65|       |// Must be at least 4 to be able to branch over minimum unconditional branch
   66|       |// code. This is only for making it possible to write reasonably small tests for
   67|       |// long branches.
   68|       |static cl::opt<unsigned>
   69|       |BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
   70|       |                 cl::desc("Restrict range of branch instructions (DEBUG)"));
   71|       |
   72|       |SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
   73|  1.81k|  : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
   74|       |
   75|       |//===----------------------------------------------------------------------===//
   76|       |// TargetInstrInfo callbacks
   77|       |//===----------------------------------------------------------------------===//
   78|       |
   79|   458k|static unsigned getNumOperandsNoGlue(SDNode *Node) {
   80|   458k|  unsigned N = Node->getNumOperands();
   81|   511k|  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
   82|  52.3k|    --N;
   83|   458k|  return N;
   84|   458k|}
   85|       |
   86|   458k|static SDValue findChainOperand(SDNode *Load) {
   87|   458k|  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
   88|   458k|  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
   89|   458k|  return LastOp;
   90|   458k|}
   91|       |
   92|       |/// \brief Returns true if both nodes have the same value for the given
   93|       |///        operand \p Op, or if both nodes do not have this operand.
   94|   612k|static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
   95|   612k|  unsigned Opc0 = N0->getMachineOpcode();
   96|   612k|  unsigned Opc1 = N1->getMachineOpcode();
   97|   612k|
   98|   612k|  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
   99|   612k|  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
  100|   612k|
  101|   612k|  if (Op0Idx == -1 && Op1Idx == -1)
  102|  13.9k|    return true;
  103|   598k|
  104|   598k|
  105|   598k|  if ((Op0Idx == -1 && Op1Idx != -1) ||
  106|   598k|      (Op1Idx == -1 && Op0Idx != -1))
  107|     52|    return false;
  108|   598k|
  109|   598k|  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  110|   598k|  // which includes the result as the first operand. We are indexing into the
  111|   598k|  // MachineSDNode's operands, so we need to skip the result operand to get
  112|   598k|  // the real index.
  113|   598k|  --Op0Idx;
  114|   598k|  --Op1Idx;
  115|   598k|
  116|   598k|  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
  117|   598k|}
  118|       |
  119|       |bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
  120|  26.3k|                                                    AliasAnalysis *AA) const {
  121|  26.3k|  // TODO: The generic check fails for VALU instructions that should be
  122|  26.3k|  // rematerializable due to implicit reads of exec. We really want all of the
  123|  26.3k|  // generic logic for this except for this.
  124|  26.3k|  switch (MI.getOpcode()) {
  125|  6.39k|  case AMDGPU::V_MOV_B32_e32:
  126|  6.39k|  case AMDGPU::V_MOV_B32_e64:
  127|  6.39k|  case AMDGPU::V_MOV_B64_PSEUDO:
  128|  6.39k|    return true;
  129|  19.9k|  default:
  130|  19.9k|    return false;
  131|      0|  }
  132|      0|}
  133|       |
  134|       |bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
  135|       |                                          int64_t &Offset0,
  136|   410k|                                          int64_t &Offset1) const {
  137|   410k|  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
  138|  46.4k|    return false;
  139|   364k|
  140|   364k|  unsigned Opc0 = Load0->getMachineOpcode();
  141|   364k|  unsigned Opc1 = Load1->getMachineOpcode();
  142|   364k|
  143|   364k|  // Make sure both are actually loads.
  144|   364k|  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
  145|  40.2k|    return false;
  146|   324k|
  147|   324k|  if (isDS(Opc0) && isDS(Opc1)) {
  148|      0|
  149|      0|    // FIXME: Handle this case:
  150|      0|    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
  151|      0|      return false;
  152|      0|
  153|      0|    // Check base reg.
  154|      0|    if (Load0->getOperand(1) != Load1->getOperand(1))
  155|      0|      return false;
  156|      0|
  157|      0|    // Check chain.
  158|      0|    if (findChainOperand(Load0) != findChainOperand(Load1))
  159|      0|      return false;
  160|      0|
  161|      0|    // Skip read2 / write2 variants for simplicity.
  162|      0|    // TODO: We should report true if the used offsets are adjacent (excluded
  163|      0|    // st64 versions).
  164|      0|    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
  165|      0|        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
  166|      0|      return false;
  167|      0|
  168|      0|    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
  169|      0|    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
  170|      0|    return true;
  171|      0|  }
  172|   324k|
  173|   324k|  if (isSMRD(Opc0) && isSMRD(Opc1)) {
  174|  35.9k|    // Skip time and cache invalidation instructions.
  175|  35.9k|    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
  176|  35.9k|        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
  177|      4|      return false;
  178|  35.9k|
  179|  35.9k|    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
  180|  35.9k|
  181|  35.9k|    // Check base reg.
  182|  35.9k|    if (Load0->getOperand(0) != Load1->getOperand(0))
  183|  3.76k|      return false;
  184|  32.2k|
  185|  32.2k|    const ConstantSDNode *Load0Offset =
  186|  32.2k|        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
  187|  32.2k|    const ConstantSDNode *Load1Offset =
  188|  32.2k|        dyn_cast<ConstantSDNode>(Load1->getOperand(1));
  189|  32.2k|
  190|  32.2k|    if (!Load0Offset || !Load1Offset)
  191|      6|      return false;
  192|  32.2k|
  193|  32.2k|    // Check chain.
  194|  32.2k|    if (findChainOperand(Load0) != findChainOperand(Load1))
  195|      0|      return false;
  196|  32.2k|
  197|  32.2k|    Offset0 = Load0Offset->getZExtValue();
  198|  32.2k|    Offset1 = Load1Offset->getZExtValue();
  199|  32.2k|    return true;
  200|  32.2k|  }
  201|   288k|
  202|   288k|  // MUBUF and MTBUF can access the same addresses.
  203|   288k|  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
  204|   219k|
  205|   219k|    // MUBUF and MTBUF have vaddr at different indices.
  206|   219k|    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
  207|   197k|        findChainOperand(Load0) != findChainOperand(Load1) ||
  208|   197k|        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
  209|   195k|        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
  210|  25.3k|      return false;
  211|   194k|
  212|   194k|    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
  213|   194k|    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
  214|   194k|
  215|   194k|    if (OffIdx0 == -1 || OffIdx1 == -1)
  216|      0|      return false;
  217|   194k|
  218|   194k|    // getNamedOperandIdx returns the index for MachineInstrs.  Since they
  219|   194k|    // include the output in the operand list, but SDNodes don't, we need to
  220|   194k|    // subtract the index by one.
  221|   194k|    --OffIdx0;
  222|   194k|    --OffIdx1;
  223|   194k|
  224|   194k|    SDValue Off0 = Load0->getOperand(OffIdx0);
  225|   194k|    SDValue Off1 = Load1->getOperand(OffIdx1);
  226|   194k|
  227|   194k|    // The offset might be a FrameIndexSDNode.
  228|   194k|    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
  229|      0|      return false;
  230|   194k|
  231|   194k|    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
  232|   194k|    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
  233|   194k|    return true;
  234|   194k|  }
  235|  68.4k|
  236|  68.4k|  return false;
  237|  68.4k|}
  238|       |
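Note: the hook above only reports base-pointer equality plus the two constant offsets; what to do with them is up to the caller. A minimal usage sketch follows (hypothetical caller; the names LoadA, LoadB and TII, and the 4-byte adjacency, are assumptions, not taken from this report):

    // Hypothetical: pair two dword loads when they read adjacent addresses.
    int64_t Off0, Off1;
    if (TII->areLoadsFromSameBasePtr(LoadA, LoadB, Off0, Off1) &&
        Off1 - Off0 == 4) {
      // LoadB reads the dword right after LoadA; a scheduler could keep them together.
    }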
  239|  31.9k|static bool isStride64(unsigned Opc) {
  240|  31.9k|  switch (Opc) {
  241|      5|  case AMDGPU::DS_READ2ST64_B32:
  242|      5|  case AMDGPU::DS_READ2ST64_B64:
  243|      5|  case AMDGPU::DS_WRITE2ST64_B32:
  244|      5|  case AMDGPU::DS_WRITE2ST64_B64:
  245|      5|    return true;
  246|  31.9k|  default:
  247|  31.9k|    return false;
  248|      0|  }
  249|      0|}
  250|       |
  251|       |bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
  252|       |                                        int64_t &Offset,
  253|   972k|                                        const TargetRegisterInfo *TRI) const {
  254|   972k|  unsigned Opc = LdSt.getOpcode();
  255|   972k|
  256|   972k|  if (isDS(LdSt)) {
  257|  68.3k|    const MachineOperand *OffsetImm =
  258|  68.3k|        getNamedOperand(LdSt, AMDGPU::OpName::offset);
  259|  68.3k|    if (OffsetImm) {
  260|  28.0k|      // Normal, single offset LDS instruction.
  261|  28.0k|      const MachineOperand *AddrReg =
  262|  28.0k|          getNamedOperand(LdSt, AMDGPU::OpName::addr);
  263|  28.0k|
  264|  28.0k|      BaseReg = AddrReg->getReg();
  265|  28.0k|      Offset = OffsetImm->getImm();
  266|  28.0k|      return true;
  267|  28.0k|    }
  268|  40.2k|
  269|  40.2k|    // The 2 offset instructions use offset0 and offset1 instead. We can treat
  270|  40.2k|    // these as a load with a single offset if the 2 offsets are consecutive. We
  271|  40.2k|    // will use this for some partially aligned loads.
  272|  40.2k|    const MachineOperand *Offset0Imm =
  273|  40.2k|        getNamedOperand(LdSt, AMDGPU::OpName::offset0);
  274|  40.2k|    const MachineOperand *Offset1Imm =
  275|  40.2k|        getNamedOperand(LdSt, AMDGPU::OpName::offset1);
  276|  40.2k|
  277|  40.2k|    uint8_t Offset0 = Offset0Imm->getImm();
  278|  40.2k|    uint8_t Offset1 = Offset1Imm->getImm();
  279|  40.2k|
  280|  40.2k|    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
  281|  31.9k|      // Each of these offsets is in element sized units, so we need to convert
  282|  31.9k|      // to bytes of the individual reads.
  283|  31.9k|
  284|  31.9k|      unsigned EltSize;
  285|  31.9k|      if (LdSt.mayLoad())
  286|  7.93k|        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
  287|  24.0k|      else {
  288|  24.0k|        assert(LdSt.mayStore());
  289|  24.0k|        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
  290|  24.0k|        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
  291|  24.0k|      }
  292|  31.9k|
  293|  31.9k|      if (isStride64(Opc))
  294|      5|        EltSize *= 64;
  295|  31.9k|
  296|  31.9k|      const MachineOperand *AddrReg =
  297|  31.9k|          getNamedOperand(LdSt, AMDGPU::OpName::addr);
  298|  31.9k|      BaseReg = AddrReg->getReg();
  299|  31.9k|      Offset = EltSize * Offset0;
  300|  31.9k|      return true;
  301|  31.9k|    }
  302|  8.30k|
  303|  8.30k|    return false;
  304|  8.30k|  }
  305|   904k|
  306|   904k|  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
  307|   808k|    const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
  308|   808k|    if (SOffset && SOffset->isReg())
  309|   715k|      return false;
  310|  93.2k|
  311|  93.2k|    const MachineOperand *AddrReg =
  312|  93.2k|        getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
  313|  93.2k|    if (!AddrReg)
  314|  88.3k|      return false;
  315|  4.94k|
  316|  4.94k|    const MachineOperand *OffsetImm =
  317|  4.94k|        getNamedOperand(LdSt, AMDGPU::OpName::offset);
  318|  4.94k|    BaseReg = AddrReg->getReg();
  319|  4.94k|    Offset = OffsetImm->getImm();
  320|  4.94k|
  321|  4.94k|    if (SOffset) // soffset can be an inline immediate.
  322|  4.94k|      Offset += SOffset->getImm();
  323|   808k|
  324|   808k|    return true;
  325|   808k|  }
  326|  95.1k|
  327|  95.1k|  if (isSMRD(LdSt)) {
  328|  29.7k|    const MachineOperand *OffsetImm =
  329|  29.7k|        getNamedOperand(LdSt, AMDGPU::OpName::offset);
  330|  29.7k|    if (!OffsetImm)
  331|     59|      return false;
  332|  29.7k|
  333|  29.7k|    const MachineOperand *SBaseReg =
  334|  29.7k|        getNamedOperand(LdSt, AMDGPU::OpName::sbase);
  335|  29.7k|    BaseReg = SBaseReg->getReg();
  336|  29.7k|    Offset = OffsetImm->getImm();
  337|  29.7k|    return true;
  338|  29.7k|  }
  339|  65.4k|
  340|  65.4k|  if (isFLAT(LdSt)) {
  341|  63.9k|    const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
  342|  63.9k|    if (VAddr) {
  343|  63.9k|      // Can't analyze 2 offsets.
  344|  63.9k|      if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
  345|      0|        return false;
  346|  63.9k|
  347|  63.9k|      BaseReg = VAddr->getReg();
  348|  63.9k|    } else {
  349|      0|      // scratch instructions have either vaddr or saddr.
  350|      0|      BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg();
  351|      0|    }
  352|  63.9k|
  353|  63.9k|    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
  354|  63.9k|    return true;
  355|  1.51k|  }
  356|  1.51k|
  357|  1.51k|  return false;
  358|  1.51k|}
  359|       |
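Illustrative arithmetic for the two-offset DS path above (values assumed, not taken from this report): a read2 whose 64-bit result pair gives EltSize = 64 / 16 = 4, with offset0 = 4 and offset1 = 5, counts as consecutive, so the reported byte offset is EltSize * Offset0 = 16.

    // Sketch of the computation performed above for such a case:
    unsigned EltSize = 64 / 16;                                      // 4 bytes per element
    unsigned Offset0 = 4, Offset1 = 5;
    bool Consecutive = Offset1 > Offset0 && Offset1 - Offset0 == 1;  // true
    int64_t Offset = EltSize * Offset0;                              // 16 bytes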
  360|       |static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
  361|  25.5k|                                  const MachineInstr &MI2, unsigned BaseReg2) {
  362|  25.5k|  if (BaseReg1 == BaseReg2)
  363|  17.6k|    return true;
  364|  7.88k|
  365|  7.88k|  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
  366|    146|    return false;
  367|  7.74k|
  368|  7.74k|  auto MO1 = *MI1.memoperands_begin();
  369|  7.74k|  auto MO2 = *MI2.memoperands_begin();
  370|  7.74k|  if (MO1->getAddrSpace() != MO2->getAddrSpace())
  371|  4.87k|    return false;
  372|  2.86k|
  373|  2.86k|  auto Base1 = MO1->getValue();
  374|  2.86k|  auto Base2 = MO2->getValue();
  375|  2.86k|  if (!Base1 || !Base2)
  376|     54|    return false;
  377|  2.81k|  const MachineFunction &MF = *MI1.getParent()->getParent();
  378|  2.81k|  const DataLayout &DL = MF.getFunction()->getParent()->getDataLayout();
  379|  2.81k|  Base1 = GetUnderlyingObject(Base1, DL);
  380|  2.81k|  Base2 = GetUnderlyingObject(Base2, DL);
  381|  2.81k|
  382|  2.81k|  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
  383|    750|    return false;
  384|  2.06k|
  385|  2.06k|  return Base1 == Base2;
  386|  2.06k|}
  387|       |
bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
389
                                      unsigned BaseReg1,
390
                                      MachineInstr &SecondLdSt,
391
                                      unsigned BaseReg2,
392
25.5k
                                      unsigned NumLoads) const {
393
25.5k
  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
394
5.82k
    return false;
395
19.7k
396
19.7k
  const MachineOperand *FirstDst = nullptr;
397
19.7k
  const MachineOperand *SecondDst = nullptr;
398
19.7k
399
19.7k
  if (
(isMUBUF(FirstLdSt) && 19.7k
isMUBUF(SecondLdSt)177
) ||
400
19.5k
      
(isMTBUF(FirstLdSt) && 19.5k
isMTBUF(SecondLdSt)0
) ||
401
19.7k
      
(isFLAT(FirstLdSt) && 19.5k
isFLAT(SecondLdSt)1.68k
)) {
402
1.85k
    const unsigned MaxGlobalLoadCluster = 6;
403
1.85k
    if (NumLoads > MaxGlobalLoadCluster)
404
0
      return false;
405
1.85k
406
1.85k
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
407
1.85k
    if (!FirstDst)
408
617
      FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
409
1.85k
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
410
1.85k
    if (!SecondDst)
411
617
      SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
412
19.7k
  } else 
if (17.8k
isSMRD(FirstLdSt) && 17.8k
isSMRD(SecondLdSt)16.0k
) {
413
16.0k
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
414
16.0k
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
415
17.8k
  } else 
if (1.77k
isDS(FirstLdSt) && 1.77k
isDS(SecondLdSt)1.77k
) {
416
1.77k
    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
417
1.77k
    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
418
1.77k
  }
419
19.7k
420
19.7k
  
if (19.7k
!FirstDst || 19.7k
!SecondDst19.2k
)
421
462
    return false;
422
19.2k
423
19.2k
  // Try to limit clustering based on the total number of bytes loaded
424
19.2k
  // rather than the number of instructions.  This is done to help reduce
425
19.2k
  // register pressure.  The method used is somewhat inexact, though,
426
19.2k
  // because it assumes that all loads in the cluster will load the
427
19.2k
  // same number of bytes as FirstLdSt.
428
19.2k
429
19.2k
  // The unit of this value is bytes.
430
19.2k
  // FIXME: This needs finer tuning.
431
19.2k
  unsigned LoadClusterThreshold = 16;
432
19.2k
433
19.2k
  const MachineRegisterInfo &MRI =
434
19.2k
      FirstLdSt.getParent()->getParent()->getRegInfo();
435
19.2k
  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
436
19.2k
437
19.2k
  return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
438
19.2k
}
439
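A worked example of the byte threshold above (numbers chosen for illustration): four clustered loads whose destination register class is 32 bits wide account for 4 * (32 / 8) = 16 bytes, exactly LoadClusterThreshold, so clustering is still allowed; a fifth such load would push the total to 20 bytes and stop it.

    // Same check as the return statement above, with assumed values:
    unsigned NumLoads = 4, DstSizeInBits = 32, LoadClusterThreshold = 16;
    bool Cluster = NumLoads * (DstSizeInBits / 8) <= LoadClusterThreshold;  // 16 <= 16 -> true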
  440|       |static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
  441|       |                              MachineBasicBlock::iterator MI,
  442|       |                              const DebugLoc &DL, unsigned DestReg,
  443|     10|                              unsigned SrcReg, bool KillSrc) {
  444|     10|  MachineFunction *MF = MBB.getParent();
  445|     10|  DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(),
  446|     10|                                        "illegal SGPR to VGPR copy",
  447|     10|                                        DL, DS_Error);
  448|     10|  LLVMContext &C = MF->getFunction()->getContext();
  449|     10|  C.diagnose(IllegalCopy);
  450|     10|
  451|     10|  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
  452|     10|    .addReg(SrcReg, getKillRegState(KillSrc));
  453|     10|}
  454|       |
  455|       |void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
  456|       |                              MachineBasicBlock::iterator MI,
  457|       |                              const DebugLoc &DL, unsigned DestReg,
  458|  39.9k|                              unsigned SrcReg, bool KillSrc) const {
  459|  39.9k|  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
  460|  39.9k|
  461|  39.9k|  if (RC == &AMDGPU::VGPR_32RegClass) {
  462|  23.8k|    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
  463|  23.8k|           AMDGPU::SReg_32RegClass.contains(SrcReg));
  464|  23.8k|    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
  465|  23.8k|      .addReg(SrcReg, getKillRegState(KillSrc));
  466|  23.8k|    return;
  467|  23.8k|  }
  468|  16.1k|
  469|  16.1k|  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
  470|  16.1k|      RC == &AMDGPU::SReg_32RegClass) {
  471|  10.4k|    if (SrcReg == AMDGPU::SCC) {
  472|      0|      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
  473|      0|          .addImm(-1)
  474|      0|          .addImm(0);
  475|      0|      return;
  476|      0|    }
  477|  10.4k|
  478|  10.4k|    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
  479|      2|      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
  480|      2|      return;
  481|      2|    }
  482|  10.4k|
  483|  10.4k|    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
  484|  10.4k|            .addReg(SrcReg, getKillRegState(KillSrc));
  485|  10.4k|    return;
  486|  10.4k|  }
  487|  5.73k|
  488|  5.73k|  if (RC == &AMDGPU::SReg_64RegClass) {
  489|  1.46k|    if (DestReg == AMDGPU::VCC) {
  490|     23|      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
  491|     15|        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
  492|     15|          .addReg(SrcReg, getKillRegState(KillSrc));
  493|     23|      } else {
  494|      8|        // FIXME: Hack until VReg_1 removed.
  495|      8|        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
  496|      8|        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
  497|      8|          .addImm(0)
  498|      8|          .addReg(SrcReg, getKillRegState(KillSrc));
  499|      8|      }
  500|     23|
  501|     23|      return;
  502|     23|    }
  503|  1.44k|
  504|  1.44k|    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
  505|      2|      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
  506|      2|      return;
  507|      2|    }
  508|  1.43k|
  509|  1.43k|    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
  510|  1.43k|            .addReg(SrcReg, getKillRegState(KillSrc));
  511|  1.43k|    return;
  512|  1.43k|  }
  513|  4.27k|
  514|  4.27k|  if (DestReg == AMDGPU::SCC) {
  515|      0|    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
  516|      0|    BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
  517|      0|      .addReg(SrcReg, getKillRegState(KillSrc))
  518|      0|      .addImm(0);
  519|      0|    return;
  520|      0|  }
  521|  4.27k|
  522|  4.27k|  unsigned EltSize = 4;
  523|  4.27k|  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  524|  4.27k|  if (RI.isSGPRClass(RC)) {
  525|    142|    if (RI.getRegSizeInBits(*RC) > 32) {
  526|    142|      Opcode =  AMDGPU::S_MOV_B64;
  527|    142|      EltSize = 8;
  528|    142|    } else {
  529|      0|      Opcode = AMDGPU::S_MOV_B32;
  530|      0|      EltSize = 4;
  531|      0|    }
  532|    142|
  533|    142|    if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
  534|      6|      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
  535|      6|      return;
  536|      6|    }
  537|  4.26k|  }
  538|  4.26k|
  539|  4.26k|  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
  540|  4.26k|  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
  541|  4.26k|
  542|  13.1k|  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
  543|  8.90k|    unsigned SubIdx;
  544|  8.90k|    if (Forward)
  545|  6.88k|      SubIdx = SubIndices[Idx];
  546|  8.90k|    else
  547|  2.02k|      SubIdx = SubIndices[SubIndices.size() - Idx - 1];
  548|  8.90k|
  549|  8.90k|    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
  550|  8.90k|      get(Opcode), RI.getSubReg(DestReg, SubIdx));
  551|  8.90k|
  552|  8.90k|    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
  553|  8.90k|
  554|  8.90k|    if (Idx == 0)
  555|  4.26k|      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
  556|  8.90k|
  557|  7.28k|    bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
  558|  8.90k|    Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
  559|  8.90k|  }
  560|  39.9k|}
  561|       |
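For register classes wider than a single mov, the loop above splits the copy into one move per sub-register. Roughly (schematic only, based on the loop logic above; no concrete registers are taken from this report):

    // A 64-bit VGPR-to-VGPR copy expands to two V_MOV_B32_e32 instructions:
    //   first move:  writes sub0 and carries an implicit def of the whole destination pair,
    //   last move:   writes sub1 and carries the implicit (possibly killed) use of the source pair,
    // which keeps liveness of the wide registers correct across the sequence.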
  562|   242k|int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
  563|   242k|  int NewOpc;
  564|   242k|
  565|   242k|  // Try to map original to commuted opcode
  566|   242k|  NewOpc = AMDGPU::getCommuteRev(Opcode);
  567|   242k|  if (NewOpc != -1)
  568|   242k|    // Check if the commuted (REV) opcode exists on the target.
  569|  22.9k|    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
  570|   219k|
  571|   219k|  // Try to map commuted to original opcode
  572|   219k|  NewOpc = AMDGPU::getCommuteOrig(Opcode);
  573|   219k|  if (NewOpc != -1)
  574|   219k|    // Check if the original (non-REV) opcode exists on the target.
  575|  39.3k|    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
  576|   180k|
  577|   180k|  return Opcode;
  578|   180k|}
  579|       |
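Illustrative note: getCommuteRev / getCommuteOrig translate between an opcode and its operand-swapped "REV" counterpart, and pseudoToMCOpcode then rejects counterparts that do not exist on the current subtarget. For instance (example opcode assumed, not taken from this report), commuting a VOP2 subtract would typically look like:

    // Hypothetical use: returns the REV form, or -1 if the opcode cannot be commuted.
    int NewOpc = TII->commuteOpcode(AMDGPU::V_SUB_F32_e32);  // e.g. V_SUBREV_F32_e32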
  580|       |void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
  581|       |                                       MachineBasicBlock::iterator MI,
  582|       |                                       const DebugLoc &DL, unsigned DestReg,
  583|      0|                                       int64_t Value) const {
  584|      0|  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  585|      0|  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
  586|      0|  if (RegClass == &AMDGPU::SReg_32RegClass ||
  587|      0|      RegClass == &AMDGPU::SGPR_32RegClass ||
  588|      0|      RegClass == &AMDGPU::SReg_32_XM0RegClass ||
  589|      0|      RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
  590|      0|    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
  591|      0|      .addImm(Value);
  592|      0|    return;
  593|      0|  }
  594|      0|
  595|      0|  if (RegClass == &AMDGPU::SReg_64RegClass ||
  596|      0|      RegClass == &AMDGPU::SGPR_64RegClass ||
  597|      0|      RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
  598|      0|    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
  599|      0|      .addImm(Value);
  600|      0|    return;
  601|      0|  }
  602|      0|
  603|      0|  if (RegClass == &AMDGPU::VGPR_32RegClass) {
  604|      0|    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
  605|      0|      .addImm(Value);
  606|      0|    return;
  607|      0|  }
  608|      0|  if (RegClass == &AMDGPU::VReg_64RegClass) {
  609|      0|    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
  610|      0|      .addImm(Value);
  611|      0|    return;
  612|      0|  }
  613|      0|
  614|      0|  unsigned EltSize = 4;
  615|      0|  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  616|      0|  if (RI.isSGPRClass(RegClass)) {
  617|      0|    if (RI.getRegSizeInBits(*RegClass) > 32) {
  618|      0|      Opcode =  AMDGPU::S_MOV_B64;
  619|      0|      EltSize = 8;
  620|      0|    } else {
  621|      0|      Opcode = AMDGPU::S_MOV_B32;
  622|      0|      EltSize = 4;
  623|      0|    }
  624|      0|  }
  625|      0|
  626|      0|  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
  627|      0|  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
  628|      0|    int64_t IdxValue = Idx == 0 ? Value : 0;
  629|      0|
  630|      0|    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
  631|      0|      get(Opcode), RI.getSubReg(DestReg, Idx));
  632|      0|    Builder.addImm(IdxValue);
  633|      0|  }
  634|      0|}
  635|       |
  636|       |const TargetRegisterClass *
  637|      0|SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
  638|      0|  return &AMDGPU::VGPR_32RegClass;
  639|      0|}
  640|       |
  641|       |void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
  642|       |                                     MachineBasicBlock::iterator I,
  643|       |                                     const DebugLoc &DL, unsigned DstReg,
  644|       |                                     ArrayRef<MachineOperand> Cond,
  645|       |                                     unsigned TrueReg,
  646|      0|                                     unsigned FalseReg) const {
  647|      0|  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  648|      0|  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
  649|      0|         "Not a VGPR32 reg");
  650|      0|
  651|      0|  if (Cond.size() == 1) {
  652|      0|    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
  653|      0|      .addReg(FalseReg)
  654|      0|      .addReg(TrueReg)
  655|      0|      .add(Cond[0]);
  656|      0|  } else if (Cond.size() == 2) {
  657|      0|    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
  658|      0|    switch (Cond[0].getImm()) {
  659|      0|    case SIInstrInfo::SCC_TRUE: {
  660|      0|      unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  661|      0|      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
  662|      0|        .addImm(-1)
  663|      0|        .addImm(0);
  664|      0|      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
  665|      0|        .addReg(FalseReg)
  666|      0|        .addReg(TrueReg)
  667|      0|        .addReg(SReg);
  668|      0|      break;
  669|      0|    }
  670|      0|    case SIInstrInfo::SCC_FALSE: {
  671|      0|      unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  672|      0|      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
  673|      0|        .addImm(0)
  674|      0|        .addImm(-1);
  675|      0|      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
  676|      0|        .addReg(FalseReg)
  677|      0|        .addReg(TrueReg)
  678|      0|        .addReg(SReg);
  679|      0|      break;
  680|      0|    }
  681|      0|    case SIInstrInfo::VCCNZ: {
  682|      0|      MachineOperand RegOp = Cond[1];
  683|      0|      RegOp.setImplicit(false);
  684|      0|      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
  685|      0|          .addReg(FalseReg)
  686|      0|          .addReg(TrueReg)
  687|      0|          .add(RegOp);
  688|      0|      break;
  689|      0|    }
  690|      0|    case SIInstrInfo::VCCZ: {
  691|      0|      MachineOperand RegOp = Cond[1];
  692|      0|      RegOp.setImplicit(false);
  693|      0|      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
  694|      0|          .addReg(TrueReg)
  695|      0|          .addReg(FalseReg)
  696|      0|          .add(RegOp);
  697|      0|      break;
  698|      0|    }
  699|      0|    case SIInstrInfo::EXECNZ: {
  700|      0|      unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  701|      0|      unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  702|      0|      BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
  703|      0|        .addImm(0);
  704|      0|      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
  705|      0|        .addImm(-1)
  706|      0|        .addImm(0);
  707|      0|      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
  708|      0|        .addReg(FalseReg)
  709|      0|        .addReg(TrueReg)
  710|      0|        .addReg(SReg);
  711|      0|      break;
  712|      0|    }
  713|      0|    case SIInstrInfo::EXECZ: {
  714|      0|      unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  715|      0|      unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  716|      0|      BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
  717|      0|        .addImm(0);
  718|      0|      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
  719|      0|        .addImm(0)
  720|      0|        .addImm(-1);
  721|      0|      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
  722|      0|        .addReg(FalseReg)
  723|      0|        .addReg(TrueReg)
  724|      0|        .addReg(SReg);
  725|      0|      llvm_unreachable("Unhandled branch predicate EXECZ");
  726|      0|      break;
  727|      0|    }
  728|      0|    default:
  729|      0|      llvm_unreachable("invalid branch predicate");
  730|      0|    }
  731|      0|  } else {
  732|      0|    llvm_unreachable("Can only handle Cond size 1 or 2");
  733|      0|  }
  734|      0|}
  735|       |
  736|       |unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
  737|       |                               MachineBasicBlock::iterator I,
  738|       |                               const DebugLoc &DL,
  739|      0|                               unsigned SrcReg, int Value) const {
  740|      0|  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  741|      0|  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  742|      0|  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
  743|      0|    .addImm(Value)
  744|      0|    .addReg(SrcReg);
  745|      0|
  746|      0|  return Reg;
  747|      0|}
  748|       |
  749|       |unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
  750|       |                               MachineBasicBlock::iterator I,
  751|       |                               const DebugLoc &DL,
  752|      0|                               unsigned SrcReg, int Value) const {
  753|      0|  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  754|      0|  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  755|      0|  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
  756|      0|    .addImm(Value)
  757|      0|    .addReg(SrcReg);
  758|      0|
  759|      0|  return Reg;
  760|      0|}
  761|       |
  762|  8.58k|unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
  763|  8.58k|
  764|  8.58k|  if (RI.getRegSizeInBits(*DstRC) == 32) {
  765|  8.29k|    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  766|    294|  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
  767|      7|    return AMDGPU::S_MOV_B64;
  768|    287|  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
  769|    287|    return  AMDGPU::V_MOV_B64_PSEUDO;
  770|    287|  }
  771|      0|  return AMDGPU::COPY;
  772|      0|}
  773|       |
  774|    600|static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  775|    600|  switch (Size) {
  776|    458|  case 4:
  777|    458|    return AMDGPU::SI_SPILL_S32_SAVE;
  778|     85|  case 8:
  779|     85|    return AMDGPU::SI_SPILL_S64_SAVE;
  780|     22|  case 16:
  781|     22|    return AMDGPU::SI_SPILL_S128_SAVE;
  782|     27|  case 32:
  783|     27|    return AMDGPU::SI_SPILL_S256_SAVE;
  784|      8|  case 64:
  785|      8|    return AMDGPU::SI_SPILL_S512_SAVE;
  786|      0|  default:
  787|      0|    llvm_unreachable("unknown register size");
  788|      0|  }
  789|      0|}
  790|       |
  791|  1.08k|static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
  792|  1.08k|  switch (Size) {
  793|    419|  case 4:
  794|    419|    return AMDGPU::SI_SPILL_V32_SAVE;
  795|     13|  case 8:
  796|     13|    return AMDGPU::SI_SPILL_V64_SAVE;
  797|      0|  case 12:
  798|      0|    return AMDGPU::SI_SPILL_V96_SAVE;
  799|    657|  case 16:
  800|    657|    return AMDGPU::SI_SPILL_V128_SAVE;
  801|      0|  case 32:
  802|      0|    return AMDGPU::SI_SPILL_V256_SAVE;
  803|      0|  case 64:
  804|      0|    return AMDGPU::SI_SPILL_V512_SAVE;
  805|      0|  default:
  806|      0|    llvm_unreachable("unknown register size");
  807|      0|  }
  808|      0|}
  809|       |
  810|       |void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
  811|       |                                      MachineBasicBlock::iterator MI,
  812|       |                                      unsigned SrcReg, bool isKill,
  813|       |                                      int FrameIndex,
  814|       |                                      const TargetRegisterClass *RC,
  815|  1.68k|                                      const TargetRegisterInfo *TRI) const {
  816|  1.68k|  MachineFunction *MF = MBB.getParent();
  817|  1.68k|  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  818|  1.68k|  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  819|  1.68k|  DebugLoc DL = MBB.findDebugLoc(MI);
  820|  1.68k|
  821|  1.68k|  assert(SrcReg != MFI->getStackPtrOffsetReg() &&
  822|  1.68k|         SrcReg != MFI->getFrameOffsetReg() &&
  823|  1.68k|         SrcReg != MFI->getScratchWaveOffsetReg());
  824|  1.68k|
  825|  1.68k|  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
  826|  1.68k|  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
  827|  1.68k|  MachinePointerInfo PtrInfo
  828|  1.68k|    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  829|  1.68k|  MachineMemOperand *MMO
  830|  1.68k|    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
  831|  1.68k|                               Size, Align);
  832|  1.68k|  unsigned SpillSize = TRI->getSpillSize(*RC);
  833|  1.68k|
  834|  1.68k|  if (RI.isSGPRClass(RC)) {
  835|    600|    MFI->setHasSpilledSGPRs();
  836|    600|
  837|    600|    // We are only allowed to create one new instruction when spilling
  838|    600|    // registers, so we need to use pseudo instruction for spilling SGPRs.
  839|    600|    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
  840|    600|
  841|    600|    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
  842|    600|    // need to make sure we are using the correct register class.
  843|    600|    if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
  844|    125|      MachineRegisterInfo &MRI = MF->getRegInfo();
  845|    125|      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
  846|    125|    }
  847|    600|
  848|    600|    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
  849|    600|      .addReg(SrcReg, getKillRegState(isKill)) // data
  850|    600|      .addFrameIndex(FrameIndex)               // addr
  851|    600|      .addMemOperand(MMO)
  852|    600|      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
  853|    600|      .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
  854|    600|    // Add the scratch resource registers as implicit uses because we may end up
  855|    600|    // needing them, and need to ensure that the reserved registers are
  856|    600|    // correctly handled.
  857|    600|
  858|    600|    FrameInfo.setStackID(FrameIndex, 1);
  859|    600|    if (ST.hasScalarStores()) {
  860|    300|      // m0 is used for offset to scalar stores if used to spill.
  861|    300|      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
  862|    300|    }
  863|    600|
  864|    600|    return;
  865|    600|  }
  866|  1.08k|
  867|  1.08k|  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
  868|      0|    LLVMContext &Ctx = MF->getFunction()->getContext();
  869|      0|    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
  870|      0|                  " spill register");
  871|      0|    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
  872|      0|      .addReg(SrcReg);
  873|      0|
  874|      0|    return;
  875|      0|  }
  876|  1.08k|
  877|  1.08k|  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
  878|  1.08k|
  879|  1.08k|  unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
  880|  1.08k|  MFI->setHasSpilledVGPRs();
  881|  1.08k|  BuildMI(MBB, MI, DL, get(Opcode))
  882|  1.08k|    .addReg(SrcReg, getKillRegState(isKill)) // data
  883|  1.08k|    .addFrameIndex(FrameIndex)               // addr
  884|  1.08k|    .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
  885|  1.08k|    .addReg(MFI->getFrameOffsetReg())        // scratch_offset
  886|  1.08k|    .addImm(0)                               // offset
  887|  1.08k|    .addMemOperand(MMO);
  888|  1.08k|}
  889|       |
587
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
891
587
  switch (Size) {
892
452
  case 4:
893
452
    return AMDGPU::SI_SPILL_S32_RESTORE;
894
79
  case 8:
895
79
    return AMDGPU::SI_SPILL_S64_RESTORE;
896
21
  case 16:
897
21
    return AMDGPU::SI_SPILL_S128_RESTORE;
898
27
  case 32:
899
27
    return AMDGPU::SI_SPILL_S256_RESTORE;
900
8
  case 64:
901
8
    return AMDGPU::SI_SPILL_S512_RESTORE;
902
0
  default:
903
0
    llvm_unreachable("unknown register size");
904
0
  }
905
0
}
906
907
1.00k
static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
908
1.00k
  switch (Size) {
909
328
  case 4:
910
328
    return AMDGPU::SI_SPILL_V32_RESTORE;
911
13
  case 8:
912
13
    return AMDGPU::SI_SPILL_V64_RESTORE;
913
0
  case 12:
914
0
    return AMDGPU::SI_SPILL_V96_RESTORE;
915
661
  case 16:
916
661
    return AMDGPU::SI_SPILL_V128_RESTORE;
917
0
  case 32:
918
0
    return AMDGPU::SI_SPILL_V256_RESTORE;
919
0
  case 64:
920
0
    return AMDGPU::SI_SPILL_V512_RESTORE;
921
0
  default:
922
0
    llvm_unreachable("unknown register size");
923
0
  }
924
0
}
925
926
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
927
                                       MachineBasicBlock::iterator MI,
928
                                       unsigned DestReg, int FrameIndex,
929
                                       const TargetRegisterClass *RC,
930
1.58k
                                       const TargetRegisterInfo *TRI) const {
931
1.58k
  MachineFunction *MF = MBB.getParent();
932
1.58k
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
933
1.58k
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
934
1.58k
  DebugLoc DL = MBB.findDebugLoc(MI);
935
1.58k
  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
936
1.58k
  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
937
1.58k
  unsigned SpillSize = TRI->getSpillSize(*RC);
938
1.58k
939
1.58k
  MachinePointerInfo PtrInfo
940
1.58k
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
941
1.58k
942
1.58k
  MachineMemOperand *MMO = MF->getMachineMemOperand(
943
1.58k
    PtrInfo, MachineMemOperand::MOLoad, Size, Align);
944
1.58k
945
1.58k
  if (
RI.isSGPRClass(RC)1.58k
) {
946
587
    // FIXME: Maybe this should not include a memoperand because it will be
947
587
    // lowered to non-memory instructions.
948
587
    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
949
587
    if (
TargetRegisterInfo::isVirtualRegister(DestReg) && 587
SpillSize == 4154
) {
950
126
      MachineRegisterInfo &MRI = MF->getRegInfo();
951
126
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
952
126
    }
953
587
954
587
    FrameInfo.setStackID(FrameIndex, 1);
955
587
    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
956
587
      .addFrameIndex(FrameIndex) // addr
957
587
      .addMemOperand(MMO)
958
587
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
959
587
      .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
960
587
961
587
    if (
ST.hasScalarStores()587
) {
962
295
      // m0 is used for offset to scalar stores if used to spill.
963
295
      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
964
295
    }
965
587
966
587
    return;
967
587
  }
968
1.00k
969
1.00k
  
if (1.00k
!ST.isVGPRSpillingEnabled(*MF->getFunction())1.00k
) {
970
0
    LLVMContext &Ctx = MF->getFunction()->getContext();
971
0
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
972
0
                  " restore register");
973
0
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
974
0
975
0
    return;
976
0
  }
977
1.00k
978
1.00k
  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
979
1.00k
980
1.00k
  unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
981
1.00k
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
982
1.00k
    .addFrameIndex(FrameIndex)        // vaddr
983
1.00k
    .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
984
1.00k
    .addReg(MFI->getFrameOffsetReg()) // scratch_offset
985
1.00k
    .addImm(0)                        // offset
986
1.00k
    .addMemOperand(MMO);
987
1.00k
}
988
  989|       |/// \param @Offset Offset in bytes of the FrameIndex being spilled
  990|       |unsigned SIInstrInfo::calculateLDSSpillAddress(
  991|       |    MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
  992|      0|    unsigned FrameOffset, unsigned Size) const {
  993|      0|  MachineFunction *MF = MBB.getParent();
  994|      0|  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  995|      0|  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  996|      0|  DebugLoc DL = MBB.findDebugLoc(MI);
  997|      0|  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
  998|      0|  unsigned WavefrontSize = ST.getWavefrontSize();
  999|      0|
 1000|      0|  unsigned TIDReg = MFI->getTIDReg();
 1001|      0|  if (!MFI->hasCalculatedTID()) {
 1002|      0|    MachineBasicBlock &Entry = MBB.getParent()->front();
 1003|      0|    MachineBasicBlock::iterator Insert = Entry.front();
 1004|      0|    DebugLoc DL = Insert->getDebugLoc();
 1005|      0|
 1006|      0|    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
 1007|      0|                                   *MF);
 1008|      0|    if (TIDReg == AMDGPU::NoRegister)
 1009|      0|      return TIDReg;
 1010|      0|
 1011|      0|    if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
 1012|      0|        WorkGroupSize > WavefrontSize) {
 1013|      0|      unsigned TIDIGXReg
 1014|      0|        = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
 1015|      0|      unsigned TIDIGYReg
 1016|      0|        = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
 1017|      0|      unsigned TIDIGZReg
 1018|      0|        = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
 1019|      0|      unsigned InputPtrReg =
 1020|      0|          MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
 1021|      0|      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
 1022|      0|        if (!Entry.isLiveIn(Reg))
 1023|      0|          Entry.addLiveIn(Reg);
 1024|      0|      }
 1025|      0|
 1026|      0|      RS->enterBasicBlock(Entry);
 1027|      0|      // FIXME: Can we scavenge an SReg_64 and access the subregs?
 1028|      0|      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
 1029|      0|      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
 1030|      0|      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
 1031|      0|              .addReg(InputPtrReg)
 1032|      0|              .addImm(SI::KernelInputOffsets::NGROUPS_Z);
 1033|      0|      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
 1034|      0|              .addReg(InputPtrReg)
 1035|      0|              .addImm(SI::KernelInputOffsets::NGROUPS_Y);
 1036|      0|
 1037|      0|      // NGROUPS.X * NGROUPS.Y
 1038|      0|      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
 1039|      0|              .addReg(STmp1)
 1040|      0|              .addReg(STmp0);
 1041|      0|      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
 1042|      0|      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
 1043|      0|              .addReg(STmp1)
 1044|      0|              .addReg(TIDIGXReg);
 1045|      0|      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
 1046|      0|      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
 1047|      0|              .addReg(STmp0)
 1048|      0|              .addReg(TIDIGYReg)
 1049|      0|              .addReg(TIDReg);
 1050|      0|      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
 1051|      0|      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
 1052|      0|              .addReg(TIDReg)
 1053|      0|              .addReg(TIDIGZReg);
 1054|      0|    } else {
 1055|      0|      // Get the wave id
 1056|      0|      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
 1057|      0|              TIDReg)
 1058|      0|              .addImm(-1)
 1059|      0|              .addImm(0);
 1060|      0|
 1061|      0|      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
 1062|      0|              TIDReg)
 1063|      0|              .addImm(-1)
 1064|      0|              .addReg(TIDReg);
 1065|      0|    }
 1066|      0|
 1067|      0|    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
 1068|      0|            TIDReg)
 1069|      0|            .addImm(2)
 1070|      0|            .addReg(TIDReg);
 1071|      0|    MFI->setTIDReg(TIDReg);
 1072|      0|  }
 1073|      0|
 1074|      0|  // Add FrameIndex to LDS offset
 1075|      0|  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
 1076|      0|  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
 1077|      0|          .addImm(LDSOffset)
 1078|      0|          .addReg(TIDReg);
 1079|      0|
 1080|      0|  return TmpReg;
 1081|      0|}
 1082|       |
 1083|       |void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
 1084|       |                                   MachineBasicBlock::iterator MI,
 1085|  1.54k|                                   int Count) const {
 1086|  1.54k|  DebugLoc DL = MBB.findDebugLoc(MI);
 1087|  3.09k|  while (Count > 0) {
 1088|  1.54k|    int Arg;
 1089|  1.54k|    if (Count >= 8)
 1090|      0|      Arg = 7;
 1091|  1.54k|    else
 1092|  1.54k|      Arg = Count - 1;
 1093|  1.54k|    Count -= 8;
 1094|  1.54k|    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
 1095|  1.54k|            .addImm(Arg);
 1096|  1.54k|  }
 1097|  1.54k|}
 1098|       |
 1099|       |void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
 1100|  1.54k|                             MachineBasicBlock::iterator MI) const {
 1101|  1.54k|  insertWaitStates(MBB, MI, 1);
 1102|  1.54k|}
 1103|       |
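Worked example for insertWaitStates above (count chosen for illustration): asking for 10 wait states emits S_NOP 7 followed by S_NOP 1, since each S_NOP immediate N provides N + 1 wait states and the immediate is capped at 7.

    // Count = 10 -> iteration 1: Arg = 7, Count becomes 2  (8 wait states)
    //            -> iteration 2: Arg = 1, Count becomes -6, loop exits  (2 more)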
1104
0
void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1105
0
  auto MF = MBB.getParent();
1106
0
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1107
0
1108
0
  assert(Info->isEntryFunction());
1109
0
1110
0
  if (
MBB.succ_empty()0
) {
1111
0
    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1112
0
    if (HasNoTerminator)
1113
0
      BuildMI(MBB, MBB.end(), DebugLoc(),
1114
0
              get(Info->returnsVoid() ? 
AMDGPU::S_ENDPGM0
:
AMDGPU::SI_RETURN_TO_EPILOG0
));
1115
0
  }
1116
0
}
1117
1118
444k
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
1119
444k
  switch (MI.getOpcode()) {
1120
443k
  default: return 1; // FIXME: Do wait states equal cycles?
1121
444k
1122
1.27k
  case AMDGPU::S_NOP:
1123
1.27k
    return MI.getOperand(0).getImm() + 1;
1124
0
  }
1125
0
}
1126
1127
243k
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1128
243k
  MachineBasicBlock &MBB = *MI.getParent();
1129
243k
  DebugLoc DL = MBB.findDebugLoc(MI);
1130
243k
  switch (MI.getOpcode()) {
1131
242k
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
1132
0
  case AMDGPU::S_MOV_B64_term:
1133
0
    // This is only a terminator to get the correct spill code placement during
1134
0
    // register allocation.
1135
0
    MI.setDesc(get(AMDGPU::S_MOV_B64));
1136
0
    break;
1137
243k
1138
0
  case AMDGPU::S_XOR_B64_term:
1139
0
    // This is only a terminator to get the correct spill code placement during
1140
0
    // register allocation.
1141
0
    MI.setDesc(get(AMDGPU::S_XOR_B64));
1142
0
    break;
1143
243k
1144
0
  case AMDGPU::S_ANDN2_B64_term:
1145
0
    // This is only a terminator to get the correct spill code placement during
1146
0
    // register allocation.
1147
0
    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1148
0
    break;
1149
243k
1150
210
  case AMDGPU::V_MOV_B64_PSEUDO: {
1151
210
    unsigned Dst = MI.getOperand(0).getReg();
1152
210
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1153
210
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1154
210
1155
210
    const MachineOperand &SrcOp = MI.getOperand(1);
1156
210
    // FIXME: Will this work for 64-bit floating point immediates?
1157
210
    assert(!SrcOp.isFPImm());
1158
210
    if (
SrcOp.isImm()210
) {
1159
210
      APInt Imm(64, SrcOp.getImm());
1160
210
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1161
210
        .addImm(Imm.getLoBits(32).getZExtValue())
1162
210
        .addReg(Dst, RegState::Implicit | RegState::Define);
1163
210
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1164
210
        .addImm(Imm.getHiBits(32).getZExtValue())
1165
210
        .addReg(Dst, RegState::Implicit | RegState::Define);
1166
210
    } else {
1167
0
      assert(SrcOp.isReg());
1168
0
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1169
0
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1170
0
        .addReg(Dst, RegState::Implicit | RegState::Define);
1171
0
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1172
0
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1173
0
        .addReg(Dst, RegState::Implicit | RegState::Define);
1174
0
    }
1175
210
    MI.eraseFromParent();
1176
210
    break;
1177
243k
  }
1178
6
  case AMDGPU::V_SET_INACTIVE_B32: {
1179
6
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1180
6
      .addReg(AMDGPU::EXEC);
1181
6
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1182
6
      .add(MI.getOperand(2));
1183
6
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1184
6
      .addReg(AMDGPU::EXEC);
1185
6
    MI.eraseFromParent();
1186
6
    break;
1187
243k
  }
1188
2
  case AMDGPU::V_SET_INACTIVE_B64: {
1189
2
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1190
2
      .addReg(AMDGPU::EXEC);
1191
2
    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1192
2
                                 MI.getOperand(0).getReg())
1193
2
      .add(MI.getOperand(2));
1194
2
    expandPostRAPseudo(*Copy);
1195
2
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1196
2
      .addReg(AMDGPU::EXEC);
1197
2
    MI.eraseFromParent();
1198
2
    break;
1199
243k
  }
1200
64
  case AMDGPU::V_MOVRELD_B32_V1:
1201
64
  case AMDGPU::V_MOVRELD_B32_V2:
1202
64
  case AMDGPU::V_MOVRELD_B32_V4:
1203
64
  case AMDGPU::V_MOVRELD_B32_V8:
1204
64
  case AMDGPU::V_MOVRELD_B32_V16: {
1205
64
    const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
1206
64
    unsigned VecReg = MI.getOperand(0).getReg();
1207
64
    bool IsUndef = MI.getOperand(1).isUndef();
1208
64
    unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
1209
64
    assert(VecReg == MI.getOperand(1).getReg());
1210
64
1211
64
    MachineInstr *MovRel =
1212
64
        BuildMI(MBB, MI, DL, MovRelDesc)
1213
64
            .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1214
64
            .add(MI.getOperand(2))
1215
64
            .addReg(VecReg, RegState::ImplicitDefine)
1216
64
            .addReg(VecReg,
1217
64
                    RegState::Implicit | (IsUndef ? 
RegState::Undef2
:
062
));
1218
64
1219
64
    const int ImpDefIdx =
1220
64
        MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
1221
64
    const int ImpUseIdx = ImpDefIdx + 1;
1222
64
    MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
1223
64
1224
64
    MI.eraseFromParent();
1225
64
    break;
1226
64
  }
1227
484
  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1228
484
    MachineFunction &MF = *MBB.getParent();
1229
484
    unsigned Reg = MI.getOperand(0).getReg();
1230
484
    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1231
484
    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1232
484
1233
484
    // Create a bundle so these instructions won't be re-ordered by the
1234
484
    // post-RA scheduler.
1235
484
    MIBundleBuilder Bundler(MBB, MI);
1236
484
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1237
484
1238
484
    // Add 32-bit offset from this instruction to the start of the
1239
484
    // constant data.
1240
484
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1241
484
                       .addReg(RegLo)
1242
484
                       .add(MI.getOperand(1)));
1243
484
1244
484
    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1245
484
                                  .addReg(RegHi);
1246
484
    if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
1247
19
      MIB.addImm(0);
1248
484
    else
1249
465
      MIB.add(MI.getOperand(2));
1250
484
1251
484
    Bundler.append(MIB);
1252
484
    finalizeBundle(MBB, Bundler.begin());
1253
484
1254
484
    MI.eraseFromParent();
1255
484
    break;
1256
64
  }
1257
16
  case AMDGPU::EXIT_WWM: {
1258
16
    // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
1259
16
    // is exited.
1260
16
    MI.setDesc(get(AMDGPU::S_MOV_B64));
1261
16
    break;
1262
782
  }
1263
782
  }
1264
782
  return true;
1265
782
}
1266
1267
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
1268
                                      MachineOperand &Src0,
1269
                                      unsigned Src0OpName,
1270
                                      MachineOperand &Src1,
1271
191k
                                      unsigned Src1OpName) const {
1272
191k
  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
1273
191k
  if (!Src0Mods)
1274
153k
    return false;
1275
37.4k
1276
37.4k
  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
1277
37.4k
  assert(Src1Mods &&
1278
37.4k
         "All commutable instructions have both src0 and src1 modifiers");
1279
37.4k
1280
37.4k
  int Src0ModsVal = Src0Mods->getImm();
1281
37.4k
  int Src1ModsVal = Src1Mods->getImm();
1282
37.4k
1283
37.4k
  Src1Mods->setImm(Src0ModsVal);
1284
37.4k
  Src0Mods->setImm(Src1ModsVal);
1285
37.4k
  return true;
1286
37.4k
}
1287
1288
static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
1289
                                             MachineOperand &RegOp,
1290
25.0k
                                             MachineOperand &NonRegOp) {
1291
25.0k
  unsigned Reg = RegOp.getReg();
1292
25.0k
  unsigned SubReg = RegOp.getSubReg();
1293
25.0k
  bool IsKill = RegOp.isKill();
1294
25.0k
  bool IsDead = RegOp.isDead();
1295
25.0k
  bool IsUndef = RegOp.isUndef();
1296
25.0k
  bool IsDebug = RegOp.isDebug();
1297
25.0k
1298
25.0k
  if (NonRegOp.isImm())
1299
25.0k
    RegOp.ChangeToImmediate(NonRegOp.getImm());
1300
0
  else 
if (0
NonRegOp.isFI()0
)
1301
0
    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
1302
0
  else
1303
0
    return nullptr;
1304
25.0k
1305
25.0k
  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
1306
25.0k
  NonRegOp.setSubReg(SubReg);
1307
25.0k
1308
25.0k
  return &MI;
1309
25.0k
}
1310
1311
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1312
                                                  unsigned Src0Idx,
1313
234k
                                                  unsigned Src1Idx) const {
1314
234k
  assert(!NewMI && "this should never be used");
1315
234k
1316
234k
  unsigned Opc = MI.getOpcode();
1317
234k
  int CommutedOpcode = commuteOpcode(Opc);
1318
234k
  if (CommutedOpcode == -1)
1319
12.5k
    return nullptr;
1320
221k
1321
234k
  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
1322
221k
           static_cast<int>(Src0Idx) &&
1323
221k
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
1324
221k
           static_cast<int>(Src1Idx) &&
1325
221k
         "inconsistency with findCommutedOpIndices");
1326
221k
1327
221k
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
1328
221k
  MachineOperand &Src1 = MI.getOperand(Src1Idx);
1329
221k
1330
221k
  MachineInstr *CommutedMI = nullptr;
1331
221k
  if (
Src0.isReg() && 221k
Src1.isReg()198k
) {
1332
184k
    if (isOperandLegal(MI, Src1Idx, &Src0)) {
1333
166k
      // Be sure to copy the source modifiers to the right place.
1334
166k
      CommutedMI
1335
166k
        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
1336
166k
    }
1337
184k
1338
221k
  } else if (Src0.isReg() && !Src1.isReg()) {
1339
14.1k
    // src0 should always be able to support any operand type, so no need to
1340
14.1k
    // check operand legality.
1341
14.1k
    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
1342
36.9k
  } else if (!Src0.isReg() && Src1.isReg()) {
1343
22.8k
    if (isOperandLegal(MI, Src1Idx, &Src0))
1344
10.9k
      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
1345
22.8k
  } else {
1346
1
    // FIXME: Found two non registers to commute. This does happen.
1347
1
    return nullptr;
1348
1
  }
1349
221k
1350
221k
  
  if (CommutedMI) {
1351
191k
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
1352
191k
                        Src1, AMDGPU::OpName::src1_modifiers);
1353
191k
1354
191k
    CommutedMI->setDesc(get(CommutedOpcode));
1355
191k
  }
1356
234k
1357
234k
  return CommutedMI;
1358
234k
}
1359
1360
// This needs to be implemented because the source modifiers may be inserted
1361
// between the true commutable operands, and the base
1362
// TargetInstrInfo::commuteInstruction uses it.
1363
bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
1364
263k
                                        unsigned &SrcOpIdx1) const {
1365
263k
  if (!MI.isCommutable())
1366
33.5k
    return false;
1367
230k
1368
230k
  unsigned Opc = MI.getOpcode();
1369
230k
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1370
230k
  if (Src0Idx == -1)
1371
0
    return false;
1372
230k
1373
230k
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1374
230k
  if (Src1Idx == -1)
1375
0
    return false;
1376
230k
1377
230k
  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1378
230k
}
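Illustrative only (not part of SIInstrInfo.cpp): the two hooks above are normally consumed through the generic TargetInstrInfo entry point, roughly as sketched below; TII is assumed to be a pointer to this SIInstrInfo and MI a commutable VALU MachineInstr.

  // Sketch of a caller: let the target pick the operand pair, then commute.
  unsigned OpIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
  unsigned OpIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
  if (TII->findCommutedOpIndices(MI, OpIdx0, OpIdx1))
    if (MachineInstr *Commuted =
            TII->commuteInstruction(MI, /*NewMI=*/false, OpIdx0, OpIdx1))
      (void)Commuted; // src0/src1 and their modifier operands are now swapped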
1379
1380
bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
1381
925
                                        int64_t BrOffset) const {
1382
925
  // BranchRelaxation should never have to check s_setpc_b64 because its dest
1383
925
  // block is unanalyzable.
1384
925
  assert(BranchOp != AMDGPU::S_SETPC_B64);
1385
925
1386
925
  // Convert to dwords.
1387
925
  BrOffset /= 4;
1388
925
1389
925
  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
1390
925
  // from the next instruction.
1391
925
  BrOffset -= 1;
1392
925
1393
925
  return isIntN(BranchOffsetBits, BrOffset);
1394
925
}
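For the default 16-bit setting of amdgpu-s-branch-bits this check corresponds to a fixed byte window; a standalone sketch of the same arithmetic (the bit width is an assumption matching the cl::init above):

  #include <cstdint>

  // PC += signext(SIMM16 * 4) + 4, so the offset is measured in dwords from
  // the instruction after the branch.
  static bool branchOffsetFits(int64_t BrOffsetBytes, unsigned Bits = 16) {
    int64_t Dwords = BrOffsetBytes / 4 - 1;
    int64_t Lo = -(int64_t(1) << (Bits - 1));     // -32768 dwords
    int64_t Hi = (int64_t(1) << (Bits - 1)) - 1;  //  32767 dwords
    return Dwords >= Lo && Dwords <= Hi;
  }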
1395
1396
MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
1397
954
  const MachineInstr &MI) const {
1398
954
  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
1399
0
    // This would be a difficult analysis to perform, but can always be legal so
1400
0
    // there's no need to analyze it.
1401
0
    return nullptr;
1402
0
  }
1403
954
1404
954
  return MI.getOperand(0).getMBB();
1405
954
}
1406
1407
unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
1408
                                           MachineBasicBlock &DestBB,
1409
                                           const DebugLoc &DL,
1410
                                           int64_t BrOffset,
1411
29
                                           RegScavenger *RS) const {
1412
29
  assert(RS && "RegScavenger required for long branching");
1413
29
  assert(MBB.empty() &&
1414
29
         "new block should be inserted for expanding unconditional branch");
1415
29
  assert(MBB.pred_size() == 1);
1416
29
1417
29
  MachineFunction *MF = MBB.getParent();
1418
29
  MachineRegisterInfo &MRI = MF->getRegInfo();
1419
29
1420
29
  // FIXME: Virtual register workaround for RegScavenger not working with empty
1421
29
  // blocks.
1422
29
  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1423
29
1424
29
  auto I = MBB.end();
1425
29
1426
29
  // We need to compute the offset relative to the instruction immediately after
1427
29
  // s_getpc_b64. Insert pc arithmetic code before last terminator.
1428
29
  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1429
29
1430
29
  // TODO: Handle > 32-bit block address.
1431
29
  if (BrOffset >= 0) {
1432
21
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1433
21
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1434
21
      .addReg(PCReg, 0, AMDGPU::sub0)
1435
21
      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
1436
21
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1437
21
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1438
21
      .addReg(PCReg, 0, AMDGPU::sub1)
1439
21
      .addImm(0);
1440
29
  } else {
1441
8
    // Backwards branch.
1442
8
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1443
8
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1444
8
      .addReg(PCReg, 0, AMDGPU::sub0)
1445
8
      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
1446
8
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1447
8
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1448
8
      .addReg(PCReg, 0, AMDGPU::sub1)
1449
8
      .addImm(0);
1450
8
  }
1451
29
1452
29
  // Insert the indirect branch after the other terminator.
1453
29
  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1454
29
    .addReg(PCReg);
1455
29
1456
29
  // FIXME: If spilling is necessary, this will fail because this scavenger has
1457
29
  // no emergency stack slots. It is non-trivial to spill in this situation,
1458
29
  // because the restore code needs to be specially placed after the
1459
29
  // jump. BranchRelaxation then needs to be made aware of the newly inserted
1460
29
  // block.
1461
29
  //
1462
29
  // If a spill is needed for the pc register pair, we need to insert a spill
1463
29
  // restore block right before the destination block, and insert a short branch
1464
29
  // into the old destination block's fallthrough predecessor.
1465
29
  // e.g.:
1466
29
  //
1467
29
  // s_cbranch_scc0 skip_long_branch:
1468
29
  //
1469
29
  // long_branch_bb:
1470
29
  //   spill s[8:9]
1471
29
  //   s_getpc_b64 s[8:9]
1472
29
  //   s_add_u32 s8, s8, restore_bb
1473
29
  //   s_addc_u32 s9, s9, 0
1474
29
  //   s_setpc_b64 s[8:9]
1475
29
  //
1476
29
  // skip_long_branch:
1477
29
  //   foo;
1478
29
  //
1479
29
  // .....
1480
29
  //
1481
29
  // dest_bb_fallthrough_predecessor:
1482
29
  // bar;
1483
29
  // s_branch dest_bb
1484
29
  //
1485
29
  // restore_bb:
1486
29
  //  restore s[8:9]
1487
29
  //  fallthrough dest_bb
1488
29
  ///
1489
29
  // dest_bb:
1490
29
  //   buzz;
1491
29
1492
29
  RS->enterBasicBlockEnd(MBB);
1493
29
  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
1494
29
                                       MachineBasicBlock::iterator(GetPC), 0);
1495
29
  MRI.replaceRegWith(PCReg, Scav);
1496
29
  MRI.clearVirtRegs();
1497
29
  RS->setRegUsed(Scav);
1498
29
1499
29
  return 4 + 8 + 4 + 4;
1500
29
}
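The returned value is presumably the byte size of the expansion just emitted; a hedged breakdown (encoding sizes assumed, not stated in this file):

  //   s_getpc_b64                4 bytes
  //   s_add_u32 / s_sub_u32      8 bytes (4-byte encoding + 32-bit literal)
  //   s_addc_u32 / s_subb_u32    4 bytes
  //   s_setpc_b64                4 bytes
  //                             -> 4 + 8 + 4 + 4 = 20 bytes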
1501
1502
1.48k
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1503
1.48k
  switch (Cond) {
1504
440
  case SIInstrInfo::SCC_TRUE:
1505
440
    return AMDGPU::S_CBRANCH_SCC1;
1506
412
  case SIInstrInfo::SCC_FALSE:
1507
412
    return AMDGPU::S_CBRANCH_SCC0;
1508
237
  case SIInstrInfo::VCCNZ:
1509
237
    return AMDGPU::S_CBRANCH_VCCNZ;
1510
243
  case SIInstrInfo::VCCZ:
1511
243
    return AMDGPU::S_CBRANCH_VCCZ;
1512
89
  case SIInstrInfo::EXECNZ:
1513
89
    return AMDGPU::S_CBRANCH_EXECNZ;
1514
59
  case SIInstrInfo::EXECZ:
1515
59
    return AMDGPU::S_CBRANCH_EXECZ;
1516
0
  default:
1517
0
    llvm_unreachable("invalid branch predicate");
1518
0
  }
1519
0
}
1520
1521
662k
SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1522
662k
  switch (Opcode) {
1523
1.85k
  case AMDGPU::S_CBRANCH_SCC0:
1524
1.85k
    return SCC_FALSE;
1525
15.2k
  case AMDGPU::S_CBRANCH_SCC1:
1526
15.2k
    return SCC_TRUE;
1527
7.13k
  case AMDGPU::S_CBRANCH_VCCNZ:
1528
7.13k
    return VCCNZ;
1529
1.19k
  case AMDGPU::S_CBRANCH_VCCZ:
1530
1.19k
    return VCCZ;
1531
3.32k
  case AMDGPU::S_CBRANCH_EXECNZ:
1532
3.32k
    return EXECNZ;
1533
354
  case AMDGPU::S_CBRANCH_EXECZ:
1534
354
    return EXECZ;
1535
633k
  default:
1536
633k
    return INVALID_BR;
1537
0
  }
1538
0
}
1539
1540
bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
1541
                                    MachineBasicBlock::iterator I,
1542
                                    MachineBasicBlock *&TBB,
1543
                                    MachineBasicBlock *&FBB,
1544
                                    SmallVectorImpl<MachineOperand> &Cond,
1545
687k
                                    bool AllowModify) const {
1546
687k
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1547
24.2k
    // Unconditional Branch
1548
24.2k
    TBB = I->getOperand(0).getMBB();
1549
24.2k
    return false;
1550
24.2k
  }
1551
662k
1552
662k
  MachineBasicBlock *CondBB = nullptr;
1553
662k
1554
662k
  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
1555
0
    CondBB = I->getOperand(1).getMBB();
1556
0
    Cond.push_back(I->getOperand(0));
1557
662k
  } else {
1558
662k
    BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1559
662k
    if (Pred == INVALID_BR)
1560
633k
      return true;
1561
29.0k
1562
29.0k
    CondBB = I->getOperand(0).getMBB();
1563
29.0k
    Cond.push_back(MachineOperand::CreateImm(Pred));
1564
29.0k
    Cond.push_back(I->getOperand(1)); // Save the branch register.
1565
29.0k
  }
1566
29.0k
  ++I;
1567
29.0k
1568
29.0k
  if (I == MBB.end()) {
1569
15.2k
    // Conditional branch followed by fall-through.
1570
15.2k
    TBB = CondBB;
1571
15.2k
    return false;
1572
15.2k
  }
1573
13.7k
1574
13.7k
  
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1575
13.7k
    TBB = CondBB;
1576
13.7k
    FBB = I->getOperand(0).getMBB();
1577
13.7k
    return false;
1578
13.7k
  }
1579
0
1580
0
  return true;
1581
0
}
1582
1583
bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
1584
                                MachineBasicBlock *&FBB,
1585
                                SmallVectorImpl<MachineOperand> &Cond,
1586
724k
                                bool AllowModify) const {
1587
724k
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1588
724k
  if (I == MBB.end())
1589
36.9k
    return false;
1590
687k
1591
687k
  
  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1592
676k
    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1593
11.4k
1594
11.4k
  ++I;
1595
11.4k
1596
11.4k
  // TODO: Should be able to treat as fallthrough?
1597
11.4k
  if (I == MBB.end())
1598
785
    return true;
1599
10.6k
1600
10.6k
  
  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1601
0
    return true;
1602
10.6k
1603
10.6k
  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1604
10.6k
1605
10.6k
  // Specifically handle the case where the conditional branch is to the same
1606
10.6k
  // destination as the mask branch. e.g.
1607
10.6k
  //
1608
10.6k
  // si_mask_branch BB8
1609
10.6k
  // s_cbranch_execz BB8
1610
10.6k
  // s_cbranch BB9
1611
10.6k
  //
1612
10.6k
  // This is required to understand divergent loops which may need the branches
1613
10.6k
  // to be relaxed.
1614
10.6k
  if (TBB != MaskBrDest || Cond.empty())
1615
10.3k
    return true;
1616
266
1617
266
  auto Pred = Cond[0].getImm();
1618
2
  return (Pred != EXECZ && Pred != EXECNZ);
1619
724k
}
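Illustrative caller-side fragment (not from this file) showing how the analyzeBranch contract is normally consumed; TII and MBB are assumed to be in scope:

  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 2> Cond;
  if (!TII->analyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false)) {
    // Terminators were understood: TBB is the taken target, FBB the explicit
    // false target (or null for fall-through), and Cond holds the predicate
    // immediate plus the condition register saved by analyzeBranchImpl.
  }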
1620
1621
unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
1622
2.37k
                                   int *BytesRemoved) const {
1623
2.37k
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1624
2.37k
1625
2.37k
  unsigned Count = 0;
1626
2.37k
  unsigned RemovedSize = 0;
1627
5.31k
  while (I != MBB.end()) {
1628
2.94k
    MachineBasicBlock::iterator Next = std::next(I);
1629
2.94k
    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1630
2
      I = Next;
1631
2
      continue;
1632
2
    }
1633
2.93k
1634
2.93k
    RemovedSize += getInstSizeInBytes(*I);
1635
2.93k
    I->eraseFromParent();
1636
2.93k
    ++Count;
1637
2.93k
    I = Next;
1638
2.93k
  }
1639
2.37k
1640
2.37k
  if (BytesRemoved)
1641
23
    *BytesRemoved = RemovedSize;
1642
2.37k
1643
2.37k
  return Count;
1644
2.37k
}
1645
1646
// Copy the flags onto the implicit condition register operand.
1647
static void preserveCondRegFlags(MachineOperand &CondReg,
1648
1.41k
                                 const MachineOperand &OrigCond) {
1649
1.41k
  CondReg.setIsUndef(OrigCond.isUndef());
1650
1.41k
  CondReg.setIsKill(OrigCond.isKill());
1651
1.41k
}
1652
1653
unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
1654
                                   MachineBasicBlock *TBB,
1655
                                   MachineBasicBlock *FBB,
1656
                                   ArrayRef<MachineOperand> Cond,
1657
                                   const DebugLoc &DL,
1658
2.13k
                                   int *BytesAdded) const {
1659
2.13k
  if (!FBB && Cond.empty()) {
1660
658
    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1661
658
      .addMBB(TBB);
1662
658
    if (BytesAdded)
1663
0
      *BytesAdded = 4;
1664
658
    return 1;
1665
658
  }
1666
1.48k
1667
1.48k
  
  if(Cond.size() == 1 && Cond[0].isReg()) {
1668
0
     BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
1669
0
       .add(Cond[0])
1670
0
       .addMBB(TBB);
1671
0
     return 1;
1672
0
  }
1673
1.48k
1674
1.48k
  assert(TBB && Cond[0].isImm());
1675
1.48k
1676
1.48k
  unsigned Opcode
1677
1.48k
    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1678
1.48k
1679
1.48k
  if (!FBB) {
1680
1.38k
    Cond[1].isUndef();
1681
1.38k
    MachineInstr *CondBr =
1682
1.38k
      BuildMI(&MBB, DL, get(Opcode))
1683
1.38k
      .addMBB(TBB);
1684
1.38k
1685
1.38k
    // Copy the flags onto the implicit condition register operand.
1686
1.38k
    preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1687
1.38k
1688
1.38k
    if (BytesAdded)
1689
0
      *BytesAdded = 4;
1690
1.38k
    return 1;
1691
1.38k
  }
1692
92
1693
1.48k
  assert(TBB && FBB);
1694
92
1695
92
  MachineInstr *CondBr =
1696
92
    BuildMI(&MBB, DL, get(Opcode))
1697
92
    .addMBB(TBB);
1698
92
  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1699
92
    .addMBB(FBB);
1700
92
1701
92
  MachineOperand &CondReg = CondBr->getOperand(1);
1702
92
  CondReg.setIsUndef(Cond[1].isUndef());
1703
92
  CondReg.setIsKill(Cond[1].isKill());
1704
92
1705
92
  if (BytesAdded)
1706
23
      *BytesAdded = 8;
1707
2.13k
1708
2.13k
  return 2;
1709
2.13k
}
1710
1711
bool SIInstrInfo::reverseBranchCondition(
1712
1.23k
  SmallVectorImpl<MachineOperand> &Cond) const {
1713
1.23k
  if (Cond.size() != 2) {
1714
0
    return true;
1715
0
  }
1716
1.23k
1717
1.23k
  
  if (Cond[0].isImm()) {
1718
1.23k
    Cond[0].setImm(-Cond[0].getImm());
1719
1.23k
    return false;
1720
1.23k
  }
1721
0
1722
0
  return true;
1723
0
}
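Negating the immediate assumes the BranchPredicate enumerators are declared as +/- pairs in SIInstrInfo.h, so the inverse of a predicate is its arithmetic negation; illustratively:

  // getBranchOpcode(SCC_TRUE)                                -> S_CBRANCH_SCC1
  // getBranchOpcode(static_cast<BranchPredicate>(-SCC_TRUE)) -> S_CBRANCH_SCC0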
1724
1725
bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
1726
                                  ArrayRef<MachineOperand> Cond,
1727
                                  unsigned TrueReg, unsigned FalseReg,
1728
                                  int &CondCycles,
1729
22
                                  int &TrueCycles, int &FalseCycles) const {
1730
22
  switch (Cond[0].getImm()) {
1731
15
  case VCCNZ:
1732
15
  case VCCZ: {
1733
15
    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1734
15
    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1735
15
    assert(MRI.getRegClass(FalseReg) == RC);
1736
15
1737
15
    int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1738
15
    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1739
15
1740
15
    // Limit to equal cost for branch vs. N v_cndmask_b32s.
1741
14
    return !RI.isSGPRClass(RC) && NumInsts <= 6;
1742
15
  }
1743
7
  case SCC_TRUE:
1744
7
  case SCC_FALSE: {
1745
7
    // FIXME: We could insert for VGPRs if we could replace the original compare
1746
7
    // with a vector one.
1747
7
    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1748
7
    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1749
7
    assert(MRI.getRegClass(FalseReg) == RC);
1750
7
1751
7
    int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1752
7
1753
7
    // Multiples of 8 can do s_cselect_b64
1754
7
    if (NumInsts % 2 == 0)
1755
3
      NumInsts /= 2;
1756
7
1757
7
    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1758
7
    return RI.isSGPRClass(RC);
1759
7
  }
1760
0
  default:
1761
0
    return false;
1762
0
  }
1763
0
}
1764
1765
void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
1766
                               MachineBasicBlock::iterator I, const DebugLoc &DL,
1767
                               unsigned DstReg, ArrayRef<MachineOperand> Cond,
1768
16
                               unsigned TrueReg, unsigned FalseReg) const {
1769
16
  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
1770
16
  if (Pred == VCCZ || Pred == SCC_FALSE) {
1771
0
    Pred = static_cast<BranchPredicate>(-Pred);
1772
0
    std::swap(TrueReg, FalseReg);
1773
0
  }
1774
16
1775
16
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1776
16
  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
1777
16
  unsigned DstSize = RI.getRegSizeInBits(*DstRC);
1778
16
1779
16
  if (DstSize == 32) {
1780
9
    unsigned SelOp = Pred == SCC_TRUE ?
1781
9
      
      AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1782
9
1783
9
    // Instruction's operands are backwards from what is expected.
1784
9
    MachineInstr *Select =
1785
9
      BuildMI(MBB, I, DL, get(SelOp), DstReg)
1786
9
      .addReg(FalseReg)
1787
9
      .addReg(TrueReg);
1788
9
1789
9
    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1790
9
    return;
1791
9
  }
1792
7
1793
7
  
  if (DstSize == 64 && Pred == SCC_TRUE) {
1794
1
    MachineInstr *Select =
1795
1
      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
1796
1
      .addReg(FalseReg)
1797
1
      .addReg(TrueReg);
1798
1
1799
1
    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1800
1
    return;
1801
1
  }
1802
6
1803
6
  static const int16_t Sub0_15[] = {
1804
6
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1805
6
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1806
6
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1807
6
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1808
6
  };
1809
6
1810
6
  static const int16_t Sub0_15_64[] = {
1811
6
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1812
6
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1813
6
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1814
6
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1815
6
  };
1816
6
1817
6
  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
1818
6
  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
1819
6
  const int16_t *SubIndices = Sub0_15;
1820
6
  int NElts = DstSize / 32;
1821
6
1822
6
  // 64-bit select is only available for SALU.
1823
6
  if (Pred == SCC_TRUE) {
1824
2
    SelOp = AMDGPU::S_CSELECT_B64;
1825
2
    EltRC = &AMDGPU::SGPR_64RegClass;
1826
2
    SubIndices = Sub0_15_64;
1827
2
1828
2
    assert(NElts % 2 == 0);
1829
2
    NElts /= 2;
1830
2
  }
1831
6
1832
6
  MachineInstrBuilder MIB = BuildMI(
1833
6
    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
1834
6
1835
6
  I = MIB->getIterator();
1836
6
1837
6
  SmallVector<unsigned, 8> Regs;
1838
22
  for (int Idx = 0; Idx != NElts; ++Idx) {
1839
16
    unsigned DstElt = MRI.createVirtualRegister(EltRC);
1840
16
    Regs.push_back(DstElt);
1841
16
1842
16
    unsigned SubIdx = SubIndices[Idx];
1843
16
1844
16
    MachineInstr *Select =
1845
16
      BuildMI(MBB, I, DL, get(SelOp), DstElt)
1846
16
      .addReg(FalseReg, 0, SubIdx)
1847
16
      .addReg(TrueReg, 0, SubIdx);
1848
16
    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1849
16
1850
16
    MIB.addReg(DstElt)
1851
16
       .addImm(SubIdx);
1852
16
  }
1853
16
}
1854
1855
760k
bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
1856
760k
  switch (MI.getOpcode()) {
1857
22.1k
  case AMDGPU::V_MOV_B32_e32:
1858
22.1k
  case AMDGPU::V_MOV_B32_e64:
1859
22.1k
  case AMDGPU::V_MOV_B64_PSEUDO: {
1860
22.1k
    // If there are additional implicit register operands, this may be used for
1861
22.1k
    // register indexing so the source register operand isn't simply copied.
1862
22.1k
    unsigned NumOps = MI.getDesc().getNumOperands() +
1863
22.1k
      MI.getDesc().getNumImplicitUses();
1864
22.1k
1865
22.1k
    return MI.getNumOperands() == NumOps;
1866
22.1k
  }
1867
287k
  case AMDGPU::S_MOV_B32:
1868
287k
  case AMDGPU::S_MOV_B64:
1869
287k
  case AMDGPU::COPY:
1870
287k
    return true;
1871
451k
  default:
1872
451k
    return false;
1873
0
  }
1874
0
}
1875
1876
unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
1877
93.9k
    PseudoSourceValue::PSVKind Kind) const {
1878
93.9k
  switch(Kind) {
1879
17.2k
  case PseudoSourceValue::Stack:
1880
17.2k
  case PseudoSourceValue::FixedStack:
1881
17.2k
    return AMDGPUASI.PRIVATE_ADDRESS;
1882
76.6k
  case PseudoSourceValue::ConstantPool:
1883
76.6k
  case PseudoSourceValue::GOT:
1884
76.6k
  case PseudoSourceValue::JumpTable:
1885
76.6k
  case PseudoSourceValue::GlobalValueCallEntry:
1886
76.6k
  case PseudoSourceValue::ExternalSymbolCallEntry:
1887
76.6k
  case PseudoSourceValue::TargetCustom:
1888
76.6k
    return AMDGPUASI.CONSTANT_ADDRESS;
1889
0
  }
1890
0
  return AMDGPUASI.FLAT_ADDRESS;
1891
0
}
1892
1893
16
static void removeModOperands(MachineInstr &MI) {
1894
16
  unsigned Opc = MI.getOpcode();
1895
16
  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1896
16
                                              AMDGPU::OpName::src0_modifiers);
1897
16
  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1898
16
                                              AMDGPU::OpName::src1_modifiers);
1899
16
  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1900
16
                                              AMDGPU::OpName::src2_modifiers);
1901
16
1902
16
  MI.RemoveOperand(Src2ModIdx);
1903
16
  MI.RemoveOperand(Src1ModIdx);
1904
16
  MI.RemoveOperand(Src0ModIdx);
1905
16
}
1906
1907
bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
1908
47.1k
                                unsigned Reg, MachineRegisterInfo *MRI) const {
1909
47.1k
  if (!MRI->hasOneNonDBGUse(Reg))
1910
28.7k
    return false;
1911
18.4k
1912
18.4k
  unsigned Opc = UseMI.getOpcode();
1913
18.4k
  if (Opc == AMDGPU::COPY) {
1914
3.78k
    bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
1915
3.78k
    switch (DefMI.getOpcode()) {
1916
0
    default:
1917
0
      return false;
1918
265
    case AMDGPU::S_MOV_B64:
1919
265
      // TODO: We could fold 64-bit immediates, but this gets complicated
1920
265
      // when there are sub-registers.
1921
265
      return false;
1922
3.78k
1923
3.51k
    case AMDGPU::V_MOV_B32_e32:
1924
3.51k
    case AMDGPU::S_MOV_B32:
1925
3.51k
      break;
1926
3.51k
    }
1927
3.51k
    
    unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1928
3.51k
    const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
1929
3.51k
    assert(ImmOp);
1930
3.51k
    // FIXME: We could handle FrameIndex values here.
1931
3.51k
    if (
!ImmOp->isImm()3.51k
) {
1932
17
      return false;
1933
17
    }
1934
3.50k
    UseMI.setDesc(get(NewOpc));
1935
3.50k
    UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
1936
3.50k
    UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
1937
3.50k
    return true;
1938
3.50k
  }
1939
14.6k
1940
14.6k
  
  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
1941
14.6k
      
      Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
1942
164
    // Don't fold if we are using source or output modifiers. The new VOP2
1943
164
    // instructions don't have them.
1944
164
    if (hasAnyModifiersSet(UseMI))
1945
32
      return false;
1946
132
1947
132
    const MachineOperand &ImmOp = DefMI.getOperand(1);
1948
132
1949
132
    // If this is a free constant, there's no reason to do this.
1950
132
    // TODO: We could fold this here instead of letting SIFoldOperands do it
1951
132
    // later.
1952
132
    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
1953
132
1954
132
    // Any src operand can be used for the legality check.
1955
132
    if (isInlineConstant(UseMI, *Src0, ImmOp))
1956
87
      return false;
1957
45
1958
45
    
    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
1959
45
    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
1960
45
    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
1961
45
1962
45
    // Multiplied part is the constant: Use v_madmk_{f16, f32}.
1963
45
    // We should only expect these to be on src0 due to canonicalizations.
1964
45
    if (Src0->isReg() && Src0->getReg() == Reg) {
1965
0
      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1966
0
        return false;
1967
0
1968
0
      
      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
1969
0
        return false;
1970
0
1971
0
      // We need to swap operands 0 and 1 since madmk constant is at operand 1.
1972
0
1973
0
      const int64_t Imm = DefMI.getOperand(1).getImm();
1974
0
1975
0
      // FIXME: This would be a lot easier if we could return a new instruction
1976
0
      // instead of having to modify in place.
1977
0
1978
0
      // Remove these first since they are at the end.
1979
0
      UseMI.RemoveOperand(
1980
0
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
1981
0
      UseMI.RemoveOperand(
1982
0
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
1983
0
1984
0
      unsigned Src1Reg = Src1->getReg();
1985
0
      unsigned Src1SubReg = Src1->getSubReg();
1986
0
      Src0->setReg(Src1Reg);
1987
0
      Src0->setSubReg(Src1SubReg);
1988
0
      Src0->setIsKill(Src1->isKill());
1989
0
1990
0
      if (Opc == AMDGPU::V_MAC_F32_e64 ||
1991
0
          Opc == AMDGPU::V_MAC_F16_e64)
1992
0
        UseMI.untieRegOperand(
1993
0
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1994
0
1995
0
      Src1->ChangeToImmediate(Imm);
1996
0
1997
0
      removeModOperands(UseMI);
1998
0
      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
1999
0
2000
0
      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2001
0
      if (DeleteDef)
2002
0
        DefMI.eraseFromParent();
2003
0
2004
0
      return true;
2005
0
    }
2006
45
2007
45
    // Added part is the constant: Use v_madak_{f16, f32}.
2008
45
    
    if (Src2->isReg() && Src2->getReg() == Reg) {
2009
19
      // Not allowed to use constant bus for another operand.
2010
19
      // We can however allow an inline immediate as src0.
2011
19
      if (!Src0->isImm() &&
2012
19
          
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
2013
2
        return false;
2014
17
2015
17
      
      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2016
1
        return false;
2017
16
2018
16
      const int64_t Imm = DefMI.getOperand(1).getImm();
2019
16
2020
16
      // FIXME: This would be a lot easier if we could return a new instruction
2021
16
      // instead of having to modify in place.
2022
16
2023
16
      // Remove these first since they are at the end.
2024
16
      UseMI.RemoveOperand(
2025
16
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2026
16
      UseMI.RemoveOperand(
2027
16
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2028
16
2029
16
      if (Opc == AMDGPU::V_MAC_F32_e64 ||
2030
1
          Opc == AMDGPU::V_MAC_F16_e64)
2031
16
        UseMI.untieRegOperand(
2032
16
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2033
16
2034
16
      // ChangingToImmediate adds Src2 back to the instruction.
2035
16
      Src2->ChangeToImmediate(Imm);
2036
16
2037
16
      // These come before src2.
2038
16
      removeModOperands(UseMI);
2039
16
      UseMI.setDesc(get(IsF32 ? 
AMDGPU::V_MADAK_F3215
:
AMDGPU::V_MADAK_F161
));
2040
16
2041
16
      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2042
16
      if (DeleteDef)
2043
0
        DefMI.eraseFromParent();
2044
19
2045
19
      return true;
2046
19
    }
2047
164
  }
2048
14.4k
2049
14.4k
  return false;
2050
14.4k
}
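A hedged before/after sketch of the COPY case above (virtual register numbers made up for illustration):

  //   %1:sgpr_32 = S_MOV_B32 42
  //   %2:vgpr_32 = COPY %1            ; %1 has a single non-debug use
  // becomes
  //   %2:vgpr_32 = V_MOV_B32_e32 42   ; NewOpc chosen because %2 is a VGPR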
2051
2052
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
2053
11.4k
                                int WidthB, int OffsetB) {
2054
11.4k
  int LowOffset = OffsetA < OffsetB ? 
OffsetA2.59k
:
OffsetB8.81k
;
2055
11.4k
  int HighOffset = OffsetA < OffsetB ? 
OffsetB2.59k
:
OffsetA8.81k
;
2056
11.4k
  int LowWidth = (LowOffset == OffsetA) ? 
WidthA5.83k
:
WidthB5.57k
;
2057
11.4k
  return LowOffset + LowWidth <= HighOffset;
2058
11.4k
}
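A standalone restatement of the same predicate, with a worked case in the comments (illustrative only):

  #include <algorithm>

  // Two byte ranges [OffsetA, OffsetA + WidthA) and [OffsetB, OffsetB + WidthB)
  // are disjoint when the lower one ends at or before the higher one starts,
  // e.g. (WidthA=4, OffsetA=0) vs. (WidthB=8, OffsetB=4): 0 + 4 <= 4 -> true.
  static bool rangesAreDisjoint(int WidthA, int OffsetA, int WidthB, int OffsetB) {
    int Lo = std::min(OffsetA, OffsetB);
    int Hi = std::max(OffsetA, OffsetB);
    int LoWidth = (Lo == OffsetA) ? WidthA : WidthB;
    return Lo + LoWidth <= Hi;
  }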
2059
2060
bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
2061
835k
                                               MachineInstr &MIb) const {
2062
835k
  unsigned BaseReg0, BaseReg1;
2063
835k
  int64_t Offset0, Offset1;
2064
835k
2065
835k
  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
2066
835k
      
      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
2067
53.1k
2068
53.1k
    if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2069
14.8k
      // FIXME: Handle ds_read2 / ds_write2.
2070
14.8k
      return false;
2071
14.8k
    }
2072
38.3k
    unsigned Width0 = (*MIa.memoperands_begin())->getSize();
2073
38.3k
    unsigned Width1 = (*MIb.memoperands_begin())->getSize();
2074
38.3k
    if (BaseReg0 == BaseReg1 &&
2075
38.3k
        
        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
2076
8.10k
      return true;
2077
8.10k
    }
2078
812k
  }
2079
812k
2080
812k
  return false;
2081
812k
}
2082
2083
bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
2084
                                                  MachineInstr &MIb,
2085
910k
                                                  AliasAnalysis *AA) const {
2086
910k
  assert((MIa.mayLoad() || MIa.mayStore()) &&
2087
910k
         "MIa must load from or modify a memory location");
2088
910k
  assert((MIb.mayLoad() || MIb.mayStore()) &&
2089
910k
         "MIb must load from or modify a memory location");
2090
910k
2091
910k
  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
2092
0
    return false;
2093
910k
2094
910k
  // XXX - Can we relax this between address spaces?
2095
910k
  
  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2096
8
    return false;
2097
910k
2098
910k
  
  if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
2099
445
    const MachineMemOperand *MMOa = *MIa.memoperands_begin();
2100
445
    const MachineMemOperand *MMOb = *MIb.memoperands_begin();
2101
445
    if (MMOa->getValue() && MMOb->getValue()) {
2102
439
      MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
2103
439
      MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
2104
439
      if (!AA->alias(LocA, LocB))
2105
186
        return true;
2106
910k
    }
2107
445
  }
2108
910k
2109
910k
  // TODO: Should we check the address space from the MachineMemOperand? That
2110
910k
  // would allow us to distinguish objects we know don't alias based on the
2111
910k
  // underlying address space, even if it was lowered to a different one,
2112
910k
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
2113
910k
  // buffer.
2114
910k
  
if (910k
isDS(MIa)910k
) {
2115
80.2k
    if (isDS(MIb))
2116
33.7k
      return checkInstOffsetsDoNotOverlap(MIa, MIb);
2117
46.4k
2118
46.4k
    
    return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
2119
80.2k
  }
2120
829k
2121
829k
  
  if (isMUBUF(MIa) || isMTBUF(MIa)) {
2122
788k
    if (isMUBUF(MIb) || isMTBUF(MIb))
2123
775k
      return checkInstOffsetsDoNotOverlap(MIa, MIb);
2124
13.2k
2125
13.2k
    
    return !isFLAT(MIb) && !isSMRD(MIb);
2126
788k
  }
2127
41.3k
2128
41.3k
  
  if (isSMRD(MIa)) {
2129
1.33k
    if (isSMRD(MIb))
2130
0
      return checkInstOffsetsDoNotOverlap(MIa, MIb);
2131
1.33k
2132
1.33k
    
    return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
2133
1.33k
  }
2134
39.9k
2135
39.9k
  
  if (isFLAT(MIa)) {
2136
39.8k
    if (isFLAT(MIb))
2137
26.5k
      return checkInstOffsetsDoNotOverlap(MIa, MIb);
2138
13.2k
2139
13.2k
    return false;
2140
13.2k
  }
2141
191
2142
191
  return false;
2143
191
}
2144
2145
445
static int64_t getFoldableImm(const MachineOperand* MO) {
2146
445
  if (!MO->isReg())
2147
0
    return false;
2148
445
  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
2149
445
  const MachineRegisterInfo &MRI = MF->getRegInfo();
2150
445
  auto Def = MRI.getUniqueVRegDef(MO->getReg());
2151
445
  if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
2152
21
      Def->getOperand(1).isImm())
2153
21
    return Def->getOperand(1).getImm();
2154
424
  return AMDGPU::NoRegister;
2155
424
}
2156
2157
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
2158
                                                 MachineInstr &MI,
2159
173
                                                 LiveVariables *LV) const {
2160
173
  bool IsF16 = false;
2161
173
2162
173
  switch (MI.getOpcode()) {
2163
1
  default:
2164
1
    return nullptr;
2165
0
  case AMDGPU::V_MAC_F16_e64:
2166
0
    IsF16 = true;
2167
0
    LLVM_FALLTHROUGH;
2168
6
  case AMDGPU::V_MAC_F32_e64:
2169
6
    break;
2170
6
  case AMDGPU::V_MAC_F16_e32:
2171
6
    IsF16 = true;
2172
6
    LLVM_FALLTHROUGH;
2173
166
  case AMDGPU::V_MAC_F32_e32: {
2174
166
    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
2175
166
                                             AMDGPU::OpName::src0);
2176
166
    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
2177
166
    if (!Src0->isReg() && !Src0->isImm())
2178
1
      return nullptr;
2179
165
2180
165
    
    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
2181
2
      return nullptr;
2182
163
2183
163
    break;
2184
163
  }
2185
169
  }
2186
169
2187
169
  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2188
169
  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
2189
169
  const MachineOperand *Src0Mods =
2190
169
    getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2191
169
  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2192
169
  const MachineOperand *Src1Mods =
2193
169
    getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2194
169
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2195
169
  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2196
169
  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
2197
169
2198
169
  if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
2199
169
      // If we have an SGPR input, we will violate the constant bus restriction.
2200
169
      
      (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
2201
158
    if (auto Imm = getFoldableImm(Src2)) {
2202
10
      return BuildMI(*MBB, MI, MI.getDebugLoc(),
2203
10
                     get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
2204
10
               .add(*Dst)
2205
10
               .add(*Src0)
2206
10
               .add(*Src1)
2207
10
               .addImm(Imm);
2208
10
    }
2209
148
    
    if (auto Imm = getFoldableImm(Src1)) {
2210
9
      return BuildMI(*MBB, MI, MI.getDebugLoc(),
2211
9
                     get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2212
9
               .add(*Dst)
2213
9
               .add(*Src0)
2214
9
               .addImm(Imm)
2215
9
               .add(*Src2);
2216
9
    }
2217
139
    
    if (auto Imm = getFoldableImm(Src0)) {
2218
2
      if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
2219
2
                           AMDGPU::OpName::src0), Src1))
2220
2
        return BuildMI(*MBB, MI, MI.getDebugLoc(),
2221
2
                       get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2222
2
                 .add(*Dst)
2223
2
                 .add(*Src1)
2224
2
                 .addImm(Imm)
2225
2
                 .add(*Src2);
2226
148
    }
2227
158
  }
2228
148
2229
148
  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2230
148
                 get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
2231
148
      .add(*Dst)
2232
148
      .addImm(Src0Mods ? Src0Mods->getImm() : 0)
2233
148
      .add(*Src0)
2234
148
      .addImm(Src1Mods ? Src1Mods->getImm() : 0)
2235
148
      .add(*Src1)
2236
148
      .addImm(0) // Src mods
2237
148
      .add(*Src2)
2238
148
      .addImm(Clamp ? Clamp->getImm() : 0)
2239
148
      .addImm(Omod ? Omod->getImm() : 0);
2240
173
}
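Rough shape of the three-address rewrite performed above (operand names are illustrative, not taken from a real test):

  //   %dst = V_MAC_F32_e32 %a, %b, %dst(tied)   ; two-address multiply-accumulate
  // becomes
  //   %dst = V_MAD_F32 src0_mods, %a, src1_mods, %b, 0, %acc, clamp, omod
  // or, when one input is defined by a foldable V_MOV_B32 immediate,
  //   %dst = V_MADMK_F32 %a, <imm>, %acc        ; constant is the multiplicand
  //   %dst = V_MADAK_F32 %a, %b, <imm>          ; constant is the addend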
2241
2242
// It's not generally safe to move VALU instructions across these since it will
2243
// start using the register as a base index rather than directly.
2244
// XXX - Why isn't hasSideEffects sufficient for these?
2245
427k
static bool changesVGPRIndexingMode(const MachineInstr &MI) {
2246
427k
  switch (MI.getOpcode()) {
2247
232
  case AMDGPU::S_SET_GPR_IDX_ON:
2248
232
  case AMDGPU::S_SET_GPR_IDX_MODE:
2249
232
  case AMDGPU::S_SET_GPR_IDX_OFF:
2250
232
    return true;
2251
426k
  default:
2252
426k
    return false;
2253
0
  }
2254
0
}
2255
2256
bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
2257
                                       const MachineBasicBlock *MBB,
2258
458k
                                       const MachineFunction &MF) const {
2259
458k
  // XXX - Do we want the SP check in the base implementation?
2260
458k
2261
458k
  // Target-independent instructions do not have an implicit-use of EXEC, even
2262
458k
  // when they operate on VGPRs. Treating EXEC modifications as scheduling
2263
458k
  // boundaries prevents incorrect movements of such instructions.
2264
458k
  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
2265
429k
         MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
2266
427k
         MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
2267
427k
         MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
2268
427k
         changesVGPRIndexingMode(MI);
2269
458k
}
2270
2271
5.18k
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
2272
5.18k
  switch (Imm.getBitWidth()) {
2273
0
  case 32:
2274
0
    return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
2275
0
                                        ST.hasInv2PiInlineImm());
2276
5.09k
  case 64:
2277
5.09k
    return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
2278
5.09k
                                        ST.hasInv2PiInlineImm());
2279
86
  case 16:
2280
86
    return ST.has16BitInsts() &&
2281
86
           AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
2282
86
                                        ST.hasInv2PiInlineImm());
2283
0
  default:
2284
0
    llvm_unreachable("invalid bitwidth");
2285
0
  }
2286
0
}
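For reference, the inline-constant set these helpers test against is, to my understanding, the small integers -16..64 plus a handful of floating-point values (+-0.5, +-1.0, +-2.0, +-4.0, and 1/(2*pi) when hasInv2PiInlineImm() holds); hedged examples:

  //   AMDGPU::isInlinableLiteral64(1, /*HasInv2Pi=*/true)            -> true
  //   AMDGPU::isInlinableLiteral64(0x3FF0000000000000 /*1.0*/, true) -> true
  //   AMDGPU::isInlinableLiteral64(1234567, true)                    -> false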
2287
2288
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
2289
3.76M
                                   uint8_t OperandType) const {
2290
3.76M
  if (!MO.isImm() ||
2291
3.76M
      OperandType < AMDGPU::OPERAND_SRC_FIRST ||
2292
3.71M
      OperandType > AMDGPU::OPERAND_SRC_LAST)
2293
50.6k
    return false;
2294
3.71M
2295
3.71M
  // MachineOperand provides no way to tell the true operand size, since it only
2296
3.71M
  // records a 64-bit value. We need to know the size to determine if a 32-bit
2297
3.71M
  // floating point immediate bit pattern is legal for an integer immediate. It
2298
3.71M
  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
2299
3.71M
2300
3.71M
  int64_t Imm = MO.getImm();
2301
3.71M
  switch (OperandType) {
2302
3.56M
  case AMDGPU::OPERAND_REG_IMM_INT32:
2303
3.56M
  case AMDGPU::OPERAND_REG_IMM_FP32:
2304
3.56M
  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
2305
3.56M
  case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
2306
3.56M
    int32_t Trunc = static_cast<int32_t>(Imm);
2307
3.56M
    return Trunc == Imm &&
2308
3.56M
           AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
2309
3.56M
  }
2310
38.2k
  case AMDGPU::OPERAND_REG_IMM_INT64:
2311
38.2k
  case AMDGPU::OPERAND_REG_IMM_FP64:
2312
38.2k
  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
2313
38.2k
  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
2314
38.2k
    return AMDGPU::isInlinableLiteral64(MO.getImm(),
2315
38.2k
                                        ST.hasInv2PiInlineImm());
2316
114k
  case AMDGPU::OPERAND_REG_IMM_INT16:
2317
114k
  case AMDGPU::OPERAND_REG_IMM_FP16:
2318
114k
  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
2319
114k
  case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
2320
114k
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
2321
114k
      // A few special case instructions have 16-bit operands on subtargets
2322
114k
      // where 16-bit instructions are not legal.
2323
114k
      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
2324
114k
      // constants in these cases
2325
114k
      int16_t Trunc = static_cast<int16_t>(Imm);
2326
114k
      return ST.has16BitInsts() &&
2327
114k
             AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
2328
114k
    }
2329
71
2330
71
    return false;
2331
71
  }
2332
3.24k
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
2333
3.24k
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
2334
3.24k
    uint32_t Trunc = static_cast<uint32_t>(Imm);
2335
3.24k
    return  AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
2336
3.24k
  }
2337
0
  default:
2338
0
    llvm_unreachable("invalid bitwidth");
2339
0
  }
2340
0
}
2341
2342
bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
2343
418k
                                        const MCOperandInfo &OpInfo) const {
2344
418k
  switch (MO.getType()) {
2345
292k
  case MachineOperand::MO_Register:
2346
292k
    return false;
2347
124k
  case MachineOperand::MO_Immediate:
2348
124k
    return !isInlineConstant(MO, OpInfo);
2349
1.94k
  case MachineOperand::MO_FrameIndex:
2350
1.94k
  case MachineOperand::MO_MachineBasicBlock:
2351
1.94k
  case MachineOperand::MO_ExternalSymbol:
2352
1.94k
  case MachineOperand::MO_GlobalAddress:
2353
1.94k
  case MachineOperand::MO_MCSymbol:
2354
1.94k
    return true;
2355
0
  default:
2356
0
    llvm_unreachable("unexpected operand type");
2357
0
  }
2358
0
}
2359
2360
static bool compareMachineOp(const MachineOperand &Op0,
2361
15.3k
                             const MachineOperand &Op1) {
2362
15.3k
  if (Op0.getType() != Op1.getType())
2363
0
    return false;
2364
15.3k
2365
15.3k
  switch (Op0.getType()) {
2366
15.3k
  case MachineOperand::MO_Register:
2367
15.3k
    return Op0.getReg() == Op1.getReg();
2368
0
  case MachineOperand::MO_Immediate:
2369
0
    return Op0.getImm() == Op1.getImm();
2370
0
  default:
2371
0
    llvm_unreachable("Didn't expect to be comparing these operand types");
2372
0
  }
2373
0
}
2374
2375
bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
2376
69.1k
                                    const MachineOperand &MO) const {
2377
69.1k
  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
2378
69.1k
2379
69.1k
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2380
69.1k
2381
69.1k
  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
2382
0
    return true;
2383
69.1k
2384
69.1k
  
  if (OpInfo.RegClass < 0)
2385
0
    return false;
2386
69.1k
2387
69.1k
  
  if (MO.isImm() && isInlineConstant(MO, OpInfo))
2388
45.6k
    return RI.opCanUseInlineConstant(OpInfo.OperandType);
2389
23.5k
2390
23.5k
  return RI.opCanUseLiteralConstant(OpInfo.OperandType);
2391
23.5k
}
2392
2393
625k
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
2394
625k
  int Op32 = AMDGPU::getVOPe32(Opcode);
2395
625k
  if (Op32 == -1)
2396
538k
    return false;
2397
87.6k
2398
87.6k
  return pseudoToMCOpcode(Op32) != -1;
2399
87.6k
}
2400
2401
0
bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
2402
0
  // The src0_modifier operand is present on all instructions
2403
0
  // that have modifiers.
2404
0
2405
0
  return AMDGPU::getNamedOperandIdx(Opcode,
2406
0
                                    AMDGPU::OpName::src0_modifiers) != -1;
2407
0
}
2408
2409
bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
2410
168k
                                  unsigned OpName) const {
2411
168k
  const MachineOperand *Mods = getNamedOperand(MI, OpName);
2412
41.6k
  return Mods && Mods->getImm();
2413
168k
}
2414
2415
164
bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
2416
164
  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2417
156
         hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2418
155
         hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
2419
136
         hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
2420
134
         hasModifiersSet(MI, AMDGPU::OpName::omod);
2421
164
}
2422
2423
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
2424
                                  const MachineOperand &MO,
2425
6.61M
                                  const MCOperandInfo &OpInfo) const {
2426
6.61M
  // Literal constants use the constant bus.
2427
6.61M
  //if (isLiteralConstantLike(MO, OpInfo))
2428
6.61M
  // return true;
2429
6.61M
  if (MO.isImm())
2430
1.71M
    return !isInlineConstant(MO, OpInfo);
2431
4.90M
2432
4.90M
  
  if (!MO.isReg())
2433
9.15k
    return true; // Misc other operands like FrameIndex
2434
4.89M
2435
4.89M
  
  if (!MO.isUse())
2436
116k
    return false;
2437
4.77M
2438
4.77M
  
  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2439
2.42M
    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
2440
2.35M
2441
2.35M
  // FLAT_SCR is just an SGPR pair.
2442
2.35M
  
  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
2443
0
    return true;
2444
2.35M
2445
2.35M
  // EXEC register uses the constant bus.
2446
2.35M
  
  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
2447
26
    return true;
2448
2.35M
2449
2.35M
  // SGPRs use the constant bus
2450
2.35M
  
  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
2451
2.27M
          (!MO.isImplicit() &&
2452
2.21M
           (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
2453
2.27M
            AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
2454
6.61M
}
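The practical restriction enforced later in verifyInstruction is that a single VOP encoding may read at most one value over the constant bus; a hedged assembly-level illustration for the subtargets handled here:

  //   v_add_f32 v0, s0, v1   ; one SGPR read           -> legal
  //   v_add_f32 v0, s0, s1   ; two constant-bus reads  -> rejected by the
  //                            ConstantBusCount > 1 check below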
2455
2456
3.28M
static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
2457
4.12M
  for (const MachineOperand &MO : MI.implicit_operands()) {
2458
4.12M
    // We only care about reads.
2459
4.12M
    if (MO.isDef())
2460
479k
      continue;
2461
3.64M
2462
3.64M
    switch (MO.getReg()) {
2463
189k
    case AMDGPU::VCC:
2464
189k
    case AMDGPU::M0:
2465
189k
    case AMDGPU::FLAT_SCR:
2466
189k
      return MO.getReg();
2467
189k
2468
3.45M
    default:
2469
3.45M
      break;
2470
3.09M
    }
2471
3.09M
  }
2472
3.09M
2473
3.09M
  return AMDGPU::NoRegister;
2474
3.09M
}
2475
2476
9.03M
static bool shouldReadExec(const MachineInstr &MI) {
2477
9.03M
  if (SIInstrInfo::isVALU(MI)) {
2478
3.23M
    switch (MI.getOpcode()) {
2479
23.7k
    case AMDGPU::V_READLANE_B32:
2480
23.7k
    case AMDGPU::V_READLANE_B32_si:
2481
23.7k
    case AMDGPU::V_READLANE_B32_vi:
2482
23.7k
    case AMDGPU::V_WRITELANE_B32:
2483
23.7k
    case AMDGPU::V_WRITELANE_B32_si:
2484
23.7k
    case AMDGPU::V_WRITELANE_B32_vi:
2485
23.7k
      return false;
2486
3.20M
    }
2487
3.20M
2488
3.20M
    return true;
2489
3.20M
  }
2490
5.79M
2491
5.79M
  
  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2492
5.79M
      SIInstrInfo::isSALU(MI) ||
2493
2.98M
      SIInstrInfo::isSMRD(MI))
2494
4.00M
    return false;
2495
1.79M
2496
1.79M
  return true;
2497
1.79M
}
2498
2499
static bool isSubRegOf(const SIRegisterInfo &TRI,
2500
                       const MachineOperand &SuperVec,
2501
2.76k
                       const MachineOperand &SubReg) {
2502
2.76k
  if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
2503
1.73k
    return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
2504
1.02k
2505
1.02k
  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
2506
1.02k
         SubReg.getReg() == SuperVec.getReg();
2507
2.76k
}
2508
2509
bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
2510
12.5M
                                    StringRef &ErrInfo) const {
2511
12.5M
  uint16_t Opcode = MI.getOpcode();
2512
12.5M
  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
2513
3.52M
    return true;
2514
9.03M
2515
9.03M
  const MachineFunction *MF = MI.getParent()->getParent();
2516
9.03M
  const MachineRegisterInfo &MRI = MF->getRegInfo();
2517
9.03M
2518
9.03M
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2519
9.03M
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2520
9.03M
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
2521
9.03M
2522
9.03M
  // Make sure the number of operands is correct.
2523
9.03M
  const MCInstrDesc &Desc = get(Opcode);
2524
9.03M
  if (!Desc.isVariadic() &&
2525
9.03M
      
      Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2526
0
    ErrInfo = "Instruction has wrong number of operands.";
2527
0
    return false;
2528
0
  }
2529
9.03M
2530
9.03M
  
  if (MI.isInlineAsm()) {
2531
0
    // Verify register classes for inlineasm constraints.
2532
0
    for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
2533
0
         
         I != E; ++I) {
2534
0
      const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
2535
0
      if (!RC)
2536
0
        continue;
2537
0
2538
0
      const MachineOperand &Op = MI.getOperand(I);
2539
0
      if (!Op.isReg())
2540
0
        continue;
2541
0
2542
0
      unsigned Reg = Op.getReg();
2543
0
      if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2544
0
        ErrInfo = "inlineasm operand has incorrect register class.";
2545
0
        return false;
2546
0
      }
2547
0
    }
2548
0
2549
0
    return true;
2550
9.03M
  }
2551
9.03M
2552
9.03M
  // Make sure the register classes are correct.
2553
41.0M
  
  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2554
31.9M
    if (MI.getOperand(i).isFPImm()) {
2555
0
      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
2556
0
                "all fp values to integers.";
2557
0
      return false;
2558
0
    }
2559
31.9M
2560
31.9M
    int RegClass = Desc.OpInfo[i].RegClass;
2561
31.9M
2562
31.9M
    switch (Desc.OpInfo[i].OperandType) {
2563
12.7M
    case MCOI::OPERAND_REGISTER:
2564
12.7M
      if (MI.getOperand(i).isImm()) {
2565
0
        ErrInfo = "Illegal immediate value for operand.";
2566
0
        return false;
2567
0
      }
2568
12.7M
      break;
2569
3.94M
    case AMDGPU::OPERAND_REG_IMM_INT32:
2570
3.94M
    case AMDGPU::OPERAND_REG_IMM_FP32:
2571
3.94M
      break;
2572
4.21M
    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
2573
4.21M
    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
2574
4.21M
    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
2575
4.21M
    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
2576
4.21M
    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
2577
4.21M
    case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
2578
4.21M
      const MachineOperand &MO = MI.getOperand(i);
2579
4.21M
      if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2580
0
        ErrInfo = "Illegal immediate value for operand.";
2581
0
        return false;
2582
0
      }
2583
4.21M
      break;
2584
4.21M
    }
2585
9.65M
    case MCOI::OPERAND_IMMEDIATE:
2586
9.65M
    case AMDGPU::OPERAND_KIMM32:
2587
9.65M
      // Check if this operand is an immediate.
2588
9.65M
      // FrameIndex operands will be replaced by immediates, so they are
2589
9.65M
      // allowed.
2590
9.65M
      if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2591
0
        ErrInfo = "Expected immediate, but got non-immediate";
2592
0
        return false;
2593
0
      }
2594
9.65M
      
      LLVM_FALLTHROUGH;
2595
11.0M
    default:
2596
11.0M
      continue;
2597
20.9M
    }
2598
20.9M
2599
20.9M
    
    if (!MI.getOperand(i).isReg())
2600
4.05M
      continue;
2601
16.8M
2602
16.8M
    
    if (RegClass != -1) {
2603
16.8M
      unsigned Reg = MI.getOperand(i).getReg();
2604
16.8M
      if (Reg == AMDGPU::NoRegister ||
2605
16.8M
          TargetRegisterInfo::isVirtualRegister(Reg))
2606
7.92M
        continue;
2607
8.94M
2608
8.94M
      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
2609
8.94M
      if (!RC->contains(Reg)) {
2610
0
        ErrInfo = "Operand has incorrect register class.";
2611
0
        return false;
2612
0
      }
2613
16.8M
    }
2614
31.9M
  }
2615
9.03M
2616
9.03M
  // Verify SDWA
2617
9.03M
  
  if (isSDWA(MI)) {
2618
31.8k
    if (!ST.hasSDWA()) {
2619
0
      ErrInfo = "SDWA is not supported on this target";
2620
0
      return false;
2621
0
    }
2622
31.8k
2623
31.8k
    int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
2624
31.8k
2625
31.8k
    const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
2626
31.8k
2627
127k
    for (int OpIdx: OpIndicies) {
2628
127k
      if (OpIdx == -1)
2629
35.0k
        continue;
2630
92.3k
      const MachineOperand &MO = MI.getOperand(OpIdx);
2631
92.3k
2632
92.3k
      if (!ST.hasSDWAScalar()) {
2633
87.2k
        // Only VGPRS on VI
2634
87.2k
        if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
2635
0
          ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
2636
0
          return false;
2637
0
        }
2638
5.14k
      } else {
2639
5.14k
        // No immediates on GFX9
2640
5.14k
        if (!MO.isReg()) {
2641
0
          ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
2642
0
          return false;
2643
0
        }
2644
31.8k
      }
2645
127k
    }
2646
31.8k
2647
31.8k
    
    if (!ST.hasSDWAOmod()) {
2648
30.0k
      // No omod allowed on VI
2649
30.0k
      const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2650
30.0k
      if (OMod != nullptr &&
2651
30.0k
        
        (!OMod->isImm() || OMod->getImm() != 0)) {
2652
0
        ErrInfo = "OMod not allowed in SDWA instructions on VI";
2653
0
        return false;
2654
0
      }
2655
31.8k
    }
2656
31.8k
2657
31.8k
    uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
2658
31.8k
    if (isVOPC(BasicOpcode)) {
2659
58
      if (!ST.hasSDWASdst() && DstIdx != -1) {
2660
0
        // Only vcc allowed as dst on VI for VOPC
2661
0
        const MachineOperand &Dst = MI.getOperand(DstIdx);
2662
0
        if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
2663
0
          ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
2664
0
          return false;
2665
0
        }
2666
58
      } else if (!ST.hasSDWAOutModsVOPC()) {
2667
45
        // No clamp allowed on GFX9 for VOPC
2668
45
        const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2669
45
        if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
2670
0
          ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
2671
0
          return false;
2672
0
        }
2673
45
2674
45
        // No omod allowed on GFX9 for VOPC
2675
45
        const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2676
45
        if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
2677
0
          ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
2678
0
          return false;
2679
0
        }
2680
9.03M
      }
2681
58
    }
2682
31.8k
  }
2683
9.03M
2684
9.03M
  // Verify VOP*
2685
9.03M
  
  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
2686
3.21M
    // Only look at the true operands. Only a real operand can use the constant
2687
3.21M
    // bus, and we don't want to check pseudo-operands like the source modifier
2688
3.21M
    // flags.
2689
3.21M
    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
2690
3.21M
2691
3.21M
    unsigned ConstantBusCount = 0;
2692
3.21M
2693
3.21M
    if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
2694
979
      ++ConstantBusCount;
2695
3.21M
2696
3.21M
    unsigned SGPRUsed = findImplicitSGPRRead(MI);
2697
3.21M
    if (SGPRUsed != AMDGPU::NoRegister)
2698
184k
      ++ConstantBusCount;
2699
3.21M
2700
8.75M
    for (int OpIdx : OpIndices) {
2701
8.75M
      if (OpIdx == -1)
2702
2.64M
        break;
2703
6.10M
      const MachineOperand &MO = MI.getOperand(OpIdx);
2704
6.10M
      if (
usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])6.10M
) {
2705
1.25M
        if (
MO.isReg()1.25M
) {
2706
1.11M
          if (MO.getReg() != SGPRUsed)
2707
1.10M
            ++ConstantBusCount;
2708
1.11M
          SGPRUsed = MO.getReg();
2709
1.25M
        } else {
2710
137k
          ++ConstantBusCount;
2711
137k
        }
2712
1.25M
      }
2713
8.75M
    }
2714
3.21M
    if (
ConstantBusCount > 13.21M
) {
2715
0
      ErrInfo = "VOP* instruction uses the constant bus more than once";
2716
0
      return false;
2717
0
    }
2718
9.03M
  }
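
A minimal standalone sketch of the constant-bus counting rule enforced above: every literal immediate and every distinct SGPR operand takes a constant-bus slot, re-reading the same SGPR is free, and the VOP* forms allow at most one slot. The Operand struct, its fields, and the inputs in main() are hypothetical stand-ins for MachineOperand and the real register-class queries, not LLVM API.

#include <cstdio>
#include <vector>

struct Operand {
  bool IsReg;    // false => literal immediate
  unsigned Reg;  // meaningful only when IsReg is true
  bool IsSGPR;   // would come from the register class in the real code
};

// Count constant-bus reads: every literal counts, every new SGPR counts.
static bool violatesConstantBusLimit(const std::vector<Operand> &Ops) {
  unsigned Count = 0;
  unsigned SGPRUsed = ~0u;  // sentinel: no SGPR seen yet
  for (const Operand &Op : Ops) {
    if (!Op.IsReg) {
      ++Count;                   // literal constant
    } else if (Op.IsSGPR) {
      if (Op.Reg != SGPRUsed)
        ++Count;                 // a new SGPR; reading the same one again is free
      SGPRUsed = Op.Reg;
    }
  }
  return Count > 1;
}

int main() {
  // e.g. v_add_f32 v0, s0, s0 -> one SGPR read twice: legal.
  std::vector<Operand> Legal = {{true, 0, true}, {true, 0, true}};
  // e.g. v_add_f32 v0, s0, s1 -> two different SGPRs: illegal.
  std::vector<Operand> Illegal = {{true, 0, true}, {true, 1, true}};
  std::printf("%d %d\n", violatesConstantBusLimit(Legal),
              violatesConstantBusLimit(Illegal));  // prints: 0 1
}
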
2719
9.03M
2720
9.03M
  // Verify misc. restrictions on specific instructions.
2721
9.03M
  
if (9.03M
Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
2722
9.03M
      
Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F649.02M
) {
2723
10.7k
    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2724
10.7k
    const MachineOperand &Src1 = MI.getOperand(Src1Idx);
2725
10.7k
    const MachineOperand &Src2 = MI.getOperand(Src2Idx);
2726
10.7k
    if (
Src0.isReg() && 10.7k
Src1.isReg()10.4k
&&
Src2.isReg()10.4k
) {
2727
10.2k
      if (!compareMachineOp(Src0, Src1) &&
2728
10.2k
          
!compareMachineOp(Src0, Src2)5.04k
) {
2729
0
        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
2730
0
        return false;
2731
0
      }
2732
9.03M
    }
2733
10.7k
  }
2734
9.03M
2735
9.03M
  
if (9.03M
isSOPK(MI)9.03M
) {
2736
10.2k
    int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
2737
10.2k
    if (
sopkIsZext(MI)10.2k
) {
2738
672
      if (
!isUInt<16>(Imm)672
) {
2739
0
        ErrInfo = "invalid immediate for SOPK instruction";
2740
0
        return false;
2741
0
      }
2742
9.59k
    } else {
2743
9.59k
      if (
!isInt<16>(Imm)9.59k
) {
2744
0
        ErrInfo = "invalid immediate for SOPK instruction";
2745
0
        return false;
2746
0
      }
2747
9.03M
    }
2748
10.2k
  }
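
A standalone sketch of the SOPK immediate check above: zero-extending forms (sopkIsZext) require the immediate to fit in an unsigned 16-bit field, the others require a signed 16-bit value. The local isUInt16/isInt16 helpers mirror llvm::isUInt<16>/isInt<16> and are written out only to keep the example self-contained.

#include <cstdint>
#include <cstdio>

static bool isUInt16(int64_t X) { return X >= 0 && X <= 0xFFFF; }
static bool isInt16(int64_t X)  { return X >= -32768 && X <= 32767; }

static bool sopkImmIsValid(int64_t Imm, bool IsZeroExtended) {
  return IsZeroExtended ? isUInt16(Imm) : isInt16(Imm);
}

int main() {
  std::printf("%d\n", sopkImmIsValid(0xFFFF, true));   // 1: fits u16
  std::printf("%d\n", sopkImmIsValid(0xFFFF, false));  // 0: does not fit s16
  std::printf("%d\n", sopkImmIsValid(-1, false));      // 1: fits s16
}
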
2749
9.03M
2750
9.03M
  
if (9.03M
Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
2751
9.02M
      Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
2752
9.02M
      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2753
9.03M
      
Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e649.02M
) {
2754
2.76k
    const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2755
2.06k
                       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
2756
2.76k
2757
2.76k
    const unsigned StaticNumOps = Desc.getNumOperands() +
2758
2.76k
      Desc.getNumImplicitUses();
2759
2.76k
    const unsigned NumImplicitOps = IsDst ? 
2705
:
12.06k
;
2760
2.76k
2761
2.76k
    // Allow additional implicit operands. This allows a fixup done by the post
2762
2.76k
    // RA scheduler where the main implicit operand is killed and implicit-defs
2763
2.76k
    // are added for sub-registers that remain live after this instruction.
2764
2.76k
    if (
MI.getNumOperands() < StaticNumOps + NumImplicitOps2.76k
) {
2765
0
      ErrInfo = "missing implicit register operands";
2766
0
      return false;
2767
0
    }
2768
2.76k
2769
2.76k
    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2770
2.76k
    if (
IsDst2.76k
) {
2771
705
      if (
!Dst->isUse()705
) {
2772
0
        ErrInfo = "v_movreld_b32 vdst should be a use operand";
2773
0
        return false;
2774
0
      }
2775
705
2776
705
      unsigned UseOpIdx;
2777
705
      if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
2778
705
          
UseOpIdx != StaticNumOps + 1705
) {
2779
0
        ErrInfo = "movrel implicit operands should be tied";
2780
0
        return false;
2781
0
      }
2782
2.76k
    }
2783
2.76k
2784
2.76k
    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2785
2.76k
    const MachineOperand &ImpUse
2786
2.76k
      = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
2787
2.76k
    if (
!ImpUse.isReg() || 2.76k
!ImpUse.isUse()2.76k
||
2788
2.76k
        
!isSubRegOf(RI, ImpUse, IsDst ? 2.76k
*Dst705
:
Src02.06k
)) {
2789
0
      ErrInfo = "src0 should be subreg of implicit vector use";
2790
0
      return false;
2791
0
    }
2792
9.03M
  }
2793
9.03M
2794
9.03M
  // Make sure we aren't losing exec uses in the td files. This mostly requires
2795
9.03M
  // being careful when using let Uses to try to add other use registers.
2796
9.03M
  
if (9.03M
shouldReadExec(MI)9.03M
) {
2797
5.00M
    if (
!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)5.00M
) {
2798
0
      ErrInfo = "VALU instruction does not implicitly read exec mask";
2799
0
      return false;
2800
0
    }
2801
9.03M
  }
2802
9.03M
2803
9.03M
  
if (9.03M
isSMRD(MI)9.03M
) {
2804
1.19M
    if (
MI.mayStore()1.19M
) {
2805
834
      // The register offset form of scalar stores may only use m0 as the
2806
834
      // soffset register.
2807
834
      const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
2808
834
      if (
Soff && 834
Soff->getReg() != AMDGPU::M0162
) {
2809
0
        ErrInfo = "scalar stores must use m0 as offset register";
2810
0
        return false;
2811
0
      }
2812
9.03M
    }
2813
1.19M
  }
2814
9.03M
2815
9.03M
  
if (9.03M
isFLAT(MI) && 9.03M
!MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()352k
) {
2816
277k
    const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
2817
277k
    if (
Offset->getImm() != 0277k
) {
2818
0
      ErrInfo = "subtarget does not support offsets in flat instructions";
2819
0
      return false;
2820
0
    }
2821
9.03M
  }
2822
9.03M
2823
9.03M
  return true;
2824
9.03M
}
2825
2826
107k
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
2827
107k
  switch (MI.getOpcode()) {
2828
1.87k
  default: return AMDGPU::INSTRUCTION_LIST_END;
2829
21.3k
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
2830
47.5k
  case AMDGPU::COPY: return AMDGPU::COPY;
2831
329
  case AMDGPU::PHI: return AMDGPU::PHI;
2832
22
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
2833
4
  case AMDGPU::WQM: return AMDGPU::WQM;
2834
4
  case AMDGPU::WWM: return AMDGPU::WWM;
2835
23
  case AMDGPU::S_MOV_B32:
2836
23
    return MI.getOperand(1).isReg() ?
2837
23
           
AMDGPU::COPY23
:
AMDGPU::V_MOV_B32_e320
;
2838
5.33k
  case AMDGPU::S_ADD_I32:
2839
5.33k
  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
2840
3.54k
  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
2841
2.17k
  case AMDGPU::S_SUB_I32:
2842
2.17k
  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
2843
1.24k
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
2844
466
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
2845
3.26k
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
2846
3.61k
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
2847
213
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
2848
81
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
2849
50
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
2850
67
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
2851
16
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
2852
1.77k
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
2853
213
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
2854
4.44k
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
2855
1.65k
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
2856
2.72k
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
2857
138
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
2858
357
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
2859
720
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
2860
2.00k
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
2861
1.46k
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
2862
0
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
2863
12
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
2864
4
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
2865
16
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
2866
0
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
2867
0
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
2868
10
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
2869
4
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
2870
5
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
2871
0
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
2872
22
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
2873
29
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
2874
0
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
2875
3
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
2876
2
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
2877
0
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
2878
1
  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
2879
1
  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
2880
64
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
2881
14
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
2882
158
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
2883
2
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
2884
0
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
2885
77
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
2886
0
  }
2887
0
}
2888
2889
0
bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
2890
0
  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
2891
0
}
2892
2893
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
2894
1.73M
                                                      unsigned OpNo) const {
2895
1.73M
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2896
1.73M
  const MCInstrDesc &Desc = get(MI.getOpcode());
2897
1.73M
  if (
MI.isVariadic() || 1.73M
OpNo >= Desc.getNumOperands()1.61M
||
2898
1.73M
      
Desc.OpInfo[OpNo].RegClass == -11.33M
) {
2899
763k
    unsigned Reg = MI.getOperand(OpNo).getReg();
2900
763k
2901
763k
    if (TargetRegisterInfo::isVirtualRegister(Reg))
2902
452k
      return MRI.getRegClass(Reg);
2903
311k
    return RI.getPhysRegClass(Reg);
2904
311k
  }
2905
969k
2906
969k
  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
2907
969k
  return RI.getRegClass(RCID);
2908
969k
}
2909
2910
137k
bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
2911
137k
  switch (MI.getOpcode()) {
2912
88.5k
  case AMDGPU::COPY:
2913
88.5k
  case AMDGPU::REG_SEQUENCE:
2914
88.5k
  case AMDGPU::PHI:
2915
88.5k
  case AMDGPU::INSERT_SUBREG:
2916
88.5k
    return RI.hasVGPRs(getOpRegClass(MI, 0));
2917
48.7k
  default:
2918
48.7k
    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
2919
0
  }
2920
0
}
2921
2922
20.1k
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
2923
20.1k
  MachineBasicBlock::iterator I = MI;
2924
20.1k
  MachineBasicBlock *MBB = MI.getParent();
2925
20.1k
  MachineOperand &MO = MI.getOperand(OpIdx);
2926
20.1k
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
2927
20.1k
  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
2928
20.1k
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
2929
20.1k
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
2930
20.1k
  if (MO.isReg())
2931
20.1k
    Opcode = AMDGPU::COPY;
2932
0
  else 
if (0
RI.isSGPRClass(RC)0
)
2933
0
    Opcode = AMDGPU::S_MOV_B32;
2934
20.1k
2935
20.1k
  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
2936
20.1k
  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
2937
2.18k
    VRC = &AMDGPU::VReg_64RegClass;
2938
20.1k
  else
2939
17.9k
    VRC = &AMDGPU::VGPR_32RegClass;
2940
20.1k
2941
20.1k
  unsigned Reg = MRI.createVirtualRegister(VRC);
2942
20.1k
  DebugLoc DL = MBB->findDebugLoc(I);
2943
20.1k
  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
2944
20.1k
  MO.ChangeToRegister(Reg, false);
2945
20.1k
}
2946
2947
unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
2948
                                         MachineRegisterInfo &MRI,
2949
                                         MachineOperand &SuperReg,
2950
                                         const TargetRegisterClass *SuperRC,
2951
                                         unsigned SubIdx,
2952
                                         const TargetRegisterClass *SubRC)
2953
1.96k
                                         const {
2954
1.96k
  MachineBasicBlock *MBB = MI->getParent();
2955
1.96k
  DebugLoc DL = MI->getDebugLoc();
2956
1.96k
  unsigned SubReg = MRI.createVirtualRegister(SubRC);
2957
1.96k
2958
1.96k
  if (
SuperReg.getSubReg() == AMDGPU::NoSubRegister1.96k
) {
2959
1.96k
    BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
2960
1.96k
      .addReg(SuperReg.getReg(), 0, SubIdx);
2961
1.96k
    return SubReg;
2962
1.96k
  }
2963
0
2964
0
  // Just in case the super register is itself a sub-register, copy it to a new
2965
0
  // value so we don't need to worry about merging its subreg index with the
2966
0
  // SubIdx passed to this function. The register coalescer should be able to
2967
0
  // eliminate this extra copy.
2968
0
  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
2969
0
2970
0
  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
2971
0
    .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
2972
0
2973
0
  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
2974
0
    .addReg(NewSuperReg, 0, SubIdx);
2975
0
2976
0
  return SubReg;
2977
0
}
2978
2979
MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
2980
  MachineBasicBlock::iterator MII,
2981
  MachineRegisterInfo &MRI,
2982
  MachineOperand &Op,
2983
  const TargetRegisterClass *SuperRC,
2984
  unsigned SubIdx,
2985
1.92k
  const TargetRegisterClass *SubRC) const {
2986
1.92k
  if (
Op.isImm()1.92k
) {
2987
0
    if (SubIdx == AMDGPU::sub0)
2988
0
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
2989
0
    
if (0
SubIdx == AMDGPU::sub10
)
2990
0
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
2991
0
2992
0
    
llvm_unreachable0
("Unhandled register index for immediate");
2993
0
  }
2994
1.92k
2995
1.92k
  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
2996
1.92k
                                       SubIdx, SubRC);
2997
1.92k
  return MachineOperand::CreateReg(SubReg, false);
2998
1.92k
}
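
A standalone sketch of the immediate path above: when the operand is a 64-bit immediate rather than a register, the sub0 half is the low 32 bits and the sub1 half is the high 32 bits. The helper names here are illustrative only.

#include <cstdint>
#include <cstdio>

static int32_t extractSub0(int64_t Imm) { return static_cast<int32_t>(Imm); }
static int32_t extractSub1(int64_t Imm) { return static_cast<int32_t>(Imm >> 32); }

int main() {
  int64_t Imm = 0x123456789ABCDEF0LL;
  std::printf("sub0 = 0x%08x, sub1 = 0x%08x\n",
              static_cast<uint32_t>(extractSub0(Imm)),
              static_cast<uint32_t>(extractSub1(Imm)));
  // prints: sub0 = 0x9abcdef0, sub1 = 0x12345678
}
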
2999
3000
// Change the order of operands from (0, 1, 2) to (0, 2, 1)
3001
4.98k
void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
3002
4.98k
  assert(Inst.getNumExplicitOperands() == 3);
3003
4.98k
  MachineOperand Op1 = Inst.getOperand(1);
3004
4.98k
  Inst.RemoveOperand(1);
3005
4.98k
  Inst.addOperand(Op1);
3006
4.98k
}
3007
3008
bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
3009
                                    const MCOperandInfo &OpInfo,
3010
322k
                                    const MachineOperand &MO) const {
3011
322k
  if (!MO.isReg())
3012
464
    return false;
3013
321k
3014
321k
  unsigned Reg = MO.getReg();
3015
321k
  const TargetRegisterClass *RC =
3016
321k
    TargetRegisterInfo::isVirtualRegister(Reg) ?
3017
317k
    MRI.getRegClass(Reg) :
3018
4.37k
    RI.getPhysRegClass(Reg);
3019
322k
3020
322k
  const SIRegisterInfo *TRI =
3021
322k
      static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
3022
322k
  RC = TRI->getSubRegClass(RC, MO.getSubReg());
3023
322k
3024
322k
  // In order to be legal, the common sub-class must be equal to the
3025
322k
  // class of the current operand.  For example:
3026
322k
  //
3027
322k
  // v_mov_b32 s0 ; Operand defined as vsrc_b32
3028
322k
  //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
3029
322k
  //
3030
322k
  // s_sendmsg 0, s0 ; Operand defined as m0reg
3031
322k
  //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
3032
322k
3033
322k
  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
3034
322k
}
3035
3036
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
3037
                                     const MCOperandInfo &OpInfo,
3038
0
                                     const MachineOperand &MO) const {
3039
0
  if (MO.isReg())
3040
0
    return isLegalRegOperand(MRI, OpInfo, MO);
3041
0
3042
0
  // Handle non-register types that are treated like immediates.
3043
0
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
3044
0
  return true;
3045
0
}
3046
3047
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
3048
393k
                                 const MachineOperand *MO) const {
3049
393k
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3050
393k
  const MCInstrDesc &InstDesc = MI.getDesc();
3051
393k
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
3052
393k
  const TargetRegisterClass *DefinedRC =
3053
393k
      OpInfo.RegClass != -1 ? 
RI.getRegClass(OpInfo.RegClass)393k
:
nullptr0
;
3054
393k
  if (!MO)
3055
0
    MO = &MI.getOperand(OpIdx);
3056
393k
3057
393k
  if (
isVALU(MI) && 393k
usesConstantBus(MRI, *MO, OpInfo)274k
) {
3058
88.5k
3059
88.5k
    RegSubRegPair SGPRUsed;
3060
88.5k
    if (MO->isReg())
3061
69.9k
      SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
3062
88.5k
3063
459k
    for (unsigned i = 0, e = MI.getNumOperands(); 
i != e459k
;
++i371k
) {
3064
399k
      if (i == OpIdx)
3065
81.7k
        continue;
3066
317k
      const MachineOperand &Op = MI.getOperand(i);
3067
317k
      if (
Op.isReg()317k
) {
3068
271k
        if (
(Op.getReg() != SGPRUsed.Reg || 271k
Op.getSubReg() != SGPRUsed.SubReg37.6k
) &&
3069
271k
            
usesConstantBus(MRI, Op, InstDesc.OpInfo[i])234k
) {
3070
28.3k
          return false;
3071
28.3k
        }
3072
46.0k
      } else 
if (46.0k
InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM3246.0k
) {
3073
23
        return false;
3074
23
      }
3075
399k
    }
3076
88.5k
  }
3077
393k
3078
365k
  
if (365k
MO->isReg()365k
) {
3079
296k
    assert(DefinedRC);
3080
296k
    return isLegalRegOperand(MRI, OpInfo, *MO);
3081
296k
  }
3082
68.8k
3083
68.8k
  // Handle non-register types that are treated like immediates.
3084
365k
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
3085
68.8k
3086
68.8k
  if (
!DefinedRC68.8k
) {
3087
0
    // This operand expects an immediate.
3088
0
    return true;
3089
0
  }
3090
68.8k
3091
68.8k
  return isImmOperandLegal(MI, OpIdx, *MO);
3092
68.8k
}
3093
3094
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
3095
17.2k
                                       MachineInstr &MI) const {
3096
17.2k
  unsigned Opc = MI.getOpcode();
3097
17.2k
  const MCInstrDesc &InstrDesc = get(Opc);
3098
17.2k
3099
17.2k
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3100
17.2k
  MachineOperand &Src1 = MI.getOperand(Src1Idx);
3101
17.2k
3102
17.2k
  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
3103
17.2k
  // we need to only have one constant bus use.
3104
17.2k
  //
3105
17.2k
  // Note we do not need to worry about literal constants here. They are
3106
17.2k
  // disabled for the operand type for instructions because they will always
3107
17.2k
  // violate the one constant bus use rule.
3108
17.2k
  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
3109
17.2k
  if (
HasImplicitSGPR17.2k
) {
3110
4.78k
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3111
4.78k
    MachineOperand &Src0 = MI.getOperand(Src0Idx);
3112
4.78k
3113
4.78k
    if (
Src0.isReg() && 4.78k
RI.isSGPRReg(MRI, Src0.getReg())4.78k
)
3114
2.68k
      legalizeOpWithMove(MI, Src0Idx);
3115
4.78k
  }
3116
17.2k
3117
17.2k
  // VOP2 src0 instructions support all operand types, so we don't need to check
3118
17.2k
  // their legality. If src1 is already legal, we don't need to do anything.
3119
17.2k
  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
3120
6.39k
    return;
3121
10.8k
3122
10.8k
  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
3123
10.8k
  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
3124
10.8k
  // select is uniform.
3125
10.8k
  
if (10.8k
Opc == AMDGPU::V_READLANE_B32 && 10.8k
Src1.isReg()1
&&
3126
10.8k
      
RI.isVGPR(MRI, Src1.getReg())1
) {
3127
1
    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3128
1
    const DebugLoc &DL = MI.getDebugLoc();
3129
1
    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3130
1
        .add(Src1);
3131
1
    Src1.ChangeToRegister(Reg, false);
3132
1
    return;
3133
1
  }
3134
10.8k
3135
10.8k
  // We do not use commuteInstruction here because it is too aggressive and will
3136
10.8k
  // commute if it is possible. We only want to commute here if it improves
3137
10.8k
  // legality. This can be called a fairly large number of times so don't waste
3138
10.8k
  // compile time pointlessly swapping and checking legality again.
3139
10.8k
  
if (10.8k
HasImplicitSGPR || 10.8k
!MI.isCommutable()8.72k
) {
3140
2.10k
    legalizeOpWithMove(MI, Src1Idx);
3141
2.10k
    return;
3142
2.10k
  }
3143
8.72k
3144
8.72k
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3145
8.72k
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3146
8.72k
3147
8.72k
  // If src0 can be used as src1, commuting will make the operands legal.
3148
8.72k
  // Otherwise we have to give up and insert a move.
3149
8.72k
  //
3150
8.72k
  // TODO: Other immediate-like operand kinds could be commuted if there was a
3151
8.72k
  // MachineOperand::ChangeTo* for them.
3152
8.72k
  if (
(!Src1.isImm() && 8.72k
!Src1.isReg()8.25k
) ||
3153
8.72k
      
!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)8.72k
) {
3154
0
    legalizeOpWithMove(MI, Src1Idx);
3155
0
    return;
3156
0
  }
3157
8.72k
3158
8.72k
  int CommutedOpc = commuteOpcode(MI);
3159
8.72k
  if (
CommutedOpc == -18.72k
) {
3160
0
    legalizeOpWithMove(MI, Src1Idx);
3161
0
    return;
3162
0
  }
3163
8.72k
3164
8.72k
  MI.setDesc(get(CommutedOpc));
3165
8.72k
3166
8.72k
  unsigned Src0Reg = Src0.getReg();
3167
8.72k
  unsigned Src0SubReg = Src0.getSubReg();
3168
8.72k
  bool Src0Kill = Src0.isKill();
3169
8.72k
3170
8.72k
  if (Src1.isImm())
3171
464
    Src0.ChangeToImmediate(Src1.getImm());
3172
8.25k
  else 
if (8.25k
Src1.isReg()8.25k
) {
3173
8.25k
    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
3174
8.25k
    Src0.setSubReg(Src1.getSubReg());
3175
8.25k
  } else
3176
0
    llvm_unreachable("Should only have register or immediate operands");
3177
8.72k
3178
8.72k
  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
3179
8.72k
  Src1.setSubReg(Src0SubReg);
3180
8.72k
}
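
A much-simplified model of the commute performed above: when src1 is not legal but src0 would be, the two sources trade places and the operand kind (register vs. immediate) travels with the value; the switch to the commuted opcode and the kill/subreg bookkeeping are elided. The Operand type is a hypothetical stand-in for MachineOperand.

#include <cstdint>
#include <cstdio>

struct Operand {
  bool IsImm;
  int64_t Imm;   // valid when IsImm
  unsigned Reg;  // valid when !IsImm
};

static void commuteSrc0Src1(Operand &Src0, Operand &Src1) {
  Operand OldSrc0 = Src0;
  Src0 = Src1;      // src1's value (register or immediate) becomes src0
  Src1 = OldSrc0;   // the old src0 becomes src1
}

int main() {
  Operand Src0 = {false, 0, /*vgpr*/ 3};  // a VGPR: legal in either slot
  Operand Src1 = {true, 42, 0};           // an immediate: not legal as vsrc1
  commuteSrc0Src1(Src0, Src1);
  std::printf("src0: imm=%lld  src1: reg=v%u\n",
              (long long)Src0.Imm, Src1.Reg);  // src0: imm=42  src1: reg=v3
}
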
3181
3182
// Legalize VOP3 operands. Because all operand types are supported for any
3183
// operand, and since literal constants are not allowed and should never be
3184
// seen, we only need to worry about inserting copies if we use multiple SGPR
3185
// operands.
3186
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
3187
47.7k
                                       MachineInstr &MI) const {
3188
47.7k
  unsigned Opc = MI.getOpcode();
3189
47.7k
3190
47.7k
  int VOP3Idx[3] = {
3191
47.7k
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
3192
47.7k
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
3193
47.7k
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
3194
47.7k
  };
3195
47.7k
3196
47.7k
  // Find the one SGPR operand we are allowed to use.
3197
47.7k
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
3198
47.7k
3199
154k
  for (unsigned i = 0; 
i < 3154k
;
++i106k
) {
3200
139k
    int Idx = VOP3Idx[i];
3201
139k
    if (Idx == -1)
3202
32.8k
      break;
3203
106k
    MachineOperand &MO = MI.getOperand(Idx);
3204
106k
3205
106k
    // We should never see a VOP3 instruction with an illegal immediate operand.
3206
106k
    if (!MO.isReg())
3207
10.8k
      continue;
3208
95.4k
3209
95.4k
    
if (95.4k
!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))95.4k
)
3210
47.4k
      continue; // VGPRs are legal
3211
47.9k
3212
47.9k
    
if (47.9k
SGPRReg == AMDGPU::NoRegister || 47.9k
SGPRReg == MO.getReg()22.5k
) {
3213
32.6k
      SGPRReg = MO.getReg();
3214
32.6k
      // We can use one SGPR in each VOP3 instruction.
3215
32.6k
      continue;
3216
32.6k
    }
3217
15.3k
3218
15.3k
    // If we make it this far, then the operand is not legal and we must
3219
15.3k
    // legalize it.
3220
15.3k
    legalizeOpWithMove(MI, Idx);
3221
15.3k
  }
3222
47.7k
}
3223
3224
unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
3225
21
                                         MachineRegisterInfo &MRI) const {
3226
21
  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
3227
21
  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
3228
21
  unsigned DstReg = MRI.createVirtualRegister(SRC);
3229
21
  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
3230
21
3231
21
  SmallVector<unsigned, 8> SRegs;
3232
87
  for (unsigned i = 0; 
i < SubRegs87
;
++i66
) {
3233
66
    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3234
66
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3235
66
            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
3236
66
        .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
3237
66
    SRegs.push_back(SGPR);
3238
66
  }
3239
21
3240
21
  MachineInstrBuilder MIB =
3241
21
      BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3242
21
              get(AMDGPU::REG_SEQUENCE), DstReg);
3243
87
  for (unsigned i = 0; 
i < SubRegs87
;
++i66
) {
3244
66
    MIB.addReg(SRegs[i]);
3245
66
    MIB.addImm(RI.getSubRegFromChannel(i));
3246
66
  }
3247
21
  return DstReg;
3248
21
}
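
An illustrative, standalone sketch of the expansion above: an N*32-bit value held in VGPRs is copied to SGPRs one 32-bit channel at a time with v_readfirstlane_b32, and the pieces are then reassembled with a REG_SEQUENCE. The printed pseudo-assembly is for illustration only and does not use real register names.

#include <cstdio>

int main() {
  const unsigned RegSizeInBits = 64;           // e.g. a pointer in a VReg_64
  const unsigned SubRegs = RegSizeInBits / 32; // one readfirstlane per channel
  for (unsigned i = 0; i < SubRegs; ++i)
    std::printf("v_readfirstlane_b32 s%u, v_src[sub%u]\n", i, i);
  std::printf("REG_SEQUENCE s_dst");
  for (unsigned i = 0; i < SubRegs; ++i)
    std::printf(", s%u, sub%u", i, i);
  std::printf("\n");
}
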
3249
3250
void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
3251
13
                                       MachineInstr &MI) const {
3252
13
3253
13
  // If the pointer is stored in VGPRs, then we need to move it to
3254
13
  // SGPRs using v_readfirstlane.  This is safe because we only select
3255
13
  // loads with uniform pointers to SMRD instructions, so we know the
3256
13
  // pointer value is uniform.
3257
13
  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
3258
13
  if (
SBase && 13
!RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))13
) {
3259
13
      unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3260
13
      SBase->setReg(SGPR);
3261
13
  }
3262
13
}
3263
3264
void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
3265
                                         MachineBasicBlock::iterator I,
3266
                                         const TargetRegisterClass *DstRC,
3267
                                         MachineOperand &Op,
3268
                                         MachineRegisterInfo &MRI,
3269
27.4k
                                         const DebugLoc &DL) const {
3270
27.4k
  unsigned OpReg = Op.getReg();
3271
27.4k
  unsigned OpSubReg = Op.getSubReg();
3272
27.4k
3273
27.4k
  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
3274
27.4k
      RI.getRegClassForReg(MRI, OpReg), OpSubReg);
3275
27.4k
3276
27.4k
  // Check if operand is already the correct register class.
3277
27.4k
  if (DstRC == OpRC)
3278
253
    return;
3279
27.1k
3280
27.1k
  unsigned DstReg = MRI.createVirtualRegister(DstRC);
3281
27.1k
  MachineInstr *Copy =
3282
27.1k
      BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
3283
27.1k
3284
27.1k
  Op.setReg(DstReg);
3285
27.1k
  Op.setSubReg(0);
3286
27.1k
3287
27.1k
  MachineInstr *Def = MRI.getVRegDef(OpReg);
3288
27.1k
  if (!Def)
3289
0
    return;
3290
27.1k
3291
27.1k
  // Try to eliminate the copy if it is copying an immediate value.
3292
27.1k
  
if (27.1k
Def->isMoveImmediate()27.1k
)
3293
5.47k
    FoldImmediate(*Copy, *Def, OpReg, &MRI);
3294
27.4k
}
3295
3296
90.6k
void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
3297
90.6k
  MachineFunction &MF = *MI.getParent()->getParent();
3298
90.6k
  MachineRegisterInfo &MRI = MF.getRegInfo();
3299
90.6k
3300
90.6k
  // Legalize VOP2
3301
90.6k
  if (
isVOP2(MI) || 90.6k
isVOPC(MI)73.4k
) {
3302
17.2k
    legalizeOperandsVOP2(MRI, MI);
3303
17.2k
    return;
3304
17.2k
  }
3305
73.4k
3306
73.4k
  // Legalize VOP3
3307
73.4k
  
if (73.4k
isVOP3(MI)73.4k
) {
3308
19.4k
    legalizeOperandsVOP3(MRI, MI);
3309
19.4k
    return;
3310
19.4k
  }
3311
54.0k
3312
54.0k
  // Legalize SMRD
3313
54.0k
  
if (54.0k
isSMRD(MI)54.0k
) {
3314
13
    legalizeOperandsSMRD(MRI, MI);
3315
13
    return;
3316
13
  }
3317
53.9k
3318
53.9k
  // Legalize REG_SEQUENCE and PHI
3319
53.9k
  // The register class of the operands must be the same type as the register
3320
53.9k
  // class of the output.
3321
53.9k
  
if (53.9k
MI.getOpcode() == AMDGPU::PHI53.9k
) {
3322
329
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
3323
993
    for (unsigned i = 1, e = MI.getNumOperands(); 
i != e993
;
i += 2664
) {
3324
664
      if (!MI.getOperand(i).isReg() ||
3325
664
          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
3326
0
        continue;
3327
664
      const TargetRegisterClass *OpRC =
3328
664
          MRI.getRegClass(MI.getOperand(i).getReg());
3329
664
      if (
RI.hasVGPRs(OpRC)664
) {
3330
233
        VRC = OpRC;
3331
664
      } else {
3332
431
        SRC = OpRC;
3333
431
      }
3334
664
    }
3335
329
3336
329
    // If any of the operands are VGPR registers, then they all must be VGPRs;
3337
329
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
3338
329
    // them.
3339
329
    if (
VRC || 329
!RI.isSGPRClass(getOpRegClass(MI, 0))108
) {
3340
329
      if (
!VRC329
) {
3341
108
        assert(SRC);
3342
108
        VRC = RI.getEquivalentVGPRClass(SRC);
3343
108
      }
3344
329
      RC = VRC;
3345
0
    } else {
3346
0
      RC = SRC;
3347
0
    }
3348
329
3349
329
    // Update all the operands so they have the same type.
3350
993
    for (unsigned I = 1, E = MI.getNumOperands(); 
I != E993
;
I += 2664
) {
3351
664
      MachineOperand &Op = MI.getOperand(I);
3352
664
      if (
!Op.isReg() || 664
!TargetRegisterInfo::isVirtualRegister(Op.getReg())664
)
3353
0
        continue;
3354
664
3355
664
      // MI is a PHI instruction.
3356
664
      MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
3357
664
      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
3358
664
3359
664
      // Avoid creating no-op copies with the same src and dst reg class.  These
3360
664
      // confuse some of the machine passes.
3361
664
      legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
3362
664
    }
3363
329
  }
3364
53.9k
3365
53.9k
  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
3366
53.9k
  // VGPR dest type and SGPR sources, insert copies so all operands are
3367
53.9k
  // VGPRs. This seems to help operand folding / the register coalescer.
3368
53.9k
  if (
MI.getOpcode() == AMDGPU::REG_SEQUENCE53.9k
) {
3369
21.3k
    MachineBasicBlock *MBB = MI.getParent();
3370
21.3k
    const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
3371
21.3k
    if (
RI.hasVGPRs(DstRC)21.3k
) {
3372
21.3k
      // Update all the operands so they are VGPR register classes. These may
3373
21.3k
      // not be the same register class because REG_SEQUENCE supports mixing
3374
21.3k
      // subregister index types e.g. sub0_sub1 + sub2 + sub3
3375
73.8k
      for (unsigned I = 1, E = MI.getNumOperands(); 
I != E73.8k
;
I += 252.4k
) {
3376
52.4k
        MachineOperand &Op = MI.getOperand(I);
3377
52.4k
        if (
!Op.isReg() || 52.4k
!TargetRegisterInfo::isVirtualRegister(Op.getReg())52.4k
)
3378
0
          continue;
3379
52.4k
3380
52.4k
        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
3381
52.4k
        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
3382
52.4k
        if (VRC == OpRC)
3383
25.7k
          continue;
3384
26.7k
3385
26.7k
        legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
3386
26.7k
        Op.setIsKill();
3387
26.7k
      }
3388
21.3k
    }
3389
21.3k
3390
21.3k
    return;
3391
21.3k
  }
3392
32.6k
3393
32.6k
  // Legalize INSERT_SUBREG
3394
32.6k
  // src0 must have the same register class as dst
3395
32.6k
  
if (32.6k
MI.getOpcode() == AMDGPU::INSERT_SUBREG32.6k
) {
3396
22
    unsigned Dst = MI.getOperand(0).getReg();
3397
22
    unsigned Src0 = MI.getOperand(1).getReg();
3398
22
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
3399
22
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
3400
22
    if (
DstRC != Src0RC22
) {
3401
2
      MachineBasicBlock *MBB = MI.getParent();
3402
2
      MachineOperand &Op = MI.getOperand(1);
3403
2
      legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
3404
2
    }
3405
22
    return;
3406
22
  }
3407
32.6k
3408
32.6k
  // Legalize MIMG and MUBUF/MTBUF for shaders.
3409
32.6k
  //
3410
32.6k
  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
3411
32.6k
  // scratch memory access. In both cases, the legalization never involves
3412
32.6k
  // conversion to the addr64 form.
3413
32.6k
  
if (32.6k
isMIMG(MI) ||
3414
32.6k
      (AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
3415
32.6k
       
(isMUBUF(MI) || 208
isMTBUF(MI)204
))) {
3416
8
    MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
3417
8
    if (
SRsrc && 8
!RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))8
) {
3418
6
      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
3419
6
      SRsrc->setReg(SGPR);
3420
6
    }
3421
8
3422
8
    MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
3423
8
    if (
SSamp && 8
!RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))4
) {
3424
2
      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
3425
2
      SSamp->setReg(SGPR);
3426
2
    }
3427
8
    return;
3428
8
  }
3429
32.6k
3430
32.6k
  // Legalize MUBUF* instructions by converting to addr64 form.
3431
32.6k
  // FIXME: If we start using the non-addr64 instructions for compute, we
3432
32.6k
  // may need to legalize them as above. This especially applies to the
3433
32.6k
  // buffer_load_format_* variants and variants with idxen (or bothen).
3434
32.6k
  int SRsrcIdx =
3435
32.6k
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
3436
32.6k
  if (
SRsrcIdx != -132.6k
) {
3437
39
    // We have an MUBUF instruction
3438
39
    MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
3439
39
    unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
3440
39
    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
3441
39
                                             RI.getRegClass(SRsrcRC))) {
3442
0
      // The operands are legal.
3443
0
      // FIXME: We may need to legalize operands besides srsrc.
3444
0
      return;
3445
0
    }
3446
39
3447
39
    MachineBasicBlock &MBB = *MI.getParent();
3448
39
3449
39
    // Extract the ptr from the resource descriptor.
3450
39
    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
3451
39
      &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
3452
39
3453
39
    // Create an empty resource descriptor
3454
39
    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3455
39
    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3456
39
    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3457
39
    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3458
39
    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
3459
39
3460
39
    // Zero64 = 0
3461
39
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
3462
39
        .addImm(0);
3463
39
3464
39
    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
3465
39
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
3466
39
        .addImm(RsrcDataFormat & 0xFFFFFFFF);
3467
39
3468
39
    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
3469
39
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
3470
39
        .addImm(RsrcDataFormat >> 32);
3471
39
3472
39
    // NewSRsrc = {Zero64, SRsrcFormat}
3473
39
    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
3474
39
        .addReg(Zero64)
3475
39
        .addImm(AMDGPU::sub0_sub1)
3476
39
        .addReg(SRsrcFormatLo)
3477
39
        .addImm(AMDGPU::sub2)
3478
39
        .addReg(SRsrcFormatHi)
3479
39
        .addImm(AMDGPU::sub3);
3480
39
3481
39
    MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
3482
39
    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3483
39
    if (
VAddr39
) {
3484
8
      // This is already an ADDR64 instruction so we need to add the pointer
3485
8
      // extracted from the resource descriptor to the current value of VAddr.
3486
8
      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3487
8
      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3488
8
3489
8
      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
3490
8
      DebugLoc DL = MI.getDebugLoc();
3491
8
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
3492
8
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
3493
8
        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
3494
8
3495
8
      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
3496
8
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
3497
8
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
3498
8
        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
3499
8
3500
8
      // NewVaddr = {NewVaddrHi, NewVaddrLo}
3501
8
      BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
3502
8
          .addReg(NewVAddrLo)
3503
8
          .addImm(AMDGPU::sub0)
3504
8
          .addReg(NewVAddrHi)
3505
8
          .addImm(AMDGPU::sub1);
3506
39
    } else {
3507
31
      // This instruction is the _OFFSET variant, so we need to convert it to
3508
31
      // ADDR64.
3509
31
      assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
3510
31
             < SISubtarget::VOLCANIC_ISLANDS &&
3511
31
             "FIXME: Need to emit flat atomics here");
3512
31
3513
31
      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
3514
31
      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3515
31
      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
3516
31
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
3517
31
3518
31
      // Atomics with return have an additional tied operand and are
3519
31
      // missing some of the special bits.
3520
31
      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
3521
31
      MachineInstr *Addr64;
3522
31
3523
31
      if (
!VDataIn31
) {
3524
30
        // Regular buffer load / store.
3525
30
        MachineInstrBuilder MIB =
3526
30
            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3527
30
                .add(*VData)
3528
30
                .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3529
30
                // This will be replaced later
3530
30
                // with the new value of vaddr.
3531
30
                .add(*SRsrc)
3532
30
                .add(*SOffset)
3533
30
                .add(*Offset);
3534
30
3535
30
        // Atomics do not have this operand.
3536
30
        if (const MachineOperand *GLC =
3537
29
                getNamedOperand(MI, AMDGPU::OpName::glc)) {
3538
29
          MIB.addImm(GLC->getImm());
3539
29
        }
3540
30
3541
30
        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
3542
30
3543
30
        if (const MachineOperand *TFE =
3544
29
                getNamedOperand(MI, AMDGPU::OpName::tfe)) {
3545
29
          MIB.addImm(TFE->getImm());
3546
29
        }
3547
30
3548
30
        MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
3549
30
        Addr64 = MIB;
3550
31
      } else {
3551
1
        // Atomics with return.
3552
1
        Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3553
1
                     .add(*VData)
3554
1
                     .add(*VDataIn)
3555
1
                     .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3556
1
                     // This will be replaced later
3557
1
                     // with the new value of vaddr.
3558
1
                     .add(*SRsrc)
3559
1
                     .add(*SOffset)
3560
1
                     .add(*Offset)
3561
1
                     .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
3562
1
                     .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
3563
1
      }
3564
31
3565
31
      MI.removeFromParent();
3566
31
3567
31
      // NewVaddr = {NewVaddrHi, NewVaddrLo}
3568
31
      BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
3569
31
              NewVAddr)
3570
31
          .addReg(SRsrcPtr, 0, AMDGPU::sub0)
3571
31
          .addImm(AMDGPU::sub0)
3572
31
          .addReg(SRsrcPtr, 0, AMDGPU::sub1)
3573
31
          .addImm(AMDGPU::sub1);
3574
31
3575
31
      VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
3576
31
      SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
3577
31
    }
3578
39
3579
39
    // Update the instruction to use NewVaddr
3580
39
    VAddr->setReg(NewVAddr);
3581
39
    // Update the instruction to use NewSRsrc
3582
39
    SRsrc->setReg(NewSRsrc);
3583
39
  }
3584
90.6k
}
3585
3586
31.6k
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
3587
31.6k
  SetVectorType Worklist;
3588
31.6k
  Worklist.insert(&TopInst);
3589
31.6k
3590
138k
  while (
!Worklist.empty()138k
) {
3591
107k
    MachineInstr &Inst = *Worklist.pop_back_val();
3592
107k
    MachineBasicBlock *MBB = Inst.getParent();
3593
107k
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3594
107k
3595
107k
    unsigned Opcode = Inst.getOpcode();
3596
107k
    unsigned NewOpcode = getVALUOp(Inst);
3597
107k
3598
107k
    // Handle some special cases
3599
107k
    switch (Opcode) {
3600
94.2k
    default:
3601
94.2k
      break;
3602
111
    case AMDGPU::S_AND_B64:
3603
111
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
3604
111
      Inst.eraseFromParent();
3605
111
      continue;
3606
107k
3607
216
    case AMDGPU::S_OR_B64:
3608
216
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
3609
216
      Inst.eraseFromParent();
3610
216
      continue;
3611
107k
3612
130
    case AMDGPU::S_XOR_B64:
3613
130
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
3614
130
      Inst.eraseFromParent();
3615
130
      continue;
3616
107k
3617
16
    case AMDGPU::S_NOT_B64:
3618
16
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
3619
16
      Inst.eraseFromParent();
3620
16
      continue;
3621
107k
3622
26
    case AMDGPU::S_BCNT1_I32_B64:
3623
26
      splitScalar64BitBCNT(Worklist, Inst);
3624
26
      Inst.eraseFromParent();
3625
26
      continue;
3626
107k
3627
1.24k
    case AMDGPU::S_BFE_I64:
3628
1.24k
      splitScalar64BitBFE(Worklist, Inst);
3629
1.24k
      Inst.eraseFromParent();
3630
1.24k
      continue;
3631
107k
3632
4.44k
    case AMDGPU::S_LSHL_B32:
3633
4.44k
      if (
ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS4.44k
) {
3634
2.10k
        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
3635
2.10k
        swapOperands(Inst);
3636
2.10k
      }
3637
4.44k
      break;
3638
1.77k
    case AMDGPU::S_ASHR_I32:
3639
1.77k
      if (
ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS1.77k
) {
3640
741
        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
3641
741
        swapOperands(Inst);
3642
741
      }
3643
1.77k
      break;
3644
2.72k
    case AMDGPU::S_LSHR_B32:
3645
2.72k
      if (
ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS2.72k
) {
3646
1.24k
        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
3647
1.24k
        swapOperands(Inst);
3648
1.24k
      }
3649
2.72k
      break;
3650
1.65k
    case AMDGPU::S_LSHL_B64:
3651
1.65k
      if (
ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS1.65k
) {
3652
719
        NewOpcode = AMDGPU::V_LSHLREV_B64;
3653
719
        swapOperands(Inst);
3654
719
      }
3655
1.65k
      break;
3656
213
    case AMDGPU::S_ASHR_I64:
3657
213
      if (
ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS213
) {
3658
95
        NewOpcode = AMDGPU::V_ASHRREV_I64;
3659
95
        swapOperands(Inst);
3660
95
      }
3661
213
      break;
3662
138
    case AMDGPU::S_LSHR_B64:
3663
138
      if (
ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS138
) {
3664
69
        NewOpcode = AMDGPU::V_LSHRREV_B64;
3665
69
        swapOperands(Inst);
3666
69
      }
3667
138
      break;
3668
107k
3669
16
    case AMDGPU::S_ABS_I32:
3670
16
      lowerScalarAbs(Worklist, Inst);
3671
16
      Inst.eraseFromParent();
3672
16
      continue;
3673
107k
3674
77
    case AMDGPU::S_CBRANCH_SCC0:
3675
77
    case AMDGPU::S_CBRANCH_SCC1:
3676
77
      // Clear unused bits of vcc
3677
77
      BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
3678
77
              AMDGPU::VCC)
3679
77
          .addReg(AMDGPU::EXEC)
3680
77
          .addReg(AMDGPU::VCC);
3681
77
      break;
3682
77
3683
0
    case AMDGPU::S_BFE_U64:
3684
0
    case AMDGPU::S_BFM_B64:
3685
0
      llvm_unreachable("Moving this op to VALU not implemented");
3686
0
3687
52
    case AMDGPU::S_PACK_LL_B32_B16:
3688
52
    case AMDGPU::S_PACK_LH_B32_B16:
3689
52
    case AMDGPU::S_PACK_HH_B32_B16:
3690
52
      movePackToVALU(Worklist, MRI, Inst);
3691
52
      Inst.eraseFromParent();
3692
52
      continue;
3693
52
3694
12
    case AMDGPU::S_XNOR_B32:
3695
12
      lowerScalarXnor(Worklist, Inst);
3696
12
      Inst.eraseFromParent();
3697
12
      continue;
3698
52
3699
4
    case AMDGPU::S_XNOR_B64:
3700
4
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32);
3701
4
      Inst.eraseFromParent();
3702
4
      continue;
3703
105k
    }
3704
105k
3705
105k
    
if (105k
NewOpcode == AMDGPU::INSTRUCTION_LIST_END105k
) {
3706
61
      // We cannot move this instruction to the VALU, so we should try to
3707
61
      // legalize its operands instead.
3708
61
      legalizeOperands(Inst);
3709
61
      continue;
3710
61
    }
3711
105k
3712
105k
    // Use the new VALU Opcode.
3713
105k
    const MCInstrDesc &NewDesc = get(NewOpcode);
3714
105k
    Inst.setDesc(NewDesc);
3715
105k
3716
105k
    // Remove any references to SCC. Vector instructions can't read from it, and
3717
105k
    // we're just about to add the implicit use / defs of VCC, and we don't want
3718
105k
    // both.
3719
368k
    for (unsigned i = Inst.getNumOperands() - 1; 
i > 0368k
;
--i263k
) {
3720
263k
      MachineOperand &Op = Inst.getOperand(i);
3721
263k
      if (
Op.isReg() && 263k
Op.getReg() == AMDGPU::SCC206k
) {
3722
39.0k
        Inst.RemoveOperand(i);
3723
39.0k
        addSCCDefUsersToVALUWorklist(Inst, Worklist);
3724
39.0k
      }
3725
263k
    }
3726
105k
3727
105k
    if (
Opcode == AMDGPU::S_SEXT_I32_I8 || 105k
Opcode == AMDGPU::S_SEXT_I32_I16104k
) {
3728
1.07k
      // We are converting these to a BFE, so we need to add the missing
3729
1.07k
      // operands for the size and offset.
3730
1.07k
      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 
8357
:
16720
;
3731
1.07k
      Inst.addOperand(MachineOperand::CreateImm(0));
3732
1.07k
      Inst.addOperand(MachineOperand::CreateImm(Size));
3733
1.07k
3734
105k
    } else 
if (104k
Opcode == AMDGPU::S_BCNT1_I32_B32104k
) {
3735
64
      // The VALU version adds the second operand to the result, so insert an
3736
64
      // extra 0 operand.
3737
64
      Inst.addOperand(MachineOperand::CreateImm(0));
3738
64
    }
3739
105k
3740
105k
    Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
3741
105k
3742
105k
    if (
Opcode == AMDGPU::S_BFE_I32 || 105k
Opcode == AMDGPU::S_BFE_U32103k
) {
3743
3.47k
      const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
3744
3.47k
      // If we need to move this to VGPRs, we need to unpack the second operand
3745
3.47k
      // back into the 2 separate ones for bit offset and width.
3746
3.47k
      assert(OffsetWidthOp.isImm() &&
3747
3.47k
             "Scalar BFE is only implemented for constant width and offset");
3748
3.47k
      uint32_t Imm = OffsetWidthOp.getImm();
3749
3.47k
3750
3.47k
      uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
3751
3.47k
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
3752
3.47k
      Inst.RemoveOperand(2);                     // Remove old immediate.
3753
3.47k
      Inst.addOperand(MachineOperand::CreateImm(Offset));
3754
3.47k
      Inst.addOperand(MachineOperand::CreateImm(BitWidth));
3755
3.47k
    }
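
A standalone sketch of the S_BFE immediate split above: the scalar form packs the bit offset into bits [5:0] and the field width into bits [22:16] of a single immediate, while the VALU form takes them as two separate operands.

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Imm = (8u << 16) | 4u;               // width 8, starting at bit 4
  uint32_t Offset   = Imm & 0x3f;               // extract bits [5:0]
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16;   // extract bits [22:16]
  std::printf("offset=%u width=%u\n", Offset, BitWidth);  // offset=4 width=8
}
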
3756
105k
3757
105k
    bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
3758
105k
    unsigned NewDstReg = AMDGPU::NoRegister;
3759
105k
    if (
HasDst105k
) {
3760
105k
      unsigned DstReg = Inst.getOperand(0).getReg();
3761
105k
      if (TargetRegisterInfo::isPhysicalRegister(DstReg))
3762
46
        continue;
3763
105k
3764
105k
      // Update the destination register class.
3765
105k
      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
3766
105k
      if (!NewDstRC)
3767
0
        continue;
3768
105k
3769
105k
      
if (105k
Inst.isCopy() &&
3770
47.4k
          TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
3771
105k
          
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())47.3k
) {
3772
15.5k
        // Instead of creating a copy where src and dst are the same register
3773
15.5k
        // class, we just replace all uses of dst with src.  These kinds of
3774
15.5k
        // copies interfere with the heuristics MachineSink uses to decide
3775
15.5k
        // whether or not to split a critical edge, since the pass assumes
3776
15.5k
        // that copies will end up as machine instructions and not be
3777
15.5k
        // eliminated.
3778
15.5k
        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
3779
15.5k
        MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
3780
15.5k
        MRI.clearKillFlags(Inst.getOperand(1).getReg());
3781
15.5k
        Inst.getOperand(0).setReg(DstReg);
3782
15.5k
        continue;
3783
15.5k
      }
3784
89.4k
3785
89.4k
      NewDstReg = MRI.createVirtualRegister(NewDstRC);
3786
89.4k
      MRI.replaceRegWith(DstReg, NewDstReg);
3787
89.4k
    }
3788
105k
3789
105k
    // Legalize the operands
3790
89.6k
    legalizeOperands(Inst);
3791
89.6k
3792
89.6k
    if (HasDst)
3793
89.4k
     addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
3794
107k
  }
3795
31.6k
}
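
A minimal sketch of the worklist pattern driving moveToVALU above: rewriting one scalar instruction to its vector form can force its users onto the vector path as well, so users are pushed back onto the worklist until it drains. Node, Users, and the converted-flag deduplication are hypothetical stand-ins for the real MachineInstr/SetVector machinery.

#include <cstdio>
#include <vector>

struct Node {
  int Id;
  std::vector<Node *> Users;
  bool Converted;
};

static void moveAllToVALU(Node *Top) {
  std::vector<Node *> Worklist{Top};
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    if (N->Converted)
      continue;                      // the real code uses a SetVector to dedup
    N->Converted = true;             // "rewrite" this instruction
    std::printf("converted node %d\n", N->Id);
    for (Node *U : N->Users)         // its users may now need VALU forms too
      Worklist.push_back(U);
  }
}

int main() {
  Node C{2, {}, false};
  Node B{1, {&C}, false};
  Node A{0, {&B, &C}, false};
  moveAllToVALU(&A);                 // converts 0, then 2 and 1 (LIFO order)
}
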
3796
3797
void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
3798
16
                                 MachineInstr &Inst) const {
3799
16
  MachineBasicBlock &MBB = *Inst.getParent();
3800
16
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3801
16
  MachineBasicBlock::iterator MII = Inst;
3802
16
  DebugLoc DL = Inst.getDebugLoc();
3803
16
3804
16
  MachineOperand &Dest = Inst.getOperand(0);
3805
16
  MachineOperand &Src = Inst.getOperand(1);
3806
16
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3807
16
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3808
16
3809
16
  BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
3810
16
    .addImm(0)
3811
16
    .addReg(Src.getReg());
3812
16
3813
16
  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
3814
16
    .addReg(Src.getReg())
3815
16
    .addReg(TmpReg);
3816
16
3817
16
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
3818
16
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3819
16
}
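
A standalone arithmetic check of the expansion above: s_abs_i32 becomes a v_sub_i32 computing 0 - x followed by a v_max_i32 taking max(x, 0 - x) (the INT32_MIN corner case is left aside here).

#include <algorithm>
#include <cstdint>
#include <cstdio>

static int32_t absViaSubMax(int32_t X) {
  int32_t Neg = 0 - X;       // V_SUB_I32: 0 - x
  return std::max(X, Neg);   // V_MAX_I32: max(x, 0 - x)
}

int main() {
  std::printf("%d %d %d\n", absViaSubMax(7), absViaSubMax(-7), absViaSubMax(0));
  // prints: 7 7 0
}
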
3820
3821
void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
3822
12
                                  MachineInstr &Inst) const {
3823
12
  MachineBasicBlock &MBB = *Inst.getParent();
3824
12
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3825
12
  MachineBasicBlock::iterator MII = Inst;
3826
12
  const DebugLoc &DL = Inst.getDebugLoc();
3827
12
3828
12
  MachineOperand &Dest = Inst.getOperand(0);
3829
12
  MachineOperand &Src0 = Inst.getOperand(1);
3830
12
  MachineOperand &Src1 = Inst.getOperand(2);
3831
12
3832
12
  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
3833
12
  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
3834
12
3835
12
  unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3836
12
  BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
3837
12
    .add(Src0)
3838
12
    .add(Src1);
3839
12
3840
12
  unsigned Not = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3841
12
  BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), Not)
3842
12
    .addReg(Xor);
3843
12
3844
12
  MRI.replaceRegWith(Dest.getReg(), Not);
3845
12
  addUsersToMoveToVALUWorklist(Not, MRI, Worklist);
3846
12
}
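
A standalone arithmetic check of the expansion above: s_xnor_b32 becomes a v_xor_b32 followed by a v_not_b32, i.e. ~(a ^ b), which is 1 exactly where the two inputs agree.

#include <cstdint>
#include <cstdio>

static uint32_t xnorViaXorNot(uint32_t A, uint32_t B) {
  uint32_t Xor = A ^ B;   // V_XOR_B32
  return ~Xor;            // V_NOT_B32
}

int main() {
  std::printf("0x%08x\n", xnorViaXorNot(0xF0F0F0F0u, 0xFF00FF00u));
  // prints: 0xf00ff00f
}
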
3847
3848
void SIInstrInfo::splitScalar64BitUnaryOp(
3849
    SetVectorType &Worklist, MachineInstr &Inst,
3850
16
    unsigned Opcode) const {
3851
16
  MachineBasicBlock &MBB = *Inst.getParent();
3852
16
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3853
16
3854
16
  MachineOperand &Dest = Inst.getOperand(0);
3855
16
  MachineOperand &Src0 = Inst.getOperand(1);
3856
16
  DebugLoc DL = Inst.getDebugLoc();
3857
16
3858
16
  MachineBasicBlock::iterator MII = Inst;
3859
16
3860
16
  const MCInstrDesc &InstDesc = get(Opcode);
3861
16
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
3862
16
    MRI.getRegClass(Src0.getReg()) :
3863
0
    &AMDGPU::SGPR_32RegClass;
3864
16
3865
16
  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3866
16
3867
16
  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3868
16
                                                       AMDGPU::sub0, Src0SubRC);
3869
16
3870
16
  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
3871
16
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
3872
16
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
3873
16
3874
16
  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
3875
16
  BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
3876
16
3877
16
  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3878
16
                                                       AMDGPU::sub1, Src0SubRC);
3879
16
3880
16
  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
3881
16
  BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
3882
16
3883
16
  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
3884
16
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
3885
16
    .addReg(DestSub0)
3886
16
    .addImm(AMDGPU::sub0)
3887
16
    .addReg(DestSub1)
3888
16
    .addImm(AMDGPU::sub1);
3889
16
3890
16
  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
3891
16
3892
16
  // We don't need to legalizeOperands here because for a single operand, src0
3893
16
  // will support any kind of input.
3894
16
3895
16
  // Move all users of this moved value.
3896
16
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3897
16
}
3898
3899
void SIInstrInfo::splitScalar64BitBinaryOp(
3900
    SetVectorType &Worklist, MachineInstr &Inst,
3901
461
    unsigned Opcode) const {
3902
461
  MachineBasicBlock &MBB = *Inst.getParent();
3903
461
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3904
461
3905
461
  MachineOperand &Dest = Inst.getOperand(0);
3906
461
  MachineOperand &Src0 = Inst.getOperand(1);
3907
461
  MachineOperand &Src1 = Inst.getOperand(2);
3908
461
  DebugLoc DL = Inst.getDebugLoc();
3909
461
3910
461
  MachineBasicBlock::iterator MII = Inst;
3911
461
3912
461
  const MCInstrDesc &InstDesc = get(Opcode);
3913
461
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
3914
461
    MRI.getRegClass(Src0.getReg()) :
3915
0
    &AMDGPU::SGPR_32RegClass;
3916
461
3917
461
  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3918
461
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
3919
461
    MRI.getRegClass(Src1.getReg()) :
3920
0
    &AMDGPU::SGPR_32RegClass;
3921
461
3922
461
  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
3923
461
3924
461
  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3925
461
                                                       AMDGPU::sub0, Src0SubRC);
3926
461
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3927
461
                                                       AMDGPU::sub0, Src1SubRC);
3928
461
3929
461
  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
3930
461
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
3931
461
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
3932
461
3933
461
  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
3934
461
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
3935
461
                              .add(SrcReg0Sub0)
3936
461
                              .add(SrcReg1Sub0);
3937
461
3938
461
  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3939
461
                                                       AMDGPU::sub1, Src0SubRC);
3940
461
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3941
461
                                                       AMDGPU::sub1, Src1SubRC);
3942
461
3943
461
  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
3944
461
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
3945
461
                              .add(SrcReg0Sub1)
3946
461
                              .add(SrcReg1Sub1);
3947
461
3948
461
  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
3949
461
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
3950
461
    .addReg(DestSub0)
3951
461
    .addImm(AMDGPU::sub0)
3952
461
    .addReg(DestSub1)
3953
461
    .addImm(AMDGPU::sub1);
3954
461
3955
461
  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
3956
461
3957
461
  // Try to legalize the operands in case we need to swap the order to keep it
3958
461
  // valid.
3959
461
  legalizeOperands(LoHalf);
3960
461
  legalizeOperands(HiHalf);
3961
461
3962
461
  // Move all users of this moved value.
3963
461
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3964
461
}
3965
3966
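splitScalar64BitBinaryOp above performs the same per-half split for two-operand instructions, so it is only used for operations whose halves do not interact (the AND/OR/XOR family). A standalone sketch; helper names are illustrative, not LLVM APIs:

#include <cassert>
#include <cstdint>

// Illustrative only: compute each 32-bit half with an independent operation.
static uint64_t splitBinary64(uint64_t A, uint64_t B,
                              uint32_t (*Op32)(uint32_t, uint32_t)) {
  uint32_t Lo = Op32(static_cast<uint32_t>(A), static_cast<uint32_t>(B));
  uint32_t Hi = Op32(static_cast<uint32_t>(A >> 32),
                     static_cast<uint32_t>(B >> 32));
  return (static_cast<uint64_t>(Hi) << 32) | Lo;  // REG_SEQUENCE equivalent
}

int main() {
  auto Xor32 = [](uint32_t X, uint32_t Y) -> uint32_t { return X ^ Y; };
  uint64_t A = 0x0123456789ABCDEFull, B = 0xFFFF0000FFFF0000ull;
  assert(splitBinary64(A, B, Xor32) == (A ^ B));
  return 0;
}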
void SIInstrInfo::splitScalar64BitBCNT(
3967
26
    SetVectorType &Worklist, MachineInstr &Inst) const {
3968
26
  MachineBasicBlock &MBB = *Inst.getParent();
3969
26
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3970
26
3971
26
  MachineBasicBlock::iterator MII = Inst;
3972
26
  DebugLoc DL = Inst.getDebugLoc();
3973
26
3974
26
  MachineOperand &Dest = Inst.getOperand(0);
3975
26
  MachineOperand &Src = Inst.getOperand(1);
3976
26
3977
26
  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
3978
26
  const TargetRegisterClass *SrcRC = Src.isReg() ?
3979
26
    MRI.getRegClass(Src.getReg()) :
3980
0
    &AMDGPU::SGPR_32RegClass;
3981
26
3982
26
  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3983
26
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3984
26
3985
26
  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
3986
26
3987
26
  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3988
26
                                                      AMDGPU::sub0, SrcSubRC);
3989
26
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3990
26
                                                      AMDGPU::sub1, SrcSubRC);
3991
26
3992
26
  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
3993
26
3994
26
  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
3995
26
3996
26
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
3997
26
3998
26
  // We don't need to legalize operands here. src0 for either instruction can be
3999
26
  // an SGPR, and the second input is unused or determined here.
4000
26
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4001
26
}
4002
4003
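splitScalar64BitBCNT above computes a 64-bit population count as the sum of the two 32-bit half counts; V_BCNT_U32_B32 adds its count to a second source operand, which is why the second instruction chains through MidReg. A standalone sketch; names are illustrative, not LLVM APIs:

#include <bitset>
#include <cassert>
#include <cstdint>

// Illustrative model of V_BCNT_U32_B32: popcount(Src) + Accum.
static uint32_t bcnt32(uint32_t Src, uint32_t Accum) {
  return static_cast<uint32_t>(std::bitset<32>(Src).count()) + Accum;
}

static uint32_t bcnt64(uint64_t Src) {
  uint32_t Mid = bcnt32(static_cast<uint32_t>(Src), 0);    // sub0 half, accumulator 0
  return bcnt32(static_cast<uint32_t>(Src >> 32), Mid);    // sub1 half, accumulate into Mid
}

int main() {
  assert(bcnt64(0xF0F0F0F0F0F0F0F0ull) == 32);
  assert(bcnt64(0x8000000000000001ull) == 2);
  return 0;
}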
void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
4004
1.24k
                                      MachineInstr &Inst) const {
4005
1.24k
  MachineBasicBlock &MBB = *Inst.getParent();
4006
1.24k
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4007
1.24k
  MachineBasicBlock::iterator MII = Inst;
4008
1.24k
  DebugLoc DL = Inst.getDebugLoc();
4009
1.24k
4010
1.24k
  MachineOperand &Dest = Inst.getOperand(0);
4011
1.24k
  uint32_t Imm = Inst.getOperand(2).getImm();
4012
1.24k
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4013
1.24k
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4014
1.24k
4015
1.24k
  (void) Offset;
4016
1.24k
4017
1.24k
  // Only sext_inreg cases handled.
4018
1.24k
  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
4019
1.24k
         Offset == 0 && "Not implemented");
4020
1.24k
4021
1.24k
  if (BitWidth < 32) {
4022
1.24k
    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4023
1.24k
    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4024
1.24k
    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4025
1.24k
4026
1.24k
    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
4027
1.24k
        .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
4028
1.24k
        .addImm(0)
4029
1.24k
        .addImm(BitWidth);
4030
1.24k
4031
1.24k
    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
4032
1.24k
      .addImm(31)
4033
1.24k
      .addReg(MidRegLo);
4034
1.24k
4035
1.24k
    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4036
1.24k
      .addReg(MidRegLo)
4037
1.24k
      .addImm(AMDGPU::sub0)
4038
1.24k
      .addReg(MidRegHi)
4039
1.24k
      .addImm(AMDGPU::sub1);
4040
1.24k
4041
1.24k
    MRI.replaceRegWith(Dest.getReg(), ResultReg);
4042
1.24k
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4043
1.24k
    return;
4044
1.24k
  }
4045
6
4046
6
  MachineOperand &Src = Inst.getOperand(1);
4047
6
  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4048
6
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4049
6
4050
6
  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
4051
6
    .addImm(31)
4052
6
    .addReg(Src.getReg(), 0, AMDGPU::sub0);
4053
6
4054
6
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4055
6
    .addReg(Src.getReg(), 0, AMDGPU::sub0)
4056
6
    .addImm(AMDGPU::sub0)
4057
6
    .addReg(TmpReg)
4058
6
    .addImm(AMDGPU::sub1);
4059
6
4060
6
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4061
6
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4062
6
}
4063
4064
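splitScalar64BitBFE above only handles the sext_inreg form (offset 0): the low half is sign-extended in place (V_BFE_I32 when the field is narrower than 32 bits), and the high half is the sign bit replicated by an arithmetic shift right of 31. A standalone sketch of that computation; the helper name is illustrative, and it assumes the usual arithmetic behavior of signed right shifts:

#include <cassert>
#include <cstdint>

static uint64_t sextInReg64(uint64_t Src, unsigned Width) {
  uint32_t Lo32 = static_cast<uint32_t>(Src);
  int32_t Lo;
  if (Width < 32) {
    // V_BFE_I32: extract Width bits at offset 0 and sign-extend to 32 bits.
    Lo = static_cast<int32_t>(Lo32 << (32 - Width)) >> (32 - Width);
  } else {
    Lo = static_cast<int32_t>(Lo32);  // Width == 32: low half is unchanged.
  }
  int32_t Hi = Lo >> 31;  // V_ASHRREV_I32: replicate the sign bit.
  return (static_cast<uint64_t>(static_cast<uint32_t>(Hi)) << 32) |
         static_cast<uint32_t>(Lo);   // REG_SEQUENCE equivalent
}

int main() {
  assert(sextInReg64(0x00000000000000FFull, 8) == 0xFFFFFFFFFFFFFFFFull);
  assert(sextInReg64(0x000000000000007Full, 8) == 0x000000000000007Full);
  return 0;
}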
void SIInstrInfo::addUsersToMoveToVALUWorklist(
4065
  unsigned DstReg,
4066
  MachineRegisterInfo &MRI,
4067
106k
  SetVectorType &Worklist) const {
4068
106k
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
4069
244k
         E = MRI.use_end(); I != E;) {
4070
137k
    MachineInstr &UseMI = *I->getParent();
4071
137k
    if (!canReadVGPR(UseMI, I.getOperandNo())) {
4072
75.3k
      Worklist.insert(&UseMI);
4073
75.3k
4074
75.4k
      do {
4075
75.4k
        ++I;
4076
75.4k
      } while (I != E && I->getParent() == &UseMI);
4077
137k
    } else {
4078
62.0k
      ++I;
4079
62.0k
    }
4080
137k
  }
4081
106k
}
4082
4083
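addUsersToMoveToVALUWorklist above iterates over use operands, but an instruction may use the replaced register in several operands; the inner do/while advances past the remaining operands of the same instruction so each user is queued only once. A small standalone model of that de-duplication; the container and the data are illustrative stand-ins, not LLVM types:

#include <cassert>
#include <cstddef>
#include <set>
#include <vector>

int main() {
  // One entry per use *operand*, recording which instruction owns it.
  std::vector<int> UseOperandOwners = {1, 1, 2, 3, 3, 3};
  std::set<int> Worklist;  // stand-in for the SetVector worklist

  for (std::size_t I = 0; I != UseOperandOwners.size();) {
    int Owner = UseOperandOwners[I];
    Worklist.insert(Owner);
    do {
      ++I;  // skip further operands belonging to the same instruction
    } while (I != UseOperandOwners.size() && UseOperandOwners[I] == Owner);
  }

  assert(Worklist.size() == 3);  // each instruction queued exactly once
  return 0;
}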
void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
4084
                                 MachineRegisterInfo &MRI,
4085
52
                                 MachineInstr &Inst) const {
4086
52
  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4087
52
  MachineBasicBlock *MBB = Inst.getParent();
4088
52
  MachineOperand &Src0 = Inst.getOperand(1);
4089
52
  MachineOperand &Src1 = Inst.getOperand(2);
4090
52
  const DebugLoc &DL = Inst.getDebugLoc();
4091
52
4092
52
  switch (Inst.getOpcode()) {
4093
49
  case AMDGPU::S_PACK_LL_B32_B16: {
4094
49
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4095
49
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4096
49
4097
49
    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
4098
49
    // 0.
4099
49
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4100
49
      .addImm(0xffff);
4101
49
4102
49
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
4103
49
      .addReg(ImmReg, RegState::Kill)
4104
49
      .add(Src0);
4105
49
4106
49
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
4107
49
      .add(Src1)
4108
49
      .addImm(16)
4109
49
      .addReg(TmpReg, RegState::Kill);
4110
49
    break;
4111
52
  }
4112
2
  case AMDGPU::S_PACK_LH_B32_B16: {
4113
2
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4114
2
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4115
2
      .addImm(0xffff);
4116
2
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
4117
2
      .addReg(ImmReg, RegState::Kill)
4118
2
      .add(Src0)
4119
2
      .add(Src1);
4120
2
    break;
4121
52
  }
4122
1
  case AMDGPU::S_PACK_HH_B32_B16: {
4123
1
    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4124
1
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4125
1
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
4126
1
      .addImm(16)
4127
1
      .add(Src0);
4128
1
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4129
1
      .addImm(0xffff0000);
4130
1
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
4131
1
      .add(Src1)
4132
1
      .addReg(ImmReg, RegState::Kill)
4133
1
      .addReg(TmpReg, RegState::Kill);
4134
1
    break;
4135
52
  }
4136
0
  default:
4137
0
    llvm_unreachable("unhandled s_pack_* instruction");
4138
52
  }
4139
52
4140
52
  MachineOperand &Dest = Inst.getOperand(0);
4141
52
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4142
52
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4143
52
}
4144
4145
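movePackToVALU above rewrites the scalar 16-bit pack instructions with plain bit operations; for example, S_PACK_LL_B32_B16 becomes (Src0 & 0xffff) | (Src1 << 16) via V_AND_B32 and V_LSHL_OR_B32. A standalone sketch of the LL and HH cases; helper names are illustrative, not LLVM APIs:

#include <cassert>
#include <cstdint>

static uint32_t packLL(uint32_t Src0, uint32_t Src1) {
  uint32_t Tmp = Src0 & 0xffffu;   // V_AND_B32 with the 0xffff immediate
  return (Src1 << 16) | Tmp;       // V_LSHL_OR_B32: (Src1 << 16) | Tmp
}

static uint32_t packHH(uint32_t Src0, uint32_t Src1) {
  uint32_t Tmp = Src0 >> 16;            // V_LSHRREV_B32 by 16
  return (Src1 & 0xffff0000u) | Tmp;    // V_AND_OR_B32 with 0xffff0000
}

int main() {
  assert(packLL(0xAAAA1234u, 0xBBBB5678u) == 0x56781234u);
  assert(packHH(0x1234AAAAu, 0x5678BBBBu) == 0x56781234u);
  return 0;
}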
void SIInstrInfo::addSCCDefUsersToVALUWorklist(
4146
39.0k
    MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
4147
39.0k
  // This assumes that all the users of SCC are in the same block
4148
39.0k
  // as the SCC def.
4149
39.0k
  for (MachineInstr &MI :
4150
39.0k
       make_range(MachineBasicBlock::iterator(SCCDefInst),
4151
1.77M
                      SCCDefInst.getParent()->end())) {
4152
1.77M
    // Exit if we find another SCC def.
4153
1.77M
    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
4154
29.9k
      return;
4155
1.74M
4156
1.74M
    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
4157
77
      Worklist.insert(&MI);
4158
1.77M
  }
4159
39.0k
}
4160
4161
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
4162
105k
  const MachineInstr &Inst) const {
4163
105k
  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
4164
105k
4165
105k
  switch (Inst.getOpcode()) {
4166
105k
  // For target instructions, getOpRegClass just returns the virtual register
4167
105k
  // class associated with the operand, so we need to find an equivalent VGPR
4168
105k
  // register class in order to move the instruction to the VALU.
4169
69.1k
  case AMDGPU::COPY:
4170
69.1k
  case AMDGPU::PHI:
4171
69.1k
  case AMDGPU::REG_SEQUENCE:
4172
69.1k
  case AMDGPU::INSERT_SUBREG:
4173
69.1k
  case AMDGPU::WQM:
4174
69.1k
  case AMDGPU::WWM:
4175
69.1k
    if (RI.hasVGPRs(NewDstRC))
4176
0
      return nullptr;
4177
69.1k
4178
69.1k
    NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
4179
69.1k
    if (!NewDstRC)
4180
0
      return nullptr;
4181
69.1k
    return NewDstRC;
4182
35.8k
  default:
4183
35.8k
    return NewDstRC;
4184
0
  }
4185
0
}
4186
4187
// Find the one SGPR operand we are allowed to use.
4188
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
4189
47.7k
                                   int OpIndices[3]) const {
4190
47.7k
  const MCInstrDesc &Desc = MI.getDesc();
4191
47.7k
4192
47.7k
  // Find the one SGPR operand we are allowed to use.
4193
47.7k
  //
4194
47.7k
  // First we need to consider the instruction's operand requirements before
4195
47.7k
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
4196
47.7k
  // of VCC, but we are still bound by the constant bus requirement to only use
4197
47.7k
  // one.
4198
47.7k
  //
4199
47.7k
  // If the operand's class is an SGPR, we can never move it.
4200
47.7k
4201
47.7k
  unsigned SGPRReg = findImplicitSGPRRead(MI);
4202
47.7k
  if (SGPRReg != AMDGPU::NoRegister)
4203
123
    return SGPRReg;
4204
47.6k
4205
47.6k
  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
4206
47.6k
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4207
47.6k
4208
147k
  for (unsigned i = 0; i < 3; ++i) {
4209
138k
    int Idx = OpIndices[i];
4210
138k
    if (Idx == -1)
4211
32.8k
      break;
4212
105k
4213
105k
    const MachineOperand &MO = MI.getOperand(Idx);
4214
105k
    if (!MO.isReg())
4215
10.8k
      continue;
4216
95.0k
4217
95.0k
    // Is this operand statically required to be an SGPR based on the operand
4218
95.0k
    // constraints?
4219
95.0k
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
4220
95.0k
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
4221
95.0k
    if (IsRequiredSGPR)
4222
6.45k
      return MO.getReg();
4223
88.6k
4224
88.6k
    // If this could be a VGPR or an SGPR, Check the dynamic register class.
4225
88.6k
    unsigned Reg = MO.getReg();
4226
88.6k
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
4227
88.6k
    if (RI.isSGPRClass(RegRC))
4228
41.4k
      UsedSGPRs[i] = Reg;
4229
138k
  }
4230
47.6k
4231
47.6k
  // We don't have a required SGPR operand, so we have a bit more freedom in
4232
47.6k
  // selecting operands to move.
4233
47.6k
4234
47.6k
  // Try to select the most used SGPR. If an SGPR is equal to one of the
4235
47.6k
  // others, we choose that.
4236
47.6k
  //
4237
47.6k
  // e.g.
4238
47.6k
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
4239
47.6k
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1
4240
47.6k
4241
47.6k
  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
4242
47.6k
  // prefer those.
4243
47.6k
4244
41.1k
  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
4245
17.1k
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
4246
351
      SGPRReg = UsedSGPRs[0];
4247
17.1k
  }
4248
41.1k
4249
41.1k
  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
4250
13.7k
    if (UsedSGPRs[1] == UsedSGPRs[2])
4251
8
      SGPRReg = UsedSGPRs[1];
4252
13.7k
  }
4253
41.1k
4254
41.1k
  return SGPRReg;
4255
47.7k
}
4256
4257
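findUsedSGPR above implements the constant-bus restriction: only one SGPR source can be kept when the instruction moves to the VALU, so an SGPR that is repeated across operands is preferred because keeping it avoids the most copies. A standalone sketch of that selection rule; the function name and register values are illustrative:

#include <cassert>

static const int NoRegister = 0;  // stand-in for AMDGPU::NoRegister

// Illustrative only: pick an SGPR that appears more than once, if any.
static int pickRepeatedSGPR(const int UsedSGPRs[3]) {
  if (UsedSGPRs[0] != NoRegister &&
      (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]))
    return UsedSGPRs[0];
  if (UsedSGPRs[1] != NoRegister && UsedSGPRs[1] == UsedSGPRs[2])
    return UsedSGPRs[1];
  return NoRegister;
}

int main() {
  int Repeated[3] = {5, 7, 5};    // "s5" appears twice -> keep it
  int Distinct[3] = {5, 7, 9};    // no repeats -> nothing is forced
  assert(pickRepeatedSGPR(Repeated) == 5);
  assert(pickRepeatedSGPR(Distinct) == NoRegister);
  return 0;
}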
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
4258
2.53M
                                             unsigned OperandName) const {
4259
2.53M
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
4260
2.53M
  if (Idx == -1)
4261
607k
    return nullptr;
4262
1.92M
4263
1.92M
  return &MI.getOperand(Idx);
4264
1.92M
}
4265
4266
22.5k
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
4267
22.5k
  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
4268
22.5k
  if (ST.isAmdHsaOS()) {
4269
436
    // Set ATC = 1. GFX9 doesn't have this bit.
4270
436
    if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
4271
341
      RsrcDataFormat |= (1ULL << 56);
4272
436
4273
436
    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
4274
436
    // BTW, it disables TC L2 and therefore decreases performance.
4275
436
    if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
4276
236
      RsrcDataFormat |= (2ULL << 59);
4277
436
  }
4278
22.5k
4279
22.5k
  return RsrcDataFormat;
4280
22.5k
}
4281
4282
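getDefaultRsrcDataFormat above layers two conditional fields on top of the base descriptor constant: ATC (bit 56) for HSA on pre-GFX9 parts, and MTYPE = 2 (uncached, starting at bit 59) on Volcanic Islands. A standalone sketch with the subtarget queries replaced by an illustrative generation enum; the value of AMDGPU::RSRC_DATA_FORMAT itself is not reproduced:

#include <cassert>
#include <cstdint>

// Illustrative stand-in for the relevant SISubtarget generations.
enum Generation { SOUTHERN_ISLANDS, SEA_ISLANDS, VOLCANIC_ISLANDS, GFX9 };

static uint64_t defaultRsrcDataFormat(uint64_t Base, bool IsAmdHsaOS,
                                      Generation Gen) {
  uint64_t Rsrc = Base;
  if (IsAmdHsaOS) {
    if (Gen <= VOLCANIC_ISLANDS)
      Rsrc |= (1ULL << 56);   // ATC = 1; GFX9 does not have this bit.
    if (Gen == VOLCANIC_ISLANDS)
      Rsrc |= (2ULL << 59);   // MTYPE = 2 (uncached).
  }
  return Rsrc;
}

int main() {
  assert(defaultRsrcDataFormat(0, true, VOLCANIC_ISLANDS) ==
         ((1ULL << 56) | (2ULL << 59)));
  assert(defaultRsrcDataFormat(0, true, GFX9) == 0);
  assert(defaultRsrcDataFormat(0, false, VOLCANIC_ISLANDS) == 0);
  return 0;
}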
463
uint64_t SIInstrInfo::getScratchRsrcWords23() const {
4283
463
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
4284
463
                    AMDGPU::RSRC_TID_ENABLE |
4285
463
                    0xffffffff; // Size;
4286
463
4287
463
  // GFX9 doesn't have ELEMENT_SIZE.
4288
463
  if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
4289
397
    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
4290
397
    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
4291
397
  }
4292
463
4293
463
  // IndexStride = 64.
4294
463
  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
4295
463
4296
463
  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
4297
463
  // Clear them unless we want a huge stride.
4298
463
  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
4299
232
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
4300
463
4301
463
  return Rsrc23;
4302
463
}
4303
4304
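getScratchRsrcWords23 above encodes the scratch element size as log2(bytes) - 1 on pre-GFX9 parts and always selects index-stride encoding 3 for the 64-element stride noted in the comment. A small standalone check of the element-size encoding; the actual shift constants (AMDGPU::RSRC_ELEMENT_SIZE_SHIFT and friends) are not reproduced here:

#include <cassert>
#include <cstdint>

// Illustrative Log2_32 equivalent for power-of-two inputs.
static unsigned log2u(uint64_t V) {
  unsigned R = 0;
  while (V >>= 1)
    ++R;
  return R;
}

int main() {
  assert(log2u(4) - 1 == 1);    // 4-byte private elements  -> field value 1
  assert(log2u(16) - 1 == 3);   // 16-byte private elements -> field value 3
  return 0;
}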
60
bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
4305
60
  unsigned Opc = MI.getOpcode();
4306
60
4307
60
  return isSMRD(Opc);
4308
60
}
4309
4310
14
bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
4311
14
  unsigned Opc = MI.getOpcode();
4312
14
4313
14
  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
4314
14
}
4315
4316
unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
4317
2.78k
                                    int &FrameIndex) const {
4318
2.78k
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4319
2.78k
  if (!Addr || !Addr->isFI())
4320
723
    return AMDGPU::NoRegister;
4321
2.05k
4322
2.78k
  assert(!MI.memoperands_empty() &&
4323
2.05k
         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
4324
2.05k
4325
2.05k
  FrameIndex = Addr->getIndex();
4326
2.05k
  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
4327
2.05k
}
4328
4329
unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
4330
50
                                        int &FrameIndex) const {
4331
50
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
4332
50
  assert(Addr && Addr->isFI());
4333
50
  FrameIndex = Addr->getIndex();
4334
50
  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
4335
50
}
4336
4337
unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
4338
13.1k
                                          int &FrameIndex) const {
4339
13.1k
  if (!MI.mayLoad())
4340
10.9k
    return AMDGPU::NoRegister;
4341
2.20k
4342
2.20k
  if (isMUBUF(MI) || isVGPRSpill(MI))
4343
1.40k
    return isStackAccess(MI, FrameIndex);
4344
799
4345
799
  if (isSGPRSpill(MI))
4346
50
    return isSGPRStackAccess(MI, FrameIndex);
4347
749
4348
749
  return AMDGPU::NoRegister;
4349
749
}
4350
4351
unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
4352
7.99k
                                         int &FrameIndex) const {
4353
7.99k
  if (!MI.mayStore())
4354
6.18k
    return AMDGPU::NoRegister;
4355
1.80k
4356
1.80k
  if (isMUBUF(MI) || isVGPRSpill(MI))
4357
1.37k
    return isStackAccess(MI, FrameIndex);
4358
428
4359
428
  if (isSGPRSpill(MI))
4360
0
    return isSGPRStackAccess(MI, FrameIndex);
4361
428
4362
428
  return AMDGPU::NoRegister;
4363
428
}
4364
4365
623k
unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
4366
623k
  unsigned Opc = MI.getOpcode();
4367
623k
  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
4368
623k
  unsigned DescSize = Desc.getSize();
4369
623k
4370
623k
  // If we have a definitive size, we can use it. Otherwise we need to inspect
4371
623k
  // the operands to know the size.
4372
623k
  //
4373
623k
  // FIXME: Instructions that have a base 32-bit encoding report their size as
4374
623k
  // 4, even though they are really 8 bytes if they have a literal operand.
4375
623k
  if (DescSize != 0 && DescSize != 4)
4376
173k
    return DescSize;
4377
449k
4378
449k
  // 4-byte instructions may have a 32-bit literal encoded after them. Check
4379
449k
  // operands that could ever be literals.
4380
449k
  if (isVALU(MI) || isSALU(MI)) {
4381
411k
    if (isFixedSize(MI))
4382
1.00k
      return DescSize;
4383
410k
4384
410k
    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
4385
410k
    if (Src0Idx == -1)
4386
113k
      return 4; // No operands.
4387
296k
4388
296k
    if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
4389
29.4k
      return 8;
4390
266k
4391
266k
    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
4392
266k
    if (Src1Idx == -1)
4393
144k
      return 4;
4394
122k
4395
122k
    if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
4396
9.29k
      return 8;
4397
113k
4398
113k
    return 4;
4399
113k
  }
4400
38.4k
4401
38.4k
  if (DescSize == 4)
4402
30.8k
    return 4;
4403
7.57k
4404
7.57k
  switch (Opc) {
4405
4.94k
  case TargetOpcode::IMPLICIT_DEF:
4406
4.94k
  case TargetOpcode::KILL:
4407
4.94k
  case TargetOpcode::DBG_VALUE:
4408
4.94k
  case TargetOpcode::BUNDLE:
4409
4.94k
  case TargetOpcode::EH_LABEL:
4410
4.94k
    return 0;
4411
2.63k
  case TargetOpcode::INLINEASM: {
4412
2.63k
    const MachineFunction *MF = MI.getParent()->getParent();
4413
2.63k
    const char *AsmStr = MI.getOperand(0).getSymbolName();
4414
2.63k
    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
4415
4.94k
  }
4416
0
  default:
4417
0
    llvm_unreachable("unable to find instruction size");
4418
0
  }
4419
0
}
4420
4421
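For VALU/SALU instructions whose descriptor reports a 4-byte base encoding, getInstSizeInBytes above returns 8 when a source operand requires a 32-bit literal, since the literal dword follows the instruction word. A standalone sketch of that rule; the helper name is illustrative:

#include <cassert>

static unsigned sizeWithLiterals(unsigned DescSize, bool Src0NeedsLiteral,
                                 bool Src1NeedsLiteral) {
  if (DescSize != 4)
    return DescSize;  // larger encodings already include their operands
  return (Src0NeedsLiteral || Src1NeedsLiteral) ? 8u : 4u;
}

int main() {
  assert(sizeWithLiterals(4, false, false) == 4);  // e.g. v_add_f32 v0, v1, v2
  assert(sizeWithLiterals(4, true, false) == 8);   // e.g. s_mov_b32 s0, 0x12345678
  assert(sizeWithLiterals(8, false, false) == 8);
  return 0;
}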
90
bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
4422
90
  if (!isFLAT(MI))
4423
79
    return false;
4424
11
4425
11
  if (MI.memoperands_empty())
4426
6
    return true;
4427
5
4428
5
  for (const MachineMemOperand *MMO : MI.memoperands()) {
4429
5
    if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
4430
2
      return true;
4431
3
  }
4432
3
  return false;
4433
3
}
4434
4435
0
bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
4436
0
  return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
4437
0
}
4438
4439
void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
4440
0
                                            MachineBasicBlock *IfEnd) const {
4441
0
  MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
4442
0
  assert(TI != IfEntry->end());
4443
0
4444
0
  MachineInstr *Branch = &(*TI);
4445
0
  MachineFunction *MF = IfEntry->getParent();
4446
0
  MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
4447
0
4448
0
  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
4449
0
    unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4450
0
    MachineInstr *SIIF =
4451
0
        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
4452
0
            .add(Branch->getOperand(0))
4453
0
            .add(Branch->getOperand(1));
4454
0
    MachineInstr *SIEND =
4455
0
        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
4456
0
            .addReg(DstReg);
4457
0
4458
0
    IfEntry->erase(TI);
4459
0
    IfEntry->insert(IfEntry->end(), SIIF);
4460
0
    IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
4461
0
  }
4462
0
}
4463
4464
void SIInstrInfo::convertNonUniformLoopRegion(
4465
0
    MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
4466
0
  MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
4467
0
  // We expect 2 terminators, one conditional and one unconditional.
4468
0
  assert(TI != LoopEnd->end());
4469
0
4470
0
  MachineInstr *Branch = &(*TI);
4471
0
  MachineFunction *MF = LoopEnd->getParent();
4472
0
  MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
4473
0
4474
0
  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
4475
0
4476
0
    unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4477
0
    unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4478
0
    MachineInstrBuilder HeaderPHIBuilder =
4479
0
        BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
4480
0
    for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
4481
0
                                          E = LoopEntry->pred_end();
4482
0
         PI != E; ++PI) {
4483
0
      if (*PI == LoopEnd) {
4484
0
        HeaderPHIBuilder.addReg(BackEdgeReg);
4485
0
      } else {
4486
0
        MachineBasicBlock *PMBB = *PI;
4487
0
        unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4488
0
        materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
4489
0
                             ZeroReg, 0);
4490
0
        HeaderPHIBuilder.addReg(ZeroReg);
4491
0
      }
4492
0
      HeaderPHIBuilder.addMBB(*PI);
4493
0
    }
4494
0
    MachineInstr *HeaderPhi = HeaderPHIBuilder;
4495
0
    MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
4496
0
                                      get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
4497
0
                                  .addReg(DstReg)
4498
0
                                  .add(Branch->getOperand(0));
4499
0
    MachineInstr *SILOOP =
4500
0
        BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
4501
0
            .addReg(BackEdgeReg)
4502
0
            .addMBB(LoopEntry);
4503
0
4504
0
    LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
4505
0
    LoopEnd->erase(TI);
4506
0
    LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
4507
0
    LoopEnd->insert(LoopEnd->end(), SILOOP);
4508
0
  }
4509
0
}
4510
4511
ArrayRef<std::pair<int, const char *>>
4512
5
SIInstrInfo::getSerializableTargetIndices() const {
4513
5
  static const std::pair<int, const char *> TargetIndices[] = {
4514
5
      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
4515
5
      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
4516
5
      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
4517
5
      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
4518
5
      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
4519
5
  return makeArrayRef(TargetIndices);
4520
5
}
4521
4522
/// This is used by the post-RA scheduler (SchedulePostRAList.cpp).  The
4523
/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
4524
ScheduleHazardRecognizer *
4525
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
4526
11.4k
                                            const ScheduleDAG *DAG) const {
4527
11.4k
  return new GCNHazardRecognizer(DAG->MF);
4528
11.4k
}
4529
4530
/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
4531
/// pass.
4532
ScheduleHazardRecognizer *
4533
15.1k
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
4534
15.1k
  return new GCNHazardRecognizer(MF);
4535
15.1k
}
4536
4537
std::pair<unsigned, unsigned>
4538
3
SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
4539
3
  return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
4540
3
}
4541
4542
ArrayRef<std::pair<unsigned, const char *>>
4543
4
SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
4544
4
  static const std::pair<unsigned, const char *> TargetFlags[] = {
4545
4
    { MO_GOTPCREL, "amdgpu-gotprel" },
4546
4
    { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
4547
4
    { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
4548
4
    { MO_REL32_LO, "amdgpu-rel32-lo" },
4549
4
    { MO_REL32_HI, "amdgpu-rel32-hi" }
4550
4
  };
4551
4
4552
4
  return makeArrayRef(TargetFlags);
4553
4
}
4554
4555
741
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
4556
703
  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
4557
526
         MI.modifiesRegister(AMDGPU::EXEC, &RI);
4558
741
}
4559
4560
MachineInstrBuilder
4561
SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
4562
                           MachineBasicBlock::iterator I,
4563
                           const DebugLoc &DL,
4564
0
                           unsigned DestReg) const {
4565
0
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4566
0
4567
0
  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4568
0
4569
0
  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
4570
0
           .addReg(UnusedCarry, RegState::Define | RegState::Dead);
4571
0
}