Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Line
Count
Source
1
//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
/// \file
8
//===----------------------------------------------------------------------===//
9
//
10
11
#include "AMDGPU.h"
12
#include "AMDGPUSubtarget.h"
13
#include "SIInstrInfo.h"
14
#include "SIMachineFunctionInfo.h"
15
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16
#include "llvm/ADT/DepthFirstIterator.h"
17
#include "llvm/CodeGen/LiveIntervals.h"
18
#include "llvm/CodeGen/MachineFunctionPass.h"
19
#include "llvm/CodeGen/MachineInstrBuilder.h"
20
#include "llvm/CodeGen/MachineRegisterInfo.h"
21
#include "llvm/Support/Debug.h"
22
#include "llvm/Support/raw_ostream.h"
23
#include "llvm/Target/TargetMachine.h"
24
25
#define DEBUG_TYPE "si-fold-operands"
26
using namespace llvm;
27
28
namespace {
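// Describes one pending fold: the instruction being modified, the operand
// index to rewrite, and the value (immediate, frame index, global address,
// or register) that will replace it.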
29
30
struct FoldCandidate {
31
  MachineInstr *UseMI;
32
  union {
33
    MachineOperand *OpToFold;
34
    uint64_t ImmToFold;
35
    int FrameIndexToFold;
36
  };
37
  int ShrinkOpcode;
38
  unsigned char UseOpNo;
39
  MachineOperand::MachineOperandType Kind;
40
  bool Commuted;
41
42
  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
43
                bool Commuted_ = false,
44
                int ShrinkOp = -1) :
45
    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
46
    Kind(FoldOp->getType()),
47
164k
    Commuted(Commuted_) {
48
164k
    if (FoldOp->isImm()) {
49
61.7k
      ImmToFold = FoldOp->getImm();
50
102k
    } else if (FoldOp->isFI()) {
51
171
      FrameIndexToFold = FoldOp->getIndex();
52
102k
    } else {
53
102k
      assert(FoldOp->isReg() || FoldOp->isGlobal());
54
102k
      OpToFold = FoldOp;
55
102k
    }
56
164k
  }
57
58
179k
  bool isFI() const {
59
179k
    return Kind == MachineOperand::MO_FrameIndex;
60
179k
  }
61
62
453k
  bool isImm() const {
63
453k
    return Kind == MachineOperand::MO_Immediate;
64
453k
  }
65
66
151k
  bool isReg() const {
67
151k
    return Kind == MachineOperand::MO_Register;
68
151k
  }
69
70
179k
  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
71
72
636
  bool isCommuted() const {
73
636
    return Commuted;
74
636
  }
75
76
61.6k
  bool needsShrink() const {
77
61.6k
    return ShrinkOpcode != -1;
78
61.6k
  }
79
80
296
  int getShrinkOpcode() const {
81
296
    return ShrinkOpcode;
82
296
  }
83
};
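// Pass that folds immediates, frame indexes, global addresses, and register
// copies into the instructions that use them, and folds clamp/omod modifiers
// into the defining instructions.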
84
85
class SIFoldOperands : public MachineFunctionPass {
86
public:
87
  static char ID;
88
  MachineRegisterInfo *MRI;
89
  const SIInstrInfo *TII;
90
  const SIRegisterInfo *TRI;
91
  const GCNSubtarget *ST;
92
  const SIMachineFunctionInfo *MFI;
93
94
  void foldOperand(MachineOperand &OpToFold,
95
                   MachineInstr *UseMI,
96
                   int UseOpIdx,
97
                   SmallVectorImpl<FoldCandidate> &FoldList,
98
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
99
100
  void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
101
102
  const MachineOperand *isClamp(const MachineInstr &MI) const;
103
  bool tryFoldClamp(MachineInstr &MI);
104
105
  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
106
  bool tryFoldOMod(MachineInstr &MI);
107
108
public:
109
4.79k
  SIFoldOperands() : MachineFunctionPass(ID) {
110
4.79k
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
111
4.79k
  }
112
113
  bool runOnMachineFunction(MachineFunction &MF) override;
114
115
55.2k
  StringRef getPassName() const override { return "SI Fold Operands"; }
116
117
4.74k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
118
4.74k
    AU.setPreservesCFG();
119
4.74k
    MachineFunctionPass::getAnalysisUsage(AU);
120
4.74k
  }
121
};
122
123
} // End anonymous namespace.
124
125
INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
126
                "SI Fold Operands", false, false)
127
128
char SIFoldOperands::ID = 0;
129
130
char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
131
132
// Wrapper around isInlineConstant that understands special cases when
133
// instruction types are replaced during operand folding.
134
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
135
                                     const MachineInstr &UseMI,
136
                                     unsigned OpNo,
137
173k
                                     const MachineOperand &OpToFold) {
138
173k
  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
139
58.0k
    return true;
140
115k
141
115k
  unsigned Opc = UseMI.getOpcode();
142
115k
  switch (Opc) {
143
115k
  case AMDGPU::V_MAC_F32_e64:
144
400
  case AMDGPU::V_MAC_F16_e64:
145
400
  case AMDGPU::V_FMAC_F32_e64: {
146
400
    // Special case for mac. Since this is replaced with mad when folded into
147
400
    // src2, we need to check the legality for the final instruction.
148
400
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
149
400
    if (static_cast<int>(OpNo) == Src2Idx) {
150
146
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
151
146
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
152
146
153
146
      unsigned Opc = IsFMA ?
154
132
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
155
146
      const MCInstrDesc &MadDesc = TII->get(Opc);
156
146
      return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
157
146
    }
158
254
    return false;
159
254
  }
160
114k
  default:
161
114k
    return false;
162
115k
  }
163
115k
}
164
165
// TODO: Add heuristic that the frame index might not fit in the addressing mode
166
// immediate offset to avoid materializing in loops.
167
static bool frameIndexMayFold(const SIInstrInfo *TII,
168
                              const MachineInstr &UseMI,
169
                              int OpNo,
170
430k
                              const MachineOperand &OpToFold) {
171
430k
  return OpToFold.isFI() &&
172
430k
    (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
173
430k
    OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
174
430k
}
175
176
0
FunctionPass *llvm::createSIFoldOperandsPass() {
177
0
  return new SIFoldOperands();
178
0
}
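// Applies a recorded FoldCandidate to its use instruction, handling the
// packed-immediate (op_sel) and shrink-to-VOP2 special cases.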
179
180
static bool updateOperand(FoldCandidate &Fold,
181
                          const SIInstrInfo &TII,
182
                          const TargetRegisterInfo &TRI,
183
151k
                          const GCNSubtarget &ST) {
184
151k
  MachineInstr *MI = Fold.UseMI;
185
151k
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
186
151k
  assert(Old.isReg());
187
151k
188
151k
  if (Fold.isImm()) {
189
61.4k
    if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
190
61.4k
        !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
191
61.4k
        AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold),
192
221
                                       ST.hasInv2PiInlineImm())) {
193
218
      // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
194
218
      // already set.
195
218
      unsigned Opcode = MI->getOpcode();
196
218
      int OpNo = MI->getOperandNo(&Old);
197
218
      int ModIdx = -1;
198
218
      if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
199
95
        ModIdx = AMDGPU::OpName::src0_modifiers;
200
123
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
201
122
        ModIdx = AMDGPU::OpName::src1_modifiers;
202
1
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
203
1
        ModIdx = AMDGPU::OpName::src2_modifiers;
204
218
      assert(ModIdx != -1);
205
218
      ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
206
218
      MachineOperand &Mod = MI->getOperand(ModIdx);
207
218
      unsigned Val = Mod.getImm();
208
218
      if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
209
0
        return false;
210
218
      // Only apply the following transformation if that operand requires
211
218
      // a packed immediate.
212
218
      switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
213
218
      case AMDGPU::OPERAND_REG_IMM_V2FP16:
214
217
      case AMDGPU::OPERAND_REG_IMM_V2INT16:
215
217
      case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
216
217
      case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
217
217
        // If upper part is all zero we do not need op_sel_hi.
218
217
        if (!isUInt<16>(Fold.ImmToFold)) {
219
194
          if (!(Fold.ImmToFold & 0xffff)) {
220
12
            Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
221
12
            Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
222
12
            Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
223
12
            return true;
224
12
          }
225
182
          Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
226
182
          Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
227
182
          return true;
228
182
        }
229
23
        break;
230
23
      default:
231
1
        break;
232
151k
      }
233
151k
    }
234
61.4k
  }
235
151k
236
151k
  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
237
636
    MachineBasicBlock *MBB = MI->getParent();
238
636
    auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
239
636
    if (Liveness != MachineBasicBlock::LQR_Dead)
240
340
      return false;
241
296
242
296
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
243
296
    int Op32 = Fold.getShrinkOpcode();
244
296
    MachineOperand &Dst0 = MI->getOperand(0);
245
296
    MachineOperand &Dst1 = MI->getOperand(1);
246
296
    assert(Dst0.isDef() && Dst1.isDef());
247
296
248
296
    bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
249
296
250
296
    const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
251
296
    unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
252
296
253
296
    MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
254
296
255
296
    if (HaveNonDbgCarryUse) {
256
66
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
257
66
        .addReg(AMDGPU::VCC, RegState::Kill);
258
66
    }
259
296
260
296
    // Keep the old instruction around to avoid breaking iterators, but
261
296
    // replace it with a dummy instruction to remove uses.
262
296
    //
263
296
    // FIXME: We should not invert how this pass looks at operands to avoid
264
296
    // this. Should track set of foldable movs instead of looking for uses
265
296
    // when looking at a use.
266
296
    Dst0.setReg(NewReg0);
267
1.77k
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
268
1.48k
      MI->RemoveOperand(I);
269
296
    MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));
270
296
271
296
    if (Fold.isCommuted())
272
296
      TII.commuteInstruction(*Inst32, false);
273
296
    return true;
274
296
  }
275
150k
276
150k
  assert(!Fold.needsShrink() && "not handled");
277
150k
278
150k
  if (Fold.isImm()) {
279
60.7k
    Old.ChangeToImmediate(Fold.ImmToFold);
280
60.7k
    return true;
281
60.7k
  }
282
89.8k
283
89.8k
  if (Fold.isGlobal()) {
284
115
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
285
115
                   Fold.OpToFold->getTargetFlags());
286
115
    return true;
287
115
  }
288
89.7k
289
89.7k
  if (Fold.isFI()) {
290
131
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
291
131
    return true;
292
131
  }
293
89.6k
294
89.6k
  MachineOperand *New = Fold.OpToFold;
295
89.6k
  Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
296
89.6k
  Old.setIsUndef(New->isUndef());
297
89.6k
  return true;
298
89.6k
}
299
300
static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
301
89.2k
                              const MachineInstr *MI) {
302
89.2k
  for (auto Candidate : FoldList) {
303
6.23k
    if (Candidate.UseMI == MI)
304
0
      return true;
305
6.23k
  }
306
89.2k
  return false;
307
89.2k
}
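// Records a fold of OpToFold into operand OpNo of MI if it is legal, trying
// the mac->mad/fma rewrite, the s_setreg special case, and commuting the
// instruction when the operand is not directly legal.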
308
309
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
310
                             MachineInstr *MI, unsigned OpNo,
311
                             MachineOperand *OpToFold,
312
239k
                             const SIInstrInfo *TII) {
313
239k
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
314
89.3k
    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
315
89.3k
    unsigned Opc = MI->getOpcode();
316
89.3k
    if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
317
89.3k
         Opc == AMDGPU::V_FMAC_F32_e64) &&
318
89.3k
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
319
153
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
320
153
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
321
153
      unsigned NewOpc = IsFMA ?
322
137
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
323
153
324
153
      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
325
153
      // to fold the operand.
326
153
      MI->setDesc(TII->get(NewOpc));
327
153
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
328
153
      if (FoldAsMAD) {
329
81
        MI->untieRegOperand(OpNo);
330
81
        return true;
331
81
      }
332
72
      MI->setDesc(TII->get(Opc));
333
72
    }
334
89.3k
335
89.3k
    // Special case for s_setreg_b32
336
89.3k
    if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
337
52
      MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
338
52
      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
339
52
      return true;
340
52
    }
341
89.2k
342
89.2k
    // If we are already folding into another operand of MI, then
343
89.2k
    // we can't commute the instruction, otherwise we risk making the
344
89.2k
    // other fold illegal.
345
89.2k
    if (isUseMIInFoldList(FoldList, MI))
346
0
      return false;
347
89.2k
348
89.2k
    unsigned CommuteOpNo = OpNo;
349
89.2k
350
89.2k
    // Operand is not legal, so try to commute the instruction to
351
89.2k
    // see if this makes it possible to fold.
352
89.2k
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
353
89.2k
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
354
89.2k
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
355
89.2k
356
89.2k
    if (CanCommute) {
357
21.2k
      if (CommuteIdx0 == OpNo)
358
9.87k
        CommuteOpNo = CommuteIdx1;
359
11.3k
      else if (CommuteIdx1 == OpNo)
360
10.6k
        CommuteOpNo = CommuteIdx0;
361
21.2k
    }
362
89.2k
363
89.2k
364
89.2k
    // One of operands might be an Imm operand, and OpNo may refer to it after
365
89.2k
    // the call of commuteInstruction() below. Such situations are avoided
366
89.2k
    // here explicitly as OpNo must be a register operand to be a candidate
367
89.2k
    // for memory folding.
368
89.2k
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
369
21.2k
                       !MI->getOperand(CommuteIdx1).isReg()))
370
5.50k
      return false;
371
83.7k
372
83.7k
    if (!CanCommute ||
373
83.7k
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
374
68.4k
      return false;
375
15.2k
376
15.2k
    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
377
13.9k
      if ((Opc == AMDGPU::V_ADD_I32_e64 ||
378
13.9k
           Opc == AMDGPU::V_SUB_I32_e64 ||
379
13.9k
           Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
380
13.9k
          (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
381
640
        MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
382
640
383
640
        // Verify the other operand is a VGPR, otherwise we would violate the
384
640
        // constant bus restriction.
385
640
        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
386
640
        MachineOperand &OtherOp = MI->getOperand(OtherIdx);
387
640
        if (!OtherOp.isReg() ||
388
640
            !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
389
4
          return false;
390
636
391
636
        assert(MI->getOperand(1).isDef());
392
636
393
636
        // Make sure to get the 32-bit version of the commuted opcode.
394
636
        unsigned MaybeCommutedOpc = MI->getOpcode();
395
636
        int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
396
636
397
636
        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
398
636
                                         Op32));
399
636
        return true;
400
636
      }
401
13.3k
402
13.3k
      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
403
13.3k
      return false;
404
13.3k
    }
405
1.34k
406
1.34k
    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
407
1.34k
    return true;
408
1.34k
  }
409
149k
410
149k
  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
411
149k
  return true;
412
149k
}
413
414
// If the use operand doesn't care about the value, this may be an operand only
415
// used for register indexing, in which case it is unsafe to fold.
416
static bool isUseSafeToFold(const SIInstrInfo *TII,
417
                            const MachineInstr &MI,
418
456k
                            const MachineOperand &UseMO) {
419
456k
  return !UseMO.isUndef() && !TII->isSDWA(MI);
420
456k
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
421
456k
}
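// Tries to fold an inline constant (or a REG_SEQUENCE splat of identical
// inline constants) into an operand with an OPERAND_REG_INLINE_AC_* type.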
422
423
static bool tryToFoldACImm(const SIInstrInfo *TII,
424
                           const MachineOperand &OpToFold,
425
                           MachineInstr *UseMI,
426
                           unsigned UseOpIdx,
427
490k
                           SmallVectorImpl<FoldCandidate> &FoldList) {
428
490k
  const MCInstrDesc &Desc = UseMI->getDesc();
429
490k
  const MCOperandInfo *OpInfo = Desc.OpInfo;
430
490k
  if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
431
13.3k
    return false;
432
476k
433
476k
  uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
434
476k
  if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
435
476k
      OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
436
476k
    return false;
437
558
438
558
  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) {
439
0
    UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
440
0
    return true;
441
0
  }
442
558
443
558
  if (!OpToFold.isReg())
444
0
    return false;
445
558
446
558
  unsigned UseReg = OpToFold.getReg();
447
558
  if (!TargetRegisterInfo::isVirtualRegister(UseReg))
448
0
    return false;
449
558
450
558
  if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
451
0
        return FC.UseMI == UseMI; }) != FoldList.end())
452
0
    return false;
453
558
454
558
  MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
455
558
  const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
456
558
  if (!Def || !Def->isRegSequence())
457
44
    return false;
458
514
459
514
  int64_t Imm;
460
514
  MachineOperand *Op;
461
1.03k
  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
462
1.02k
    const MachineOperand &Sub = Def->getOperand(I);
463
1.02k
    if (!Sub.isReg() || Sub.getSubReg())
464
0
      return false;
465
1.02k
    MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
466
1.02k
    while (SubDef && !SubDef->isMoveImmediate() &&
467
1.02k
           !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
468
2
      SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
469
1.02k
    if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
470
349
      return false;
471
677
    Op = &SubDef->getOperand(1);
472
677
    auto SubImm = Op->getImm();
473
677
    if (I == 1) {
474
184
      if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
475
0
        return false;
476
184
477
184
      Imm = SubImm;
478
184
      continue;
479
184
    }
480
493
    if (Imm != SubImm)
481
160
      return false; // Can only fold splat constants
482
493
  }
483
514
484
514
  FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
485
5
  return true;
486
514
}
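// Attempts to fold OpToFold into operand UseOpIdx of UseMI, looking through
// REG_SEQUENCE and COPY users and queueing legal folds in FoldList.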
487
488
void SIFoldOperands::foldOperand(
489
  MachineOperand &OpToFold,
490
  MachineInstr *UseMI,
491
  int UseOpIdx,
492
  SmallVectorImpl<FoldCandidate> &FoldList,
493
456k
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
494
456k
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
495
456k
496
456k
  if (!isUseSafeToFold(TII, *UseMI, UseOp))
497
1.04k
    return;
498
454k
499
454k
  // FIXME: Fold operands with subregs.
500
454k
  if (UseOp.isReg() && OpToFold.isReg()) {
501
343k
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
502
6.36k
      return;
503
337k
504
337k
    // Don't fold subregister extracts into tied operands unless it is a full
505
337k
    // copy since a subregister use tied to a full register def doesn't really
506
337k
    // make sense. e.g. don't fold:
507
337k
    //
508
337k
    // %1 = COPY %0:sub1
509
337k
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
510
337k
    //
511
337k
    //  into
512
337k
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
513
337k
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
514
426
      return;
515
448k
  }
516
448k
517
448k
  // Special case for REG_SEQUENCE: We can't fold literals into
518
448k
  // REG_SEQUENCE instructions, so we have to fold them into the
519
448k
  // uses of REG_SEQUENCE.
520
448k
  if (UseMI->isRegSequence()) {
521
133k
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
522
133k
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
523
133k
524
133k
    MachineRegisterInfo::use_iterator Next;
525
133k
    for (MachineRegisterInfo::use_iterator
526
133k
           RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
527
308k
         RSUse != RSE; RSUse = Next) {
528
175k
      Next = std::next(RSUse);
529
175k
530
175k
      MachineInstr *RSUseMI = RSUse->getParent();
531
175k
532
175k
      if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
533
175k
                         RSUse.getOperandNo(), FoldList))
534
5
        continue;
535
175k
536
175k
      if (RSUse->getSubReg() != RegSeqDstSubReg)
537
174k
        continue;
538
1.04k
539
1.04k
      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
540
1.04k
                  CopiesToReplace);
541
1.04k
    }
542
133k
543
133k
    return;
544
133k
  }
545
315k
546
315k
  if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
547
0
    return;
548
315k
549
315k
  if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
550
20
    // Sanity check that this is a stack access.
551
20
    // FIXME: Should probably use stack pseudos before frame lowering.
552
20
    MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
553
20
    if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
554
15
                           SOff->getReg() != MFI->getStackPtrOffsetReg()))
555
5
      return;
556
15
557
15
    if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
558
15
        MFI->getScratchRSrcReg())
559
1
      return;
560
14
561
14
    // A frame index will resolve to a positive constant, so it should always be
562
14
    // safe to fold the addressing mode, even pre-GFX9.
563
14
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
564
14
    SOff->setReg(MFI->getStackPtrOffsetReg());
565
14
    return;
566
14
  }
567
315k
568
315k
  bool FoldingImmLike =
569
315k
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
570
315k
571
315k
  if (FoldingImmLike && UseMI->isCopy()) {
572
14.4k
    unsigned DestReg = UseMI->getOperand(0).getReg();
573
14.4k
    const TargetRegisterClass *DestRC
574
14.4k
      = TargetRegisterInfo::isVirtualRegister(DestReg) ?
575
13.8k
      MRI->getRegClass(DestReg) :
576
14.4k
      TRI->getPhysRegClass(DestReg);
577
14.4k
578
14.4k
    unsigned SrcReg  = UseMI->getOperand(1).getReg();
579
14.4k
    if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
580
14.4k
      TargetRegisterInfo::isVirtualRegister(SrcReg)) {
581
13.8k
      const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
582
13.8k
      if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
583
4.54k
        MachineRegisterInfo::use_iterator NextUse;
584
4.54k
        SmallVector<FoldCandidate, 4> CopyUses;
585
4.54k
        for (MachineRegisterInfo::use_iterator
586
4.54k
          Use = MRI->use_begin(DestReg), E = MRI->use_end();
587
17.0k
          Use != E; Use = NextUse) {
588
12.5k
          NextUse = std::next(Use);
589
12.5k
          FoldCandidate FC = FoldCandidate(Use->getParent(),
590
12.5k
           Use.getOperandNo(), &UseMI->getOperand(1));
591
12.5k
          CopyUses.push_back(FC);
592
12.5k
       }
593
12.5k
        for (auto & F : CopyUses) {
594
12.5k
          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
595
12.5k
           FoldList, CopiesToReplace);
596
12.5k
        }
597
4.54k
      }
598
13.8k
    }
599
14.4k
600
14.4k
    if (DestRC == &AMDGPU::AGPR_32RegClass &&
601
14.4k
        TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
602
2
      UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
603
2
      UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
604
2
      CopiesToReplace.push_back(UseMI);
605
2
      return;
606
2
    }
607
14.4k
608
14.4k
    // In order to fold immediates into copies, we need to change the
609
14.4k
    // copy to a MOV.
610
14.4k
611
14.4k
    unsigned MovOp = TII->getMovOpcode(DestRC);
612
14.4k
    if (MovOp == AMDGPU::COPY)
613
0
      return;
614
14.4k
615
14.4k
    UseMI->setDesc(TII->get(MovOp));
616
14.4k
    CopiesToReplace.push_back(UseMI);
617
300k
  } else {
618
300k
    if (UseMI->isCopy() && OpToFold.isReg() &&
619
300k
        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
620
300k
        TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
621
300k
        TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
622
300k
        !UseMI->getOperand(1).getSubReg()) {
623
4.58k
      unsigned Size = TII->getOpSize(*UseMI, 1);
624
4.58k
      UseMI->getOperand(1).setReg(OpToFold.getReg());
625
4.58k
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
626
4.58k
      UseMI->getOperand(1).setIsKill(false);
627
4.58k
      CopiesToReplace.push_back(UseMI);
628
4.58k
      OpToFold.setIsKill(false);
629
4.58k
      if (Size != 4)
630
6
        return;
631
4.58k
      if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
632
4.58k
          TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
633
0
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
634
4.58k
      else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
635
4.58k
               TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
636
528
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32));
637
4.58k
      return;
638
4.58k
    }
639
296k
640
296k
    unsigned UseOpc = UseMI->getOpcode();
641
296k
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
642
296k
        (UseOpc == AMDGPU::V_READLANE_B32 &&
643
295k
         (int)UseOpIdx ==
644
130
         AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
645
130
      // %vgpr = V_MOV_B32 imm
646
130
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
647
130
      // =>
648
130
      // %sgpr = S_MOV_B32 imm
649
130
      if (FoldingImmLike) {
650
19
        if (execMayBeModifiedBeforeUse(*MRI,
651
19
                                       UseMI->getOperand(UseOpIdx).getReg(),
652
19
                                       *OpToFold.getParent(),
653
19
                                       *UseMI))
654
3
          return;
655
16
656
16
        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
657
16
658
16
        // FIXME: ChangeToImmediate should clear subreg
659
16
        UseMI->getOperand(1).setSubReg(0);
660
16
        if (OpToFold.isImm())
661
15
          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
662
1
        else
663
1
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
664
16
        UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
665
16
        return;
666
16
      }
667
111
668
111
      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
669
37
        if (execMayBeModifiedBeforeUse(*MRI,
670
37
                                       UseMI->getOperand(UseOpIdx).getReg(),
671
37
                                       *OpToFold.getParent(),
672
37
                                       *UseMI))
673
3
          return;
674
34
675
34
        // %vgpr = COPY %sgpr0
676
34
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
677
34
        // =>
678
34
        // %sgpr1 = COPY %sgpr0
679
34
        UseMI->setDesc(TII->get(AMDGPU::COPY));
680
34
        UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
681
34
        return;
682
34
      }
683
111
    }
684
295k
685
295k
    const MCInstrDesc &UseDesc = UseMI->getDesc();
686
295k
687
295k
    // Don't fold into target independent nodes.  Target independent opcodes
688
295k
    // don't have defined register classes.
689
295k
    if (UseDesc.isVariadic() ||
690
295k
        UseOp.isImplicit() ||
691
295k
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
692
71.4k
      return;
693
238k
  }
694
238k
695
238k
  if (!FoldingImmLike) {
696
164k
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
697
164k
698
164k
    // FIXME: We could try to change the instruction from 64-bit to 32-bit
699
164k
    // to enable more folding opportunities.  The shrink operands pass
700
164k
    // already does this.
701
164k
    return;
702
164k
  }
703
74.1k
704
74.1k
705
74.1k
  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
706
74.1k
  const TargetRegisterClass *FoldRC =
707
74.1k
    TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
708
74.1k
709
74.1k
  // Split 64-bit constants into 32-bits for folding.
710
74.1k
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
711
3.99k
    unsigned UseReg = UseOp.getReg();
712
3.99k
    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
713
3.99k
714
3.99k
    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
715
0
      return;
716
3.99k
717
3.99k
    APInt Imm(64, OpToFold.getImm());
718
3.99k
    if (UseOp.getSubReg() == AMDGPU::sub0) {
719
1.90k
      Imm = Imm.getLoBits(32);
720
2.08k
    } else {
721
2.08k
      assert(UseOp.getSubReg() == AMDGPU::sub1);
722
2.08k
      Imm = Imm.getHiBits(32);
723
2.08k
    }
724
3.99k
725
3.99k
    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
726
3.99k
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
727
3.99k
    return;
728
3.99k
  }
729
70.1k
730
70.1k
731
70.1k
732
70.1k
  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
733
70.1k
}
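// Evaluates a 32-bit bitwise or shift opcode on two known constants; returns
// false for opcodes that are not handled here.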
734
735
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
736
3.72k
                                  uint32_t LHS, uint32_t RHS) {
737
3.72k
  switch (Opcode) {
738
3.72k
  case AMDGPU::V_AND_B32_e64:
739
13
  case AMDGPU::V_AND_B32_e32:
740
13
  case AMDGPU::S_AND_B32:
741
13
    Result = LHS & RHS;
742
13
    return true;
743
13
  case AMDGPU::V_OR_B32_e64:
744
10
  case AMDGPU::V_OR_B32_e32:
745
10
  case AMDGPU::S_OR_B32:
746
10
    Result = LHS | RHS;
747
10
    return true;
748
10
  case AMDGPU::V_XOR_B32_e64:
749
0
  case AMDGPU::V_XOR_B32_e32:
750
0
  case AMDGPU::S_XOR_B32:
751
0
    Result = LHS ^ RHS;
752
0
    return true;
753
9
  case AMDGPU::V_LSHL_B32_e64:
754
9
  case AMDGPU::V_LSHL_B32_e32:
755
9
  case AMDGPU::S_LSHL_B32:
756
9
    // The instruction ignores the high bits for out of bounds shifts.
757
9
    Result = LHS << (RHS & 31);
758
9
    return true;
759
9
  case AMDGPU::V_LSHLREV_B32_e64:
760
3
  case AMDGPU::V_LSHLREV_B32_e32:
761
3
    Result = RHS << (LHS & 31);
762
3
    return true;
763
8
  case AMDGPU::V_LSHR_B32_e64:
764
8
  case AMDGPU::V_LSHR_B32_e32:
765
8
  case AMDGPU::S_LSHR_B32:
766
8
    Result = LHS >> (RHS & 31);
767
8
    return true;
768
8
  case AMDGPU::V_LSHRREV_B32_e64:
769
3
  case AMDGPU::V_LSHRREV_B32_e32:
770
3
    Result = RHS >> (LHS & 31);
771
3
    return true;
772
8
  case AMDGPU::V_ASHR_I32_e64:
773
8
  case AMDGPU::V_ASHR_I32_e32:
774
8
  case AMDGPU::S_ASHR_I32:
775
8
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
776
8
    return true;
777
8
  case AMDGPU::V_ASHRREV_I32_e64:
778
3
  case AMDGPU::V_ASHRREV_I32_e32:
779
3
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
780
3
    return true;
781
3.66k
  default:
782
3.66k
    return false;
783
3.72k
  }
784
3.72k
}
785
786
78
static unsigned getMovOpc(bool IsScalar) {
787
78
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
788
78
}
789
790
/// Remove any leftover implicit operands from mutating the instruction. e.g.
791
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
792
/// anymore.
793
208
static void stripExtraCopyOperands(MachineInstr &MI) {
794
208
  const MCInstrDesc &Desc = MI.getDesc();
795
208
  unsigned NumOps = Desc.getNumOperands() +
796
208
                    Desc.getNumImplicitUses() +
797
208
                    Desc.getNumImplicitDefs();
798
208
799
347
  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
800
139
    MI.RemoveOperand(I);
801
208
}
802
803
205
static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
804
205
  MI.setDesc(NewDesc);
805
205
  stripExtraCopyOperands(MI);
806
205
}
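// If Op is a virtual register defined by a move-immediate, returns that
// immediate operand; otherwise returns Op itself.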
807
808
static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
809
130k
                                               MachineOperand &Op) {
810
130k
  if (Op.isReg()) {
811
128k
    // If this has a subregister, it obviously is a register source.
812
128k
    if (Op.getSubReg() != AMDGPU::NoSubRegister ||
813
128k
        !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
814
10.7k
      return &Op;
815
118k
816
118k
    MachineInstr *Def = MRI.getVRegDef(Op.getReg());
817
118k
    if (Def && Def->isMoveImmediate()) {
818
66.6k
      MachineOperand &ImmSrc = Def->getOperand(1);
819
66.6k
      if (ImmSrc.isImm())
820
66.5k
        return &ImmSrc;
821
52.9k
    }
822
118k
  }
823
52.9k
824
52.9k
  return &Op;
825
52.9k
}
826
827
// Try to simplify operations with a constant that may appear after instruction
828
// selection.
829
// TODO: See if a frame index with a fixed offset can fold.
830
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
831
                              const SIInstrInfo *TII,
832
                              MachineInstr *MI,
833
171k
                              MachineOperand *ImmOp) {
834
171k
  unsigned Opc = MI->getOpcode();
835
171k
  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
836
171k
      Opc == AMDGPU::S_NOT_B32) {
837
10
    MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
838
10
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
839
10
    return true;
840
10
  }
841
171k
842
171k
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
843
171k
  if (Src1Idx == -1)
844
106k
    return false;
845
65.1k
846
65.1k
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
847
65.1k
  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
848
65.1k
  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
849
65.1k
850
65.1k
  if (!Src0->isImm() && !Src1->isImm())
851
1.00k
    return false;
852
64.1k
853
64.1k
  if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
854
52
    if (Src0->isImm() && Src0->getImm() == 0) {
855
8
      // v_lshl_or_b32 0, X, Y -> copy Y
856
8
      // v_lshl_or_b32 0, X, K -> v_mov_b32 K
857
8
      bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
858
8
      MI->RemoveOperand(Src1Idx);
859
8
      MI->RemoveOperand(Src0Idx);
860
8
861
8
      MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
862
8
      return true;
863
8
    }
864
64.0k
  }
865
64.0k
866
64.0k
  // and k0, k1 -> v_mov_b32 (k0 & k1)
867
64.0k
  // or k0, k1 -> v_mov_b32 (k0 | k1)
868
64.0k
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
869
64.0k
  if (Src0->isImm() && Src1->isImm()) {
870
3.72k
    int32_t NewImm;
871
3.72k
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
872
3.66k
      return false;
873
57
874
57
    const SIRegisterInfo &TRI = TII->getRegisterInfo();
875
57
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
876
57
877
57
    // Be careful to change the right operand, src0 may belong to a different
878
57
    // instruction.
879
57
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
880
57
    MI->RemoveOperand(Src1Idx);
881
57
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
882
57
    return true;
883
57
  }
884
60.3k
885
60.3k
  if (!MI->isCommutable())
886
12.3k
    return false;
887
47.9k
888
47.9k
  if (Src0->isImm() && !Src1->isImm()) {
889
15.8k
    std::swap(Src0, Src1);
890
15.8k
    std::swap(Src0Idx, Src1Idx);
891
15.8k
  }
892
47.9k
893
47.9k
  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
894
47.9k
  if (Opc == AMDGPU::V_OR_B32_e64 ||
895
47.9k
      Opc == AMDGPU::V_OR_B32_e32 ||
896
47.9k
      Opc == AMDGPU::S_OR_B32) {
897
1.08k
    if (Src1Val == 0) {
898
116
      // y = or x, 0 => y = copy x
899
116
      MI->RemoveOperand(Src1Idx);
900
116
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
901
967
    } else if (Src1Val == -1) {
902
2
      // y = or x, -1 => y = v_mov_b32 -1
903
2
      MI->RemoveOperand(Src1Idx);
904
2
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
905
2
    } else
906
965
      return false;
907
118
908
118
    return true;
909
118
  }
910
46.9k
911
46.9k
  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
912
46.9k
      MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
913
46.9k
      MI->getOpcode() == AMDGPU::S_AND_B32) {
914
12.7k
    if (Src1Val == 0) {
915
4
      // y = and x, 0 => y = v_mov_b32 0
916
4
      MI->RemoveOperand(Src0Idx);
917
4
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
918
12.7k
    } else if (Src1Val == -1) {
919
3
      // y = and x, -1 => y = copy x
920
3
      MI->RemoveOperand(Src1Idx);
921
3
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
922
3
      stripExtraCopyOperands(*MI);
923
3
    } else
924
12.7k
      return false;
925
7
926
7
    return true;
927
7
  }
928
34.1k
929
34.1k
  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
930
34.1k
      MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
931
34.1k
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
932
397
    if (Src1Val == 0) {
933
6
      // y = xor x, 0 => y = copy x
934
6
      MI->RemoveOperand(Src1Idx);
935
6
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
936
6
      return true;
937
6
    }
938
34.1k
  }
939
34.1k
940
34.1k
  return false;
941
34.1k
}
942
943
// Try to fold an instruction into a simpler one
944
static bool tryFoldInst(const SIInstrInfo *TII,
945
1.31M
                        MachineInstr *MI) {
946
1.31M
  unsigned Opc = MI->getOpcode();
947
1.31M
948
1.31M
  if (Opc == AMDGPU::V_CNDMASK_B32_e32    ||
949
1.31M
      Opc == AMDGPU::V_CNDMASK_B32_e64 ||
950
1.31M
      Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
951
14.7k
    const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
952
14.7k
    const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
953
14.7k
    int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
954
14.7k
    int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
955
14.7k
    if (Src1->isIdenticalTo(*Src0) &&
956
14.7k
        (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) &&
957
14.7k
        (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) {
958
7
      LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
959
7
      auto &NewDesc =
960
7
          TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
961
7
      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
962
7
      if (Src2Idx != -1)
963
6
        MI->RemoveOperand(Src2Idx);
964
7
      MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
965
7
      if (Src1ModIdx != -1)
966
6
        MI->RemoveOperand(Src1ModIdx);
967
7
      if (Src0ModIdx != -1)
968
6
        MI->RemoveOperand(Src0ModIdx);
969
7
      mutateCopyOp(*MI, NewDesc);
970
7
      LLVM_DEBUG(dbgs() << *MI << '\n');
971
7
      return true;
972
7
    }
973
1.31M
  }
974
1.31M
975
1.31M
  return false;
976
1.31M
}
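// Collects every foldable use of MI's destination register, constant-folding
// users where possible, and then applies the recorded folds.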
977
978
void SIFoldOperands::foldInstOperand(MachineInstr &MI,
979
390k
                                     MachineOperand &OpToFold) const {
980
390k
  // We need to mutate the operands of new mov instructions to add implicit
981
390k
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
982
390k
  // this.
983
390k
  SmallVector<MachineInstr *, 4> CopiesToReplace;
984
390k
  SmallVector<FoldCandidate, 4> FoldList;
985
390k
  MachineOperand &Dst = MI.getOperand(0);
986
390k
987
390k
  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
988
390k
  if (FoldingImm) {
989
106k
    unsigned NumLiteralUses = 0;
990
106k
    MachineOperand *NonInlineUse = nullptr;
991
106k
    int NonInlineUseOpNo = -1;
992
106k
993
106k
    MachineRegisterInfo::use_iterator NextUse;
994
106k
    for (MachineRegisterInfo::use_iterator
995
106k
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
996
279k
         Use != E; Use = NextUse) {
997
173k
      NextUse = std::next(Use);
998
173k
      MachineInstr *UseMI = Use->getParent();
999
173k
      unsigned OpNo = Use.getOperandNo();
1000
173k
1001
173k
      // Folding the immediate may reveal operations that can be constant
1002
173k
      // folded or replaced with a copy. This can happen for example after
1003
173k
      // frame indices are lowered to constants or from splitting 64-bit
1004
173k
      // constants.
1005
173k
      //
1006
173k
      // We may also encounter cases where one or both operands are
1007
173k
      // immediates materialized into a register, which would ordinarily not
1008
173k
      // be folded due to multiple uses or operand constraints.
1009
173k
1010
173k
      if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
1011
206
        LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');
1012
206
1013
206
        // Some constant folding cases change the same immediate's use to a new
1014
206
        // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
1015
206
        // again. The same constant folded instruction could also have a second
1016
206
        // use operand.
1017
206
        NextUse = MRI->use_begin(Dst.getReg());
1018
206
        FoldList.clear();
1019
206
        continue;
1020
206
      }
1021
173k
1022
173k
      // Try to fold any inline immediate uses, and then only fold other
1023
173k
      // constants if they have one use.
1024
173k
      //
1025
173k
      // The legality of the inline immediate must be checked based on the use
1026
173k
      // operand, not the defining instruction, because 32-bit instructions
1027
173k
      // with 32-bit inline immediate sources may be used to materialize
1028
173k
      // constants used in 16-bit operands.
1029
173k
      //
1030
173k
      // e.g. it is unsafe to fold:
1031
173k
      //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
1032
173k
      //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
1033
173k
1034
173k
      // Folding immediates with more than one use will increase program size.
1035
173k
      // FIXME: This will also reduce register usage, which may be better
1036
173k
      // in some cases. A better heuristic is needed.
1037
173k
      if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
1038
58.1k
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
1039
114k
      } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
1040
20
        foldOperand(OpToFold, UseMI, OpNo, FoldList,
1041
20
                    CopiesToReplace);
1042
114k
      } else {
1043
114k
        if (++NumLiteralUses == 1) {
1044
72.0k
          NonInlineUse = &*Use;
1045
72.0k
          NonInlineUseOpNo = OpNo;
1046
72.0k
        }
1047
114k
      }
1048
173k
    }
1049
106k
1050
106k
    if (NumLiteralUses == 1) {
1051
53.6k
      MachineInstr *UseMI = NonInlineUse->getParent();
1052
53.6k
      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
1053
53.6k
    }
1054
284k
  } else {
1055
284k
    // Folding register.
1056
284k
    SmallVector <MachineRegisterInfo::use_iterator, 4> UsesToProcess;
1057
284k
    for (MachineRegisterInfo::use_iterator
1058
284k
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
1059
615k
         Use != E; ++Use) {
1060
330k
      UsesToProcess.push_back(Use);
1061
330k
    }
1062
330k
    for (auto U : UsesToProcess) {
1063
330k
      MachineInstr *UseMI = U->getParent();
1064
330k
1065
330k
      foldOperand(OpToFold, UseMI, U.getOperandNo(),
1066
330k
        FoldList, CopiesToReplace);
1067
330k
    }
1068
284k
  }
1069
390k
1070
390k
  MachineFunction *MF = MI.getParent()->getParent();
1071
390k
  // Make sure we add EXEC uses to any new v_mov instructions created.
1072
390k
  for (MachineInstr *Copy : CopiesToReplace)
1073
19.0k
    Copy->addImplicitDefUseOperands(*MF);
1074
390k
1075
390k
  for (FoldCandidate &Fold : FoldList) {
1076
151k
    if (updateOperand(Fold, *TII, *TRI, *ST)) {
1077
151k
      // Clear kill flags.
1078
151k
      if (Fold.isReg()) {
1079
89.6k
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
1080
89.6k
        // FIXME: Probably shouldn't bother trying to fold if not an
1081
89.6k
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1082
89.6k
        // copies.
1083
89.6k
        MRI->clearKillFlags(Fold.OpToFold->getReg());
1084
89.6k
      }
1085
151k
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1086
151k
                        << static_cast<int>(Fold.UseOpNo) << " of "
1087
151k
                        << *Fold.UseMI << '\n');
1088
151k
      tryFoldInst(TII, Fold.UseMI);
1089
151k
    } else if (Fold.isCommuted()) {
1090
340
      // Restoring instruction's original operand order if fold has failed.
1091
340
      TII->commuteInstruction(*Fold.UseMI, false);
1092
340
    }
1093
151k
  }
1094
390k
}
1095
1096
// Clamp patterns are canonically selected to v_max_* instructions, so only
1097
// handle them.
1098
639k
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1099
639k
  unsigned Op = MI.getOpcode();
1100
639k
  switch (Op) {
1101
639k
  case AMDGPU::V_MAX_F32_e64:
1102
2.48k
  case AMDGPU::V_MAX_F16_e64:
1103
2.48k
  case AMDGPU::V_MAX_F64:
1104
2.48k
  case AMDGPU::V_PK_MAX_F16: {
1105
2.48k
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1106
2.11k
      return nullptr;
1107
366
1108
366
    // Make sure sources are identical.
1109
366
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1110
366
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1111
366
    if (!Src0->isReg() || !Src1->isReg() ||
1112
366
        Src0->getReg() != Src1->getReg() ||
1113
366
        Src0->getSubReg() != Src1->getSubReg() ||
1114
366
        Src0->getSubReg() != AMDGPU::NoSubRegister)
1115
4
      return nullptr;
1116
362
1117
362
    // Can't fold up if we have modifiers.
1118
362
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1119
1
      return nullptr;
1120
361
1121
361
    unsigned Src0Mods
1122
361
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1123
361
    unsigned Src1Mods
1124
361
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1125
361
1126
361
    // Having a 0 op_sel_hi would require swizzling the output in the source
1127
361
    // instruction, which we can't do.
1128
361
    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1129
361
                                                      : 0u;
1130
361
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1131
75
      return nullptr;
1132
286
    return Src0;
1133
286
  }
1134
636k
  default:
1135
636k
    return nullptr;
1136
639k
  }
1137
639k
}
1138
1139
// We obviously have multiple uses in a clamp since the register is used twice
1140
// in the same instruction.
1141
324
static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
1142
324
  int Count = 0;
1143
324
  for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
1144
648
       I != E; ++I) {
1145
345
    if (++Count > 1)
1146
21
      return false;
1147
345
  }
1148
324
1149
324
  return true;
1150
324
}
1151
1152
// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1153
639k
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1154
639k
  const MachineOperand *ClampSrc = isClamp(MI);
1155
639k
  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
1156
639k
    return false;
1157
269
1158
269
  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1159
269
1160
269
  // The type of clamp must be compatible.
1161
269
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1162
98
    return false;
1163
171
1164
171
  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1165
171
  if (!DefClamp)
1166
0
    return false;
1167
171
1168
171
  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
1169
171
                    << '\n');
1170
171
1171
171
  // Clamp is applied after omod, so it is OK if omod is set.
1172
171
  DefClamp->setImm(1);
1173
171
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1174
171
  MI.eraseFromParent();
1175
171
  return true;
1176
171
}
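// Maps a multiplier constant to its output-modifier encoding:
// 0.5 -> DIV2, 2.0 -> MUL2, 4.0 -> MUL4, anything else -> NONE.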
1177
1178
39
static int getOModValue(unsigned Opc, int64_t Val) {
1179
39
  switch (Opc) {
1180
39
  case AMDGPU::V_MUL_F32_e64: {
1181
38
    switch (static_cast<uint32_t>(Val)) {
1182
38
    case 0x3f000000: // 0.5
1183
20
      return SIOutMods::DIV2;
1184
38
    case 0x40000000: // 2.0
1185
2
      return SIOutMods::MUL2;
1186
38
    case 0x40800000: // 4.0
1187
8
      return SIOutMods::MUL4;
1188
38
    default:
1189
8
      return SIOutMods::NONE;
1190
0
    }
1191
0
  }
1192
1
  case AMDGPU::V_MUL_F16_e64: {
1193
1
    switch (static_cast<uint16_t>(Val)) {
1194
1
    case 0x3800: // 0.5
1195
1
      return SIOutMods::DIV2;
1196
1
    case 0x4000: // 2.0
1197
0
      return SIOutMods::MUL2;
1198
1
    case 0x4400: // 4.0
1199
0
      return SIOutMods::MUL4;
1200
1
    default:
1201
0
      return SIOutMods::NONE;
1202
0
    }
1203
0
  }
1204
0
  default:
1205
0
    llvm_unreachable("invalid mul opcode");
1206
39
  }
1207
39
}
1208
1209
// FIXME: Does this really not support denormals with f16?
1210
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1211
// handled, so will anything other than that break?
1212
std::pair<const MachineOperand *, int>
1213
732
SIFoldOperands::isOMod(const MachineInstr &MI) const {
1214
732
  unsigned Op = MI.getOpcode();
1215
732
  switch (Op) {
1216
732
  case AMDGPU::V_MUL_F32_e64:
1217
57
  case AMDGPU::V_MUL_F16_e64: {
1218
57
    // If output denormals are enabled, omod is ignored.
1219
57
    if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
1220
57
        (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
1221
6
      return std::make_pair(nullptr, SIOutMods::NONE);
1222
51
1223
51
    const MachineOperand *RegOp = nullptr;
1224
51
    const MachineOperand *ImmOp = nullptr;
1225
51
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1226
51
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1227
51
    if (Src0->isImm()) {
1228
0
      ImmOp = Src0;
1229
0
      RegOp = Src1;
1230
51
    } else if (Src1->isImm()) {
1231
39
      ImmOp = Src1;
1232
39
      RegOp = Src0;
1233
39
    } else
1234
12
      return std::make_pair(nullptr, SIOutMods::NONE);
1235
39
1236
39
    int OMod = getOModValue(Op, ImmOp->getImm());
1237
39
    if (OMod == SIOutMods::NONE ||
1238
39
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1239
39
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1240
39
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1241
39
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1242
12
      return std::make_pair(nullptr, SIOutMods::NONE);
1243
27
1244
27
    return std::make_pair(RegOp, OMod);
1245
27
  }
1246
101
  case AMDGPU::V_ADD_F32_e64:
1247
101
  case AMDGPU::V_ADD_F16_e64: {
1248
101
    // If output denormals are enabled, omod is ignored.
1249
101
    if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
1250
101
        (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
1251
18
      return std::make_pair(nullptr, SIOutMods::NONE);
1252
83
1253
83
    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1254
83
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1255
83
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1256
83
1257
83
    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1258
83
        Src0->getSubReg() == Src1->getSubReg() &&
1259
83
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1260
83
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1261
83
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1262
83
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1263
11
      return std::make_pair(Src0, SIOutMods::MUL2);
1264
72
1265
72
    return std::make_pair(nullptr, SIOutMods::NONE);
1266
72
  }
1267
574
  default:
1268
574
    return std::make_pair(nullptr, SIOutMods::NONE);
1269
732
  }
1270
732
}
1271
1272
// FIXME: Does this need to check IEEE bit on function?
1273
732
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1274
732
  const MachineOperand *RegOp;
1275
732
  int OMod;
1276
732
  std::tie(RegOp, OMod) = isOMod(MI);
1277
732
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1278
732
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1279
732
      !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
1280
698
    return false;
1281
34
1282
34
  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1283
34
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1284
34
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1285
6
    return false;
1286
28
1287
28
  // Clamp is applied after omod. If the source already has clamp set, don't
1288
28
  // fold it.
1289
28
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1290
8
    return false;
1291
20
1292
20
  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
1293
20
1294
20
  DefOMod->setImm(OMod);
1295
20
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1296
20
  MI.eraseFromParent();
1297
20
  return true;
1298
20
}
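// Pass entry point: visits blocks depth-first, folding the source operand of
// each foldable copy or move into its uses and trying clamp/omod folds on
// other instructions.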
1299
1300
50.5k
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
1301
50.5k
  if (skipFunction(MF.getFunction()))
1302
16
    return false;
1303
50.5k
1304
50.5k
  MRI = &MF.getRegInfo();
1305
50.5k
  ST = &MF.getSubtarget<GCNSubtarget>();
1306
50.5k
  TII = ST->getInstrInfo();
1307
50.5k
  TRI = &TII->getRegisterInfo();
1308
50.5k
  MFI = MF.getInfo<SIMachineFunctionInfo>();
1309
50.5k
1310
50.5k
  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
1311
50.5k
  // correctly handle signed zeros.
1312
50.5k
  //
1313
50.5k
  // FIXME: Also need to check strictfp
1314
50.5k
  bool IsIEEEMode = MFI->getMode().IEEE;
1315
50.5k
  bool HasNSZ = MFI->hasNoSignedZerosFPMath();
1316
50.5k
1317
57.3k
  for (MachineBasicBlock *MBB : depth_first(&MF)) {
1318
57.3k
    MachineBasicBlock::iterator I, Next;
1319
1.22M
    for (I = MBB->begin(); I != MBB->end(); I = Next) {
1320
1.16M
      Next = std::next(I);
1321
1.16M
      MachineInstr &MI = *I;
1322
1.16M
1323
1.16M
      tryFoldInst(TII, &MI);
1324
1.16M
1325
1.16M
      if (!TII->isFoldableCopy(MI)) {
1326
639k
        // TODO: Omod might be OK if there is NSZ only on the source
1327
639k
        // instruction, and not the omod multiply.
1328
639k
        if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
1329
639k
            !tryFoldOMod(MI))
1330
639k
          tryFoldClamp(MI);
1331
639k
        continue;
1332
639k
      }
1333
527k
1334
527k
      MachineOperand &OpToFold = MI.getOperand(1);
1335
527k
      bool FoldingImm =
1336
527k
          OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1337
527k
1338
527k
      // FIXME: We could also be folding things like TargetIndexes.
1339
527k
      if (!FoldingImm && !OpToFold.isReg())
1340
0
        continue;
1341
527k
1342
527k
      if (OpToFold.isReg() &&
1343
527k
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
1344
108k
        continue;
1345
419k
1346
419k
      // Prevent folding operands backwards in the function. For example,
1347
419k
      // the COPY opcode must not be replaced by 1 in this example:
1348
419k
      //
1349
419k
      //    %3 = COPY %vgpr0; VGPR_32:%3
1350
419k
      //    ...
1351
419k
      //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1352
419k
      MachineOperand &Dst = MI.getOperand(0);
1353
419k
      if (Dst.isReg() &&
1354
419k
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
1355
28.3k
        continue;
1356
390k
1357
390k
      foldInstOperand(MI, OpToFold);
1358
390k
    }
1359
57.3k
  }
1360
50.5k
  return false;
1361
50.5k
}