Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
Line
Count
Source
1
//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
/// The pass tries to use the 32-bit encoding for instructions when possible.
8
//===----------------------------------------------------------------------===//
9
//
10
11
#include "AMDGPU.h"
12
#include "AMDGPUSubtarget.h"
13
#include "SIInstrInfo.h"
14
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15
#include "llvm/ADT/Statistic.h"
16
#include "llvm/CodeGen/MachineFunctionPass.h"
17
#include "llvm/CodeGen/MachineInstrBuilder.h"
18
#include "llvm/CodeGen/MachineRegisterInfo.h"
19
#include "llvm/IR/Constants.h"
20
#include "llvm/IR/Function.h"
21
#include "llvm/IR/LLVMContext.h"
22
#include "llvm/Support/Debug.h"
23
#include "llvm/Support/raw_ostream.h"
24
#include "llvm/Target/TargetMachine.h"
25
26
#define DEBUG_TYPE "si-shrink-instructions"
27
28
STATISTIC(NumInstructionsShrunk,
29
          "Number of 64-bit instruction reduced to 32-bit.");
30
STATISTIC(NumLiteralConstantsFolded,
31
          "Number of literal constants folded into 32-bit instructions.");
32
33
using namespace llvm;
34
35
namespace {
36
37
class SIShrinkInstructions : public MachineFunctionPass {
38
public:
39
  static char ID;
40
41
  void shrinkMIMG(MachineInstr &MI);
42
43
public:
44
4.84k
  SIShrinkInstructions() : MachineFunctionPass(ID) {
45
4.84k
  }
46
47
  bool runOnMachineFunction(MachineFunction &MF) override;
48
49
55.5k
  StringRef getPassName() const override { return "SI Shrink Instructions"; }
50
51
4.79k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
52
4.79k
    AU.setPreservesCFG();
53
4.79k
    MachineFunctionPass::getAnalysisUsage(AU);
54
4.79k
  }
55
};
56
57
} // End anonymous namespace.
58
59
INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
60
                "SI Shrink Instructions", false, false)
61
62
char SIShrinkInstructions::ID = 0;
63
64
4.83k
FunctionPass *llvm::createSIShrinkInstructionsPass() {
65
4.83k
  return new SIShrinkInstructions();
66
4.83k
}
67
68
/// This function checks \p MI for operands defined by a move immediate
69
/// instruction and then folds the literal constant into the instruction if it
70
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
71
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
72
57.1k
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
73
57.1k
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
74
57.1k
75
57.1k
  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
76
57.1k
77
57.1k
  // Try to fold Src0
78
57.1k
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
79
57.1k
  if (Src0.isReg()) {
80
37.8k
    unsigned Reg = Src0.getReg();
81
37.8k
    if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
82
13.3k
      MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
83
13.3k
      if (Def && Def->isMoveImmediate()) {
84
1.00k
        MachineOperand &MovSrc = Def->getOperand(1);
85
1.00k
        bool ConstantFolded = false;
86
1.00k
87
1.00k
        if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
88
942
                               isUInt<32>(MovSrc.getImm()))) {
89
942
          // It's possible to have only one component of a super-reg defined by
90
942
          // a single mov, so we need to clear any subregister flag.
91
942
          Src0.setSubReg(0);
92
942
          Src0.ChangeToImmediate(MovSrc.getImm());
93
942
          ConstantFolded = true;
94
942
        } else if (MovSrc.isFI()) {
95
12
          Src0.setSubReg(0);
96
12
          Src0.ChangeToFrameIndex(MovSrc.getIndex());
97
12
          ConstantFolded = true;
98
46
        } else if (MovSrc.isGlobal()) {
99
30
          Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
100
30
                          MovSrc.getTargetFlags());
101
30
          ConstantFolded = true;
102
30
        }
103
1.00k
104
1.00k
        if (ConstantFolded) {
105
984
          assert(MRI.use_empty(Reg));
106
984
          Def->eraseFromParent();
107
984
          ++NumLiteralConstantsFolded;
108
984
          return true;
109
984
        }
110
56.1k
      }
111
13.3k
    }
112
37.8k
  }
113
56.1k
114
56.1k
  // We have failed to fold src0, so commute the instruction and try again.
115
56.1k
  if (TryToCommute && MI.isCommutable()) {
116
35.2k
    if (TII->commuteInstruction(MI)) {
117
9.54k
      if (foldImmediates(MI, TII, MRI, false))
118
7
        return true;
119
9.53k
120
9.53k
      // Commute back.
121
9.53k
      TII->commuteInstruction(MI);
122
9.53k
    }
123
35.2k
  }
124
56.1k
125
56.1k
  
  return false;
126
56.1k
}
127
128
28.5k
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
129
28.5k
  return isInt<16>(Src.getImm()) &&
130
28.5k
    !TII->isInlineConstant(*Src.getParent(),
131
16.3k
                           Src.getParent()->getOperandNo(&Src));
132
28.5k
}
133
134
40
static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
135
40
  return isUInt<16>(Src.getImm()) &&
136
40
    !TII->isInlineConstant(*Src.getParent(),
137
20
                           Src.getParent()->getOperandNo(&Src));
138
40
}
139
140
static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
141
                                 const MachineOperand &Src,
142
595
                                 bool &IsUnsigned) {
143
595
  if (isInt<16>(Src.getImm())) {
144
579
    IsUnsigned = false;
145
579
    return !TII->isInlineConstant(Src);
146
579
  }
147
16
148
16
  if (isUInt<16>(Src.getImm())) {
149
8
    IsUnsigned = true;
150
8
    return !TII->isInlineConstant(Src);
151
8
  }
152
8
153
8
  return false;
154
8
}
155
156
/// \returns true if the constant in \p Src should be replaced with a bitreverse
157
/// of an inline immediate.
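A minimal standalone sketch of this check, assuming a hand-rolled reverseBits32 as a stand-in for llvm::reverseBits<int32_t>: materializing the sign-bit constant 0x80000000 normally costs a 32-bit literal, but its bit reverse is 1, which lies in the inline-immediate range [-16, 64] tested here, so the pass can emit a V_BFREV_B32 of 1 instead and save 4 bytes.

#include <cassert>
#include <cstdint>

// Stand-in for llvm::reverseBits on a 32-bit value (bit 0 <-> bit 31, etc.).
static uint32_t reverseBits32(uint32_t V) {
  uint32_t R = 0;
  for (int i = 0; i < 32; ++i) {
    R = (R << 1) | (V & 1);
    V >>= 1;
  }
  return R;
}

int main() {
  int32_t Reversed = static_cast<int32_t>(reverseBits32(0x80000000u));
  // The sign-bit constant reverses to 1, an inline immediate.
  assert(Reversed == 1 && Reversed >= -16 && Reversed <= 64);
  return 0;
}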
158
static bool isReverseInlineImm(const SIInstrInfo *TII,
159
                               const MachineOperand &Src,
160
39.2k
                               int32_t &ReverseImm) {
161
39.2k
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
162
23.6k
    return false;
163
15.5k
164
15.5k
  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
165
15.5k
  return ReverseImm >= -16 && ReverseImm <= 64;
166
15.5k
}
167
168
/// Copy implicit register operands from specified instruction to this
169
/// instruction that are not part of the instruction definition.
170
static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
171
47.6k
                                 const MachineInstr &MI) {
172
47.6k
  for (unsigned i = MI.getDesc().getNumOperands() +
173
47.6k
         MI.getDesc().getNumImplicitUses() +
174
47.6k
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
175
47.6k
       i != e; ++i) {
176
0
    const MachineOperand &MO = MI.getOperand(i);
177
0
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
178
0
      NewMI.addOperand(MF, MO);
179
0
  }
180
47.6k
}
181
182
811
static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
183
811
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
184
811
  // get constants on the RHS.
185
811
  if (!MI.getOperand(0).isReg())
186
20
    TII->commuteInstruction(MI, false, 0, 1);
187
811
188
811
  const MachineOperand &Src1 = MI.getOperand(1);
189
811
  if (!Src1.isImm())
190
66
    return;
191
745
192
745
  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
193
745
  if (SOPKOpc == -1)
194
20
    return;
195
725
196
725
  // eq/ne is special because the imm16 can be treated as signed or unsigned,
197
725
  // and initially selected to the unsigned versions.
198
725
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
199
595
    bool HasUImm;
200
595
    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
201
34
      if (!HasUImm) {
202
26
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
203
18
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
204
26
      }
205
34
206
34
      MI.setDesc(TII->get(SOPKOpc));
207
34
    }
208
595
209
595
    return;
210
595
  }
211
130
212
130
  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);
213
130
214
130
  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
215
130
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
216
34
    MI.setDesc(NewDesc);
217
34
  }
218
130
}
219
220
// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
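The NSA (non-sequential address) encoding on gfx10 names each address VGPR individually; when those registers happen to be consecutive, the instruction can use the shorter default encoding that takes a single contiguous register tuple instead. A hedged sketch of the contiguity test the function performs, with plain integers standing in for hardware register indices:

#include <cassert>
#include <vector>

// True if every address register index equals base + i, i.e. the operands
// already form one contiguous VGPR tuple and the non-NSA encoding applies.
static bool isContiguous(const std::vector<unsigned> &VgprIdx) {
  if (VgprIdx.empty())
    return false;
  unsigned Base = VgprIdx[0];
  for (unsigned i = 0; i < VgprIdx.size(); ++i)
    if (VgprIdx[i] != Base + i)
      return false;
  return true;
}

int main() {
  assert(isContiguous({4, 5, 6}));   // e.g. v4, v5, v6 can become v[4:6]
  assert(!isContiguous({4, 6, 7}));  // a gap forces the NSA form to stay
  return 0;
}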
221
240
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
222
240
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
223
240
  if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
224
129
    return;
225
111
226
111
  MachineFunction *MF = MI.getParent()->getParent();
227
111
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
228
111
  const SIInstrInfo *TII = ST.getInstrInfo();
229
111
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
230
111
  int VAddr0Idx =
231
111
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
232
111
  unsigned NewAddrDwords = Info->VAddrDwords;
233
111
  const TargetRegisterClass *RC;
234
111
235
111
  if (Info->VAddrDwords == 2) {
236
0
    RC = &AMDGPU::VReg_64RegClass;
237
111
  } else if (Info->VAddrDwords == 3) {
238
63
    RC = &AMDGPU::VReg_96RegClass;
239
63
  } else if (Info->VAddrDwords == 4) {
240
25
    RC = &AMDGPU::VReg_128RegClass;
241
25
  } else if (Info->VAddrDwords <= 8) {
242
18
    RC = &AMDGPU::VReg_256RegClass;
243
18
    NewAddrDwords = 8;
244
18
  } else {
245
5
    RC = &AMDGPU::VReg_512RegClass;
246
5
    NewAddrDwords = 16;
247
5
  }
248
111
249
111
  unsigned VgprBase = 0;
250
111
  bool IsUndef = true;
251
111
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
252
518
  for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
253
417
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
254
417
    unsigned Vgpr = TRI.getHWRegIndex(Op.getReg());
255
417
256
417
    if (i == 0) {
257
111
      VgprBase = Vgpr;
258
306
    } else if (VgprBase + i != Vgpr)
259
10
      return;
260
407
261
407
    if (!Op.isUndef())
262
407
      IsUndef = false;
263
407
    if (!Op.isKill())
264
4
      IsKill = false;
265
407
  }
266
111
267
111
  
  if (VgprBase + NewAddrDwords > 256)
268
0
    return;
269
101
270
101
  // Further check for implicit tied operands - this may be present if TFE is
271
101
  // enabled
272
101
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
273
101
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
274
101
  unsigned TFEVal = MI.getOperand(TFEIdx).getImm();
275
101
  unsigned LWEVal = MI.getOperand(LWEIdx).getImm();
276
101
  int ToUntie = -1;
277
101
  if (TFEVal || LWEVal) {
278
12
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
279
48
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
280
36
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
281
36
          MI.getOperand(i).isImplicit()) {
282
12
        // This is the tied operand
283
12
        assert(
284
12
            ToUntie == -1 &&
285
12
            "found more than one tied implicit operand when expecting only 1");
286
12
        ToUntie = i;
287
12
        MI.untieRegOperand(ToUntie);
288
12
      }
289
36
    }
290
12
  }
291
101
292
101
  unsigned NewOpcode =
293
101
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
294
101
                            Info->VDataDwords, NewAddrDwords);
295
101
  MI.setDesc(TII->get(NewOpcode));
296
101
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
297
101
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
298
101
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);
299
101
300
388
  for (unsigned i = 1; i < Info->VAddrDwords; ++i)
301
287
    MI.RemoveOperand(VAddr0Idx + 1);
302
101
303
101
  if (ToUntie >= 0) {
304
12
    MI.tieOperands(
305
12
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
306
12
        ToUntie - (Info->VAddrDwords - 1));
307
12
  }
308
101
}
309
310
/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
311
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
312
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
313
/// XNOR (as a ^ b == ~(a ^ ~b)).
314
/// \returns true if the caller should continue the machine function iterator
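A small self-contained sketch of the S_BITSET0 case described above, using hand-rolled stand-ins for llvm::isPowerOf2_32 and countTrailingOnes: an AND whose mask clears exactly one bit (here 0xFFFFDFFF, bit 13) has a power-of-two inverse, so the operation can be rewritten as S_BITSET0_B32 with the bit index as a 16-bit immediate instead of carrying a 32-bit literal.

#include <cassert>
#include <cstdint>

static bool isPowerOf2(uint32_t V) { return V && !(V & (V - 1)); }

static unsigned countTrailingOnes(uint32_t V) {
  unsigned N = 0;
  while (V & 1) { ++N; V >>= 1; }
  return N;
}

int main() {
  uint32_t Imm = 0xFFFFDFFFu;            // non-inlinable AND mask
  assert(isPowerOf2(~Imm));              // exactly one bit is cleared...
  assert(countTrailingOnes(Imm) == 13);  // ...and it is bit 13
  // So "s_and_b32 s0, s0, 0xFFFFDFFF" can shrink to "s_bitset0_b32 s0, 13".
  return 0;
}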
315
static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
316
                                MachineRegisterInfo &MRI,
317
                                const SIInstrInfo *TII,
318
5.92k
                                MachineInstr &MI) {
319
5.92k
  unsigned Opc = MI.getOpcode();
320
5.92k
  const MachineOperand *Dest = &MI.getOperand(0);
321
5.92k
  MachineOperand *Src0 = &MI.getOperand(1);
322
5.92k
  MachineOperand *Src1 = &MI.getOperand(2);
323
5.92k
  MachineOperand *SrcReg = Src0;
324
5.92k
  MachineOperand *SrcImm = Src1;
325
5.92k
326
5.92k
  if (SrcImm->isImm() &&
327
5.92k
      
      !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
328
794
    uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
329
794
    uint32_t NewImm = 0;
330
794
331
794
    if (Opc == AMDGPU::S_AND_B32) {
332
644
      if (isPowerOf2_32(~Imm)) {
333
20
        NewImm = countTrailingOnes(Imm);
334
20
        Opc = AMDGPU::S_BITSET0_B32;
335
624
      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
336
4
        NewImm = ~Imm;
337
4
        Opc = AMDGPU::S_ANDN2_B32;
338
4
      }
339
644
    } else if (Opc == AMDGPU::S_OR_B32) {
340
106
      if (isPowerOf2_32(Imm)) {
341
46
        NewImm = countTrailingZeros(Imm);
342
46
        Opc = AMDGPU::S_BITSET1_B32;
343
60
      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
344
4
        NewImm = ~Imm;
345
4
        Opc = AMDGPU::S_ORN2_B32;
346
4
      }
347
106
    } else if (Opc == AMDGPU::S_XOR_B32) {
348
44
      if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
349
4
        NewImm = ~Imm;
350
4
        Opc = AMDGPU::S_XNOR_B32;
351
4
      }
352
44
    } else {
353
0
      llvm_unreachable("unexpected opcode");
354
0
    }
355
794
356
794
    if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
357
794
        SrcImm == Src0) {
358
0
      if (!TII->commuteInstruction(MI, false, 1, 2))
359
0
        NewImm = 0;
360
0
    }
361
794
362
794
    if (NewImm != 0) {
363
78
      if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
364
78
        
          SrcReg->isReg()) {
365
39
        MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
366
39
        MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
367
39
        return true;
368
39
      }
369
39
370
39
      if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
371
27
        MI.setDesc(TII->get(Opc));
372
27
        if (Opc == AMDGPU::S_BITSET0_B32 ||
373
27
            
            Opc == AMDGPU::S_BITSET1_B32) {
374
21
          Src0->ChangeToImmediate(NewImm);
375
21
          // Remove the immediate and add the tied input.
376
21
          MI.getOperand(2).ChangeToRegister(Dest->getReg(), false);
377
21
          MI.tieOperands(0, 2);
378
21
        } else {
379
6
          SrcImm->setImm(NewImm);
380
6
        }
381
27
      }
382
39
    }
383
794
  }
384
5.92k
385
5.92k
  
  return false;
386
5.92k
}
387
388
// This is the same as MachineInstr::readsRegister/modifiesRegister except
389
// it takes subregs into account.
390
static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
391
                          unsigned Reg, unsigned SubReg,
392
151k
                          const SIRegisterInfo &TRI) {
393
194k
  for (const MachineOperand &MO : R) {
394
194k
    if (!MO.isReg())
395
22.7k
      continue;
396
171k
397
171k
    if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
398
171k
        
        TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
399
39.3k
      if (TRI.regsOverlap(Reg, MO.getReg()))
400
523
        return true;
401
132k
    } else if (MO.getReg() == Reg &&
402
132k
               
               TargetRegisterInfo::isVirtualRegister(Reg)) {
403
3.69k
      LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
404
3.69k
                            TRI.getSubRegIndexLaneMask(MO.getSubReg());
405
3.69k
      if (Overlap.any())
406
50
        return true;
407
3.69k
    }
408
171k
  }
409
151k
  
  return false;
410
151k
}
411
412
static bool instReadsReg(const MachineInstr *MI,
413
                         unsigned Reg, unsigned SubReg,
414
60.6k
                         const SIRegisterInfo &TRI) {
415
60.6k
  return instAccessReg(MI->uses(), Reg, SubReg, TRI);
416
60.6k
}
417
418
static bool instModifiesReg(const MachineInstr *MI,
419
                            unsigned Reg, unsigned SubReg,
420
90.5k
                            const SIRegisterInfo &TRI) {
421
90.5k
  return instAccessReg(MI->defs(), Reg, SubReg, TRI);
422
90.5k
}
423
424
static TargetInstrInfo::RegSubRegPair
425
getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
426
42
                  const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
427
42
  if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
428
28
    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
429
4
      Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
430
24
    } else {
431
24
      LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
432
24
      Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger()));
433
24
    }
434
28
  }
435
42
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
436
42
}
437
438
// Match:
439
// mov t, x
440
// mov x, y
441
// mov y, t
442
//
443
// =>
444
//
445
// mov t, x (t is potentially dead and move eliminated)
446
// v_swap_b32 x, y
447
//
448
// Returns next valid instruction pointer if was able to create v_swap_b32.
449
//
450
// This should not be done too early, so as not to prevent possible folding which may
451
// remove matched moves, and it should preferably be done before RA to
452
// release saved registers and also possibly after RA which can insert copies
453
// too.
454
//
455
// This is really just a generic peephole that is not a canonical shrinking,
456
// although requirements match the pass placement and it reduces code size too.
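In plain C++ terms (a hedged illustration rather than MIR), the matched sequence is the classic three-assignment swap through a temporary; V_SWAP_B32 performs the same exchange in one instruction per 32-bit component, and the initial copy into t survives only if t has other uses.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 1, y = 2;
  // The pattern the peephole matches: mov t, x ; mov x, y ; mov y, t
  uint32_t t = x;
  x = y;
  y = t;
  // Net effect is a single exchange of x and y, which is what v_swap_b32
  // provides.
  assert(x == 2 && y == 1);
  return 0;
}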
457
static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
458
74.8k
                               const SIInstrInfo *TII) {
459
74.8k
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
460
74.8k
         MovT.getOpcode() == AMDGPU::COPY);
461
74.8k
462
74.8k
  unsigned T = MovT.getOperand(0).getReg();
463
74.8k
  unsigned Tsub = MovT.getOperand(0).getSubReg();
464
74.8k
  MachineOperand &Xop = MovT.getOperand(1);
465
74.8k
466
74.8k
  if (!Xop.isReg())
467
5.72k
    return nullptr;
468
69.1k
  unsigned X = Xop.getReg();
469
69.1k
  unsigned Xsub = Xop.getSubReg();
470
69.1k
471
69.1k
  unsigned Size = TII->getOpSize(MovT, 0) / 4;
472
69.1k
473
69.1k
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
474
69.1k
  if (!TRI.isVGPR(MRI, X))
475
51.6k
    return nullptr;
476
17.4k
477
22.3k
  
  for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) {
478
22.3k
    if (YTop.getSubReg() != Tsub)
479
1.40k
      continue;
480
20.9k
481
20.9k
    MachineInstr &MovY = *YTop.getParent();
482
20.9k
    if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 &&
483
20.9k
         MovY.getOpcode() != AMDGPU::COPY) ||
484
20.9k
        MovY.getOperand(1).getSubReg() != Tsub)
485
15.0k
      continue;
486
5.89k
487
5.89k
    unsigned Y = MovY.getOperand(0).getReg();
488
5.89k
    unsigned Ysub = MovY.getOperand(0).getSubReg();
489
5.89k
490
5.89k
    if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
491
329
      continue;
492
5.56k
493
5.56k
    MachineInstr *MovX = nullptr;
494
5.56k
    auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end();
495
35.5k
    for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) {
496
30.5k
      if (instReadsReg(&*I, X, Xsub, TRI) ||
497
30.5k
          instModifiesReg(&*I, Y, Ysub, TRI) ||
498
30.5k
          instModifiesReg(&*I, T, Tsub, TRI) ||
499
30.5k
          (MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
500
352
        MovX = nullptr;
501
352
        break;
502
352
      }
503
30.1k
      if (!instReadsReg(&*I, Y, Ysub, TRI)) {
504
29.9k
        if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) {
505
57
          MovX = nullptr;
506
57
          break;
507
57
        }
508
29.9k
        continue;
509
29.9k
      }
510
164
      if (MovX ||
511
164
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
512
164
           I->getOpcode() != AMDGPU::COPY) ||
513
164
          I->getOperand(0).getReg() != X ||
514
164
          I->getOperand(0).getSubReg() != Xsub) {
515
140
        MovX = nullptr;
516
140
        break;
517
140
      }
518
24
      MovX = &*I;
519
24
    }
520
5.56k
521
5.56k
    if (!MovX || I == E)
522
5.55k
      continue;
523
14
524
14
    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY);
525
14
526
35
    for (unsigned I = 0; I < Size; ++I) {
527
21
      TargetInstrInfo::RegSubRegPair X1, Y1;
528
21
      X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
529
21
      Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
530
21
      BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(),
531
21
                TII->get(AMDGPU::V_SWAP_B32))
532
21
        .addDef(X1.Reg, 0, X1.SubReg)
533
21
        .addDef(Y1.Reg, 0, Y1.SubReg)
534
21
        .addReg(Y1.Reg, 0, Y1.SubReg)
535
21
        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
536
21
    }
537
14
    MovX->eraseFromParent();
538
14
    MovY.eraseFromParent();
539
14
    MachineInstr *Next = &*std::next(MovT.getIterator());
540
14
    if (MRI.use_nodbg_empty(T))
541
12
      MovT.eraseFromParent();
542
2
    else
543
2
      Xop.setIsKill(false);
544
14
545
14
    return Next;
546
14
  }
547
17.4k
548
17.4k
  
  return nullptr;
549
17.4k
}
550
551
50.7k
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
552
50.7k
  if (skipFunction(MF.getFunction()))
553
21
    return false;
554
50.7k
555
50.7k
  MachineRegisterInfo &MRI = MF.getRegInfo();
556
50.7k
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
557
50.7k
  const SIInstrInfo *TII = ST.getInstrInfo();
558
50.7k
  unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
559
50.7k
560
50.7k
  std::vector<unsigned> I1Defs;
561
50.7k
562
50.7k
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
563
108k
                                                  BI != BE; ++BI) {
564
57.4k
565
57.4k
    MachineBasicBlock &MBB = *BI;
566
57.4k
    MachineBasicBlock::iterator I, Next;
567
1.01M
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
568
953k
      Next = std::next(I);
569
953k
      MachineInstr &MI = *I;
570
953k
571
953k
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
572
77.1k
        // If this has a literal constant source that is the same as the
573
77.1k
        // reversed bits of an inline immediate, replace with a bitreverse of
574
77.1k
        // that constant. This saves 4 bytes in the common case of materializing
575
77.1k
        // sign bits.
576
77.1k
577
77.1k
        // Test if we are after regalloc. We only want to do this after any
578
77.1k
        // optimizations happen because this will confuse them.
579
77.1k
        // XXX - not exactly a check for post-regalloc run.
580
77.1k
        MachineOperand &Src = MI.getOperand(1);
581
77.1k
        if (Src.isImm() &&
582
77.1k
            
            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
583
12.6k
          int32_t ReverseImm;
584
12.6k
          if (isReverseInlineImm(TII, Src, ReverseImm)) {
585
97
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
586
97
            Src.setImm(ReverseImm);
587
97
            continue;
588
97
          }
589
953k
        }
590
77.1k
      }
591
953k
592
953k
      if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
593
239k
                           MI.getOpcode() == AMDGPU::COPY)) {
594
74.8k
        if (auto *NextMI = matchSwap(MI, MRI, TII)) {
595
14
          Next = NextMI->getIterator();
596
14
          continue;
597
14
        }
598
953k
      }
599
953k
600
953k
      // Combine adjacent s_nops to use the immediate operand encoding how long
601
953k
      // to wait.
602
953k
      //
603
953k
      // s_nop N
604
953k
      // s_nop M
605
953k
      //  =>
606
953k
      // s_nop (N + M)
607
953k
      if (MI.getOpcode() == AMDGPU::S_NOP &&
608
953k
          Next != MBB.end() &&
609
953k
          (*Next).getOpcode() == AMDGPU::S_NOP) {
610
826
611
826
        MachineInstr &NextMI = *Next;
612
826
        // The instruction encodes the amount to wait with an offset of 1,
613
826
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
614
826
        // after adding.
615
826
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
616
826
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
617
826
618
826
        // Make sure we don't overflow the bounds.
619
826
        if (Nop0 + Nop1 <= 8) {
620
790
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
621
790
          MI.eraseFromParent();
622
790
        }
623
826
624
826
        continue;
625
826
      }
626
952k
627
952k
      // FIXME: We also need to consider movs of constant operands since
628
952k
      // immediate operands are not folded if they have more than one use, and
629
952k
      // the operand folding pass is unaware if the immediate will be free since
630
952k
      // it won't know if the src == dest constraint will end up being
631
952k
      // satisfied.
632
952k
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
633
952k
          
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
634
4.02k
        const MachineOperand *Dest = &MI.getOperand(0);
635
4.02k
        MachineOperand *Src0 = &MI.getOperand(1);
636
4.02k
        MachineOperand *Src1 = &MI.getOperand(2);
637
4.02k
638
4.02k
        if (!Src0->isReg() && Src1->isReg()) {
639
10
          if (TII->commuteInstruction(MI, false, 1, 2))
640
4
            std::swap(Src0, Src1);
641
10
        }
642
4.02k
643
4.02k
        // FIXME: This could work better if hints worked with subregisters. If
644
4.02k
        // we have a vector add of a constant, we usually don't get the correct
645
4.02k
        // allocation due to the subregister usage.
646
4.02k
        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
647
4.02k
            
            Src0->isReg()) {
648
1.97k
          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
649
1.97k
          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
650
1.97k
          continue;
651
1.97k
        }
652
2.04k
653
2.04k
        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
654
860
          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
655
91
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
656
83
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
657
91
658
91
            MI.setDesc(TII->get(Opc));
659
91
            MI.tieOperands(0, 1);
660
91
          }
661
860
        }
662
2.04k
      }
663
952k
664
952k
      // Try to use s_cmpk_*
665
952k
      
      if (MI.isCompare() && TII->isSOPC(MI)) {
666
811
        shrinkScalarCompare(TII, MI);
667
811
        continue;
668
811
      }
669
950k
670
950k
      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
671
950k
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
672
73.0k
        const MachineOperand &Dst = MI.getOperand(0);
673
73.0k
        MachineOperand &Src = MI.getOperand(1);
674
73.0k
675
73.0k
        if (Src.isImm() &&
676
73.0k
            
            TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
677
28.0k
          int32_t ReverseImm;
678
28.0k
          if (isKImmOperand(TII, Src))
679
1.48k
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
680
26.5k
          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
681
106
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
682
106
            Src.setImm(ReverseImm);
683
106
          }
684
28.0k
        }
685
73.0k
686
73.0k
        continue;
687
73.0k
      }
688
877k
689
877k
      // Shrink scalar logic operations.
690
877k
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
691
877k
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
692
877k
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
693
5.92k
        if (shrinkScalarLogicOp(ST, MRI, TII, MI))
694
39
          continue;
695
876k
      }
696
876k
697
876k
      if (TII->isMIMG(MI.getOpcode()) &&
698
876k
          
          ST.getGeneration() >= AMDGPUSubtarget::GFX10 &&
699
876k
          MF.getProperties().hasProperty(
700
480
              MachineFunctionProperties::Property::NoVRegs)) {
701
240
        shrinkMIMG(MI);
702
240
        continue;
703
240
      }
704
876k
705
876k
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
706
792k
        continue;
707
83.8k
708
83.8k
      if (!TII->canShrink(MI, MRI)) {
709
26.9k
        // Try commuting the instruction and see if that enables us to shrink
710
26.9k
        // it.
711
26.9k
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
712
26.9k
            !TII->canShrink(MI, MRI))
713
12.6k
          continue;
714
71.2k
      }
715
71.2k
716
71.2k
      // getVOPe32 could be -1 here if we started with an instruction that had
717
71.2k
      // a 32-bit encoding and then commuted it to an instruction that did not.
718
71.2k
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
719
0
        continue;
720
71.2k
721
71.2k
      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
722
71.2k
723
71.2k
      if (TII->isVOPC(Op32)) {
724
7.98k
        unsigned DstReg = MI.getOperand(0).getReg();
725
7.98k
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
726
3.96k
          // VOPC instructions can only write to the VCC register. We can't
727
3.96k
          // force them to use VCC here, because this is only one register and
728
3.96k
          // cannot deal with sequences which would require multiple copies of
729
3.96k
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
730
3.96k
          //
731
3.96k
          // So, instead of forcing the instruction to write to VCC, we provide
732
3.96k
          // a hint to the register allocator to use VCC and then we will run
733
3.96k
          // this pass again after RA and shrink it if it outputs to VCC.
734
3.96k
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
735
3.96k
          continue;
736
3.96k
        }
737
4.02k
        if (DstReg != VCCReg)
738
1.21k
          continue;
739
66.0k
      }
740
66.0k
741
66.0k
      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
742
7.91k
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
743
7.91k
        // instructions.
744
7.91k
        const MachineOperand *Src2 =
745
7.91k
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
746
7.91k
        if (!Src2->isReg())
747
0
          continue;
748
7.91k
        unsigned SReg = Src2->getReg();
749
7.91k
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
750
3.94k
          MRI.setRegAllocationHint(SReg, 0, VCCReg);
751
3.94k
          continue;
752
3.94k
        }
753
3.97k
        if (SReg != VCCReg)
754
967
          continue;
755
61.1k
      }
756
61.1k
757
61.1k
      // Check for the bool flag output for instructions like V_ADD_I32_e64.
758
61.1k
      const MachineOperand *SDst = TII->getNamedOperand(MI,
759
61.1k
                                                        AMDGPU::OpName::sdst);
760
61.1k
761
61.1k
      // Check the carry-in operand for v_addc_u32_e64.
762
61.1k
      const MachineOperand *Src2 = TII->getNamedOperand(MI,
763
61.1k
                                                        AMDGPU::OpName::src2);
764
61.1k
765
61.1k
      if (SDst) {
766
28.8k
        bool Next = false;
767
28.8k
768
28.8k
        if (SDst->getReg() != VCCReg) {
769
13.4k
          if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
770
13.0k
            MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg);
771
13.4k
          Next = true;
772
13.4k
        }
773
28.8k
774
28.8k
        // All of the instructions with carry outs also have an SGPR input in
775
28.8k
        // src2.
776
28.8k
        if (Src2 && Src2->getReg() != VCCReg) {
777
6.50k
          if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
778
6.24k
            MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
779
6.50k
          Next = true;
780
6.50k
        }
781
28.8k
782
28.8k
        if (Next)
783
13.5k
          continue;
784
47.6k
      }
785
47.6k
786
47.6k
      // We can shrink this instruction
787
47.6k
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);
788
47.6k
789
47.6k
      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
790
47.6k
      ++NumInstructionsShrunk;
791
47.6k
792
47.6k
      // Copy extra operands not present in the instruction definition.
793
47.6k
      copyExtraImplicitOps(*Inst32, MF, MI);
794
47.6k
795
47.6k
      MI.eraseFromParent();
796
47.6k
      foldImmediates(*Inst32, TII, MRI);
797
47.6k
798
47.6k
      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
799
47.6k
    }
800
57.4k
  }
801
50.7k
  return false;
802
50.7k
}