Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file This pass tries to apply several peephole SDWA patterns.
10
///
11
/// E.g. original:
12
///   V_LSHRREV_B32_e32 %0, 16, %1
13
///   V_ADD_I32_e32 %2, %0, %3
14
///   V_LSHLREV_B32_e32 %4, 16, %2
15
///
16
/// Replace:
17
///   V_ADD_I32_sdwa %4, %1, %3
18
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
19
///
20
//===----------------------------------------------------------------------===//
21
22
#include "AMDGPU.h"
23
#include "AMDGPUSubtarget.h"
24
#include "SIDefines.h"
25
#include "SIInstrInfo.h"
26
#include "SIRegisterInfo.h"
27
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
28
#include "Utils/AMDGPUBaseInfo.h"
29
#include "llvm/ADT/None.h"
30
#include "llvm/ADT/Optional.h"
31
#include "llvm/ADT/STLExtras.h"
32
#include "llvm/ADT/SmallVector.h"
33
#include "llvm/ADT/Statistic.h"
34
#include "llvm/CodeGen/MachineBasicBlock.h"
35
#include "llvm/CodeGen/MachineFunction.h"
36
#include "llvm/CodeGen/MachineFunctionPass.h"
37
#include "llvm/CodeGen/MachineInstr.h"
38
#include "llvm/CodeGen/MachineInstrBuilder.h"
39
#include "llvm/CodeGen/MachineOperand.h"
40
#include "llvm/CodeGen/MachineRegisterInfo.h"
41
#include "llvm/CodeGen/TargetRegisterInfo.h"
42
#include "llvm/Config/llvm-config.h"
43
#include "llvm/MC/LaneBitmask.h"
44
#include "llvm/MC/MCInstrDesc.h"
45
#include "llvm/Pass.h"
46
#include "llvm/Support/Debug.h"
47
#include "llvm/Support/raw_ostream.h"
48
#include <algorithm>
49
#include <cassert>
50
#include <cstdint>
51
#include <memory>
52
#include <unordered_map>
53
54
using namespace llvm;
55
56
#define DEBUG_TYPE "si-peephole-sdwa"
57
58
STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
59
STATISTIC(NumSDWAInstructionsPeepholed,
60
          "Number of instruction converted to SDWA.");
61
62
namespace {
63
64
class SDWAOperand;
65
class SDWADstOperand;
66
67
class SIPeepholeSDWA : public MachineFunctionPass {
68
public:
69
  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
70
71
private:
72
  MachineRegisterInfo *MRI;
73
  const SIRegisterInfo *TRI;
74
  const SIInstrInfo *TII;
75
76
  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
77
  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
78
  SmallVector<MachineInstr *, 8> ConvertedInstructions;
79
80
  Optional<int64_t> foldToImm(const MachineOperand &Op) const;
81
82
public:
83
  static char ID;
84
85
2.39k
  SIPeepholeSDWA() : MachineFunctionPass(ID) {
86
2.39k
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
87
2.39k
  }
88
89
  bool runOnMachineFunction(MachineFunction &MF) override;
90
  void matchSDWAOperands(MachineBasicBlock &MBB);
91
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
92
  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
93
  void pseudoOpConvertToVOP2(MachineInstr &MI,
94
                             const GCNSubtarget &ST) const;
95
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
96
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
97
98
27.5k
  StringRef getPassName() const override { return "SI Peephole SDWA"; }
99
100
2.37k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
101
2.37k
    AU.setPreservesCFG();
102
2.37k
    MachineFunctionPass::getAnalysisUsage(AU);
103
2.37k
  }
104
};
105
106
class SDWAOperand {
107
private:
108
  MachineOperand *Target; // Operand that would be used in converted instruction
109
  MachineOperand *Replaced; // Operand that would be replace by Target
110
111
public:
112
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
113
29.6k
      : Target(TargetOp), Replaced(ReplacedOp) {
114
29.6k
    assert(Target->isReg());
115
29.6k
    assert(Replaced->isReg());
116
29.6k
  }
117
118
29.6k
  virtual ~SDWAOperand() = default;
119
120
  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
121
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
122
123
4.99k
  MachineOperand *getTargetOperand() const { return Target; }
124
33.0k
  MachineOperand *getReplacedOperand() const { return Replaced; }
125
37.2k
  MachineInstr *getParentInst() const { return Target->getParent(); }
126
127
29.6k
  MachineRegisterInfo *getMRI() const {
128
29.6k
    return &getParentInst()->getParent()->getParent()->getRegInfo();
129
29.6k
  }
130
131
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
132
  virtual void print(raw_ostream& OS) const = 0;
133
  void dump() const { print(dbgs()); }
134
#endif
135
};
136
137
using namespace AMDGPU::SDWA;
138
139
class SDWASrcOperand : public SDWAOperand {
140
private:
141
  SdwaSel SrcSel;
142
  bool Abs;
143
  bool Neg;
144
  bool Sext;
145
146
public:
147
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
148
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
149
                 bool Sext_ = false)
150
      : SDWAOperand(TargetOp, ReplacedOp),
151
26.4k
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
152
153
  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
154
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
155
156
2.21k
  SdwaSel getSrcSel() const { return SrcSel; }
157
0
  bool getAbs() const { return Abs; }
158
0
  bool getNeg() const { return Neg; }
159
0
  bool getSext() const { return Sext; }
160
161
  uint64_t getSrcMods(const SIInstrInfo *TII,
162
                      const MachineOperand *SrcOp) const;
163
164
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
165
  void print(raw_ostream& OS) const override;
166
#endif
167
};
168
169
class SDWADstOperand : public SDWAOperand {
170
private:
171
  SdwaSel DstSel;
172
  DstUnused DstUn;
173
174
public:
175
176
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
177
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
178
3.17k
    : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
179
180
  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
181
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
182
183
604
  SdwaSel getDstSel() const { return DstSel; }
184
573
  DstUnused getDstUnused() const { return DstUn; }
185
186
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
187
  void print(raw_ostream& OS) const override;
188
#endif
189
};
190
191
class SDWADstPreserveOperand : public SDWADstOperand {
192
private:
193
  MachineOperand *Preserve;
194
195
public:
196
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
197
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
198
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
199
4
        Preserve(PreserveOp) {}
200
201
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
202
203
4
  MachineOperand *getPreservedOperand() const { return Preserve; }
204
205
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
206
  void print(raw_ostream& OS) const override;
207
#endif
208
};
209
210
} // end anonymous namespace
211
212
INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
213
214
char SIPeepholeSDWA::ID = 0;
215
216
char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
217
218
0
FunctionPass *llvm::createSIPeepholeSDWAPass() {
219
0
  return new SIPeepholeSDWA();
220
0
}
221
222
223
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
224
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
225
  switch(Sel) {
226
  case BYTE_0: OS << "BYTE_0"; break;
227
  case BYTE_1: OS << "BYTE_1"; break;
228
  case BYTE_2: OS << "BYTE_2"; break;
229
  case BYTE_3: OS << "BYTE_3"; break;
230
  case WORD_0: OS << "WORD_0"; break;
231
  case WORD_1: OS << "WORD_1"; break;
232
  case DWORD:  OS << "DWORD"; break;
233
  }
234
  return OS;
235
}
236
237
static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
238
  switch(Un) {
239
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
240
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
241
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
242
  }
243
  return OS;
244
}
245
246
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
247
  Operand.print(OS);
248
  return OS;
249
}
250
251
LLVM_DUMP_METHOD
252
void SDWASrcOperand::print(raw_ostream& OS) const {
253
  OS << "SDWA src: " << *getTargetOperand()
254
    << " src_sel:" << getSrcSel()
255
    << " abs:" << getAbs() << " neg:" << getNeg()
256
    << " sext:" << getSext() << '\n';
257
}
258
259
LLVM_DUMP_METHOD
260
void SDWADstOperand::print(raw_ostream& OS) const {
261
  OS << "SDWA dst: " << *getTargetOperand()
262
    << " dst_sel:" << getDstSel()
263
    << " dst_unused:" << getDstUnused() << '\n';
264
}
265
266
LLVM_DUMP_METHOD
267
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
268
  OS << "SDWA preserve dst: " << *getTargetOperand()
269
    << " dst_sel:" << getDstSel()
270
    << " preserve:" << *getPreservedOperand() << '\n';
271
}
272
273
#endif
274
275
2.78k
static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
276
2.78k
  assert(To.isReg() && From.isReg());
277
2.78k
  To.setReg(From.getReg());
278
2.78k
  To.setSubReg(From.getSubReg());
279
2.78k
  To.setIsUndef(From.isUndef());
280
2.78k
  if (To.isUse()) {
281
2.20k
    To.setIsKill(From.isKill());
282
2.20k
  } else {
283
573
    To.setIsDead(From.isDead());
284
573
  }
285
2.78k
}
286
287
43.1k
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
288
43.1k
  return LHS.isReg() &&
289
43.1k
         
RHS.isReg()42.6k
&&
290
43.1k
         
LHS.getReg() == RHS.getReg()42.6k
&&
291
43.1k
         
LHS.getSubReg() == RHS.getSubReg()41.8k
;
292
43.1k
}
293
294
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
295
26.4k
                                        const MachineRegisterInfo *MRI) {
296
26.4k
  if (!Reg->isReg() || !Reg->isDef())
297
0
    return nullptr;
298
26.4k
299
26.4k
  MachineOperand *ResMO = nullptr;
300
26.4k
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
301
24.0k
    // If there exist use of subreg of Reg then return nullptr
302
24.0k
    if (!isSameReg(UseMO, *Reg))
303
0
      return nullptr;
304
24.0k
305
24.0k
    // Check that there is only one instruction that uses Reg
306
24.0k
    if (!ResMO) {
307
22.0k
      ResMO = &UseMO;
308
22.0k
    } else 
if (2.01k
ResMO->getParent() != UseMO.getParent()2.01k
) {
309
1.82k
      return nullptr;
310
1.82k
    }
311
24.0k
  }
312
26.4k
313
26.4k
  
return ResMO24.6k
;
314
26.4k
}
315
316
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
317
13.4k
                                        const MachineRegisterInfo *MRI) {
318
13.4k
  if (!Reg->isReg())
319
0
    return nullptr;
320
13.4k
321
13.4k
  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
322
13.4k
  if (!DefInstr)
323
12
    return nullptr;
324
13.4k
325
13.4k
  for (auto &DefMO : DefInstr->defs()) {
326
13.4k
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
327
13.4k
      return &DefMO;
328
13.4k
  }
329
13.4k
330
13.4k
  // Ignore implicit defs.
331
13.4k
  
return nullptr6
;
332
13.4k
}
333
334
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
335
2.20k
                                    const MachineOperand *SrcOp) const {
336
2.20k
  uint64_t Mods = 0;
337
2.20k
  const auto *MI = SrcOp->getParent();
338
2.20k
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
339
1.05k
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
340
1.05k
      Mods = Mod->getImm();
341
1.05k
    }
342
1.15k
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
343
1.15k
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
344
1.15k
      Mods = Mod->getImm();
345
1.15k
    }
346
1.15k
  }
347
2.20k
  if (Abs || Neg) {
348
0
    assert(!Sext &&
349
0
           "Float and integer src modifiers can't be set simulteniously");
350
0
    Mods |= Abs ? SISrcMods::ABS : 0u;
351
0
    Mods ^= Neg ? SISrcMods::NEG : 0u;
352
2.20k
  } else if (Sext) {
353
633
    Mods |= SISrcMods::SEXT;
354
633
  }
355
2.20k
356
2.20k
  return Mods;
357
2.20k
}
358
359
26.4k
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
360
26.4k
  // For SDWA src operand potential instruction is one that use register
361
26.4k
  // defined by parent instruction
362
26.4k
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
363
26.4k
  if (!PotentialMO)
364
6.27k
    return nullptr;
365
20.1k
366
20.1k
  return PotentialMO->getParent();
367
20.1k
}
368
369
2.23k
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
370
2.23k
  // Find operand in instruction that matches source operand and replace it with
371
2.23k
  // target operand. Set corresponding src_sel
372
2.23k
  bool IsPreserveSrc = false;
373
2.23k
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
374
2.23k
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
375
2.23k
  MachineOperand *SrcMods =
376
2.23k
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
377
2.23k
  assert(Src && (Src->isReg() || Src->isImm()));
378
2.23k
  if (!isSameReg(*Src, *getReplacedOperand())) {
379
1.17k
    // If this is not src0 then it could be src1
380
1.17k
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
381
1.17k
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
382
1.17k
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
383
1.17k
384
1.17k
    if (!Src ||
385
1.17k
        
!isSameReg(*Src, *getReplacedOperand())1.17k
) {
386
23
      // It's possible this Src is a tied operand for
387
23
      // UNUSED_PRESERVE, in which case we can either
388
23
      // abandon the peephole attempt, or if legal we can
389
23
      // copy the target operand into the tied slot
390
23
      // if the preserve operation will effectively cause the same
391
23
      // result by overwriting the rest of the dst.
392
23
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
393
23
      MachineOperand *DstUnused =
394
23
        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
395
23
396
23
      if (Dst &&
397
23
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
398
4
        // This will work if the tied src is acessing WORD_0, and the dst is
399
4
        // writing WORD_1. Modifiers don't matter because all the bits that
400
4
        // would be impacted are being overwritten by the dst.
401
4
        // Any other case will not work.
402
4
        SdwaSel DstSel = static_cast<SdwaSel>(
403
4
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
404
4
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
405
4
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
406
2
          IsPreserveSrc = true;
407
2
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
408
2
                                                   AMDGPU::OpName::vdst);
409
2
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
410
2
          Src = &MI.getOperand(TiedIdx);
411
2
          SrcSel = nullptr;
412
2
          SrcMods = nullptr;
413
2
        } else {
414
2
          // Not legal to convert this src
415
2
          return false;
416
2
        }
417
1.17k
      }
418
23
    }
419
1.17k
    assert(Src && Src->isReg());
420
1.17k
421
1.17k
    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
422
1.17k
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
423
1.17k
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
424
1.17k
         
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa1.15k
) &&
425
1.17k
         
!isSameReg(*Src, *getReplacedOperand())31
) {
426
19
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
427
19
      // src2. This is not allowed.
428
19
      return false;
429
19
    }
430
1.15k
431
1.15k
    assert(isSameReg(*Src, *getReplacedOperand()) &&
432
1.15k
           (IsPreserveSrc || (SrcSel && SrcMods)));
433
1.15k
  }
434
2.23k
  copyRegOperand(*Src, *getTargetOperand());
435
2.20k
  if (!IsPreserveSrc) {
436
2.20k
    SrcSel->setImm(getSrcSel());
437
2.20k
    SrcMods->setImm(getSrcMods(TII, Src));
438
2.20k
  }
439
2.20k
  getTargetOperand()->setIsKill(false);
440
2.20k
  return true;
441
2.23k
}
442
443
3.17k
MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
444
3.17k
  // For SDWA dst operand potential instruction is one that defines register
445
3.17k
  // that this operand uses
446
3.17k
  MachineRegisterInfo *MRI = getMRI();
447
3.17k
  MachineInstr *ParentMI = getParentInst();
448
3.17k
449
3.17k
  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
450
3.17k
  if (!PotentialMO)
451
12
    return nullptr;
452
3.16k
453
3.16k
  // Check that ParentMI is the only instruction that uses replaced register
454
3.30k
  
for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg()))3.16k
{
455
3.30k
    if (&UseInst != ParentMI)
456
204
      return nullptr;
457
3.30k
  }
458
3.16k
459
3.16k
  
return PotentialMO->getParent()2.96k
;
460
3.16k
}
461
462
604
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
463
604
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
464
604
465
604
  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
466
604
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
467
604
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
468
604
       
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa582
) &&
469
604
      
getDstSel() != AMDGPU::SDWA::DWORD31
) {
470
31
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
471
31
    return false;
472
31
  }
473
573
474
573
  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
475
573
  assert(Operand &&
476
573
         Operand->isReg() &&
477
573
         isSameReg(*Operand, *getReplacedOperand()));
478
573
  copyRegOperand(*Operand, *getTargetOperand());
479
573
  MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
480
573
  assert(DstSel);
481
573
  DstSel->setImm(getDstSel());
482
573
  MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
483
573
  assert(DstUnused);
484
573
  DstUnused->setImm(getDstUnused());
485
573
486
573
  // Remove original instruction  because it would conflict with our new
487
573
  // instruction by register definition
488
573
  getParentInst()->eraseFromParent();
489
573
  return true;
490
573
}
491
492
bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
493
2
                                           const SIInstrInfo *TII) {
494
2
  // MI should be moved right before v_or_b32.
495
2
  // For this we should clear all kill flags on uses of MI src-operands or else
496
2
  // we can encounter problem with use of killed operand.
497
22
  for (MachineOperand &MO : MI.uses()) {
498
22
    if (!MO.isReg())
499
16
      continue;
500
6
    getMRI()->clearKillFlags(MO.getReg());
501
6
  }
502
2
503
2
  // Move MI before v_or_b32
504
2
  auto MBB = MI.getParent();
505
2
  MBB->remove(&MI);
506
2
  MBB->insert(getParentInst(), &MI);
507
2
508
2
  // Add Implicit use of preserved register
509
2
  MachineInstrBuilder MIB(*MBB->getParent(), MI);
510
2
  MIB.addReg(getPreservedOperand()->getReg(),
511
2
             RegState::ImplicitKill,
512
2
             getPreservedOperand()->getSubReg());
513
2
514
2
  // Tie dst to implicit use
515
2
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
516
2
                 MI.getNumOperands() - 1);
517
2
518
2
  // Convert MI as any other SDWADstOperand and remove v_or_b32
519
2
  return SDWADstOperand::convertToSDWA(MI, TII);
520
2
}
521
522
69.8k
Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
523
69.8k
  if (Op.isImm()) {
524
54.1k
    return Op.getImm();
525
54.1k
  }
526
15.6k
527
15.6k
  // If this is not immediate then it can be copy of immediate value, e.g.:
528
15.6k
  // %1 = S_MOV_B32 255;
529
15.6k
  if (Op.isReg()) {
530
15.6k
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
531
15.6k
      if (!isSameReg(Op, Def))
532
2.39k
        continue;
533
13.2k
534
13.2k
      const MachineInstr *DefInst = Def.getParent();
535
13.2k
      if (!TII->isFoldableCopy(*DefInst))
536
5.32k
        return None;
537
7.90k
538
7.90k
      const MachineOperand &Copied = DefInst->getOperand(1);
539
7.90k
      if (!Copied.isImm())
540
462
        return None;
541
7.44k
542
7.44k
      return Copied.getImm();
543
7.44k
    }
544
15.6k
  }
545
15.6k
546
15.6k
  
return None2.39k
;
547
15.6k
}
548
549
std::unique_ptr<SDWAOperand>
550
694k
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
551
694k
  unsigned Opcode = MI.getOpcode();
552
694k
  switch (Opcode) {
553
694k
  case AMDGPU::V_LSHRREV_B32_e32:
554
22.4k
  case AMDGPU::V_ASHRREV_I32_e32:
555
22.4k
  case AMDGPU::V_LSHLREV_B32_e32:
556
22.4k
  case AMDGPU::V_LSHRREV_B32_e64:
557
22.4k
  case AMDGPU::V_ASHRREV_I32_e64:
558
22.4k
  case AMDGPU::V_LSHLREV_B32_e64: {
559
22.4k
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
560
22.4k
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3
561
22.4k
562
22.4k
    // from: v_ashrrev_i32_e32 v1, 16/24, v0
563
22.4k
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
564
22.4k
565
22.4k
    // from: v_lshlrev_b32_e32 v1, 16/24, v0
566
22.4k
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
567
22.4k
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
568
22.4k
    auto Imm = foldToImm(*Src0);
569
22.4k
    if (!Imm)
570
166
      break;
571
22.2k
572
22.2k
    if (*Imm != 16 && 
*Imm != 2411.3k
)
573
9.79k
      break;
574
12.4k
575
12.4k
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
576
12.4k
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
577
12.4k
    if (TRI->isPhysicalRegister(Src1->getReg()) ||
578
12.4k
        TRI->isPhysicalRegister(Dst->getReg()))
579
0
      break;
580
12.4k
581
12.4k
    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
582
12.4k
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
583
2.66k
      return make_unique<SDWADstOperand>(
584
2.66k
          Dst, Src1, *Imm == 16 ? 
WORD_12.64k
:
BYTE_324
, UNUSED_PAD);
585
9.80k
    } else {
586
9.80k
      return make_unique<SDWASrcOperand>(
587
9.80k
          Src1, Dst, *Imm == 16 ? 
WORD_18.26k
:
BYTE_31.53k
, false, false,
588
9.80k
          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
589
9.80k
          
Opcode != AMDGPU::V_LSHRREV_B32_e649.79k
);
590
9.80k
    }
591
0
    break;
592
0
  }
593
0
594
8.30k
  case AMDGPU::V_LSHRREV_B16_e32:
595
8.30k
  case AMDGPU::V_ASHRREV_I16_e32:
596
8.30k
  case AMDGPU::V_LSHLREV_B16_e32:
597
8.30k
  case AMDGPU::V_LSHRREV_B16_e64:
598
8.30k
  case AMDGPU::V_ASHRREV_I16_e64:
599
8.30k
  case AMDGPU::V_LSHLREV_B16_e64: {
600
8.30k
    // from: v_lshrrev_b16_e32 v1, 8, v0
601
8.30k
    // to SDWA src:v0 src_sel:BYTE_1
602
8.30k
603
8.30k
    // from: v_ashrrev_i16_e32 v1, 8, v0
604
8.30k
    // to SDWA src:v0 src_sel:BYTE_1 sext:1
605
8.30k
606
8.30k
    // from: v_lshlrev_b16_e32 v1, 8, v0
607
8.30k
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
608
8.30k
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
609
8.30k
    auto Imm = foldToImm(*Src0);
610
8.30k
    if (!Imm || 
*Imm != 88.17k
)
611
5.11k
      break;
612
3.19k
613
3.19k
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
614
3.19k
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
615
3.19k
616
3.19k
    if (TRI->isPhysicalRegister(Src1->getReg()) ||
617
3.19k
        TRI->isPhysicalRegister(Dst->getReg()))
618
0
      break;
619
3.19k
620
3.19k
    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
621
3.19k
        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
622
504
      return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
623
2.68k
    } else {
624
2.68k
      return make_unique<SDWASrcOperand>(
625
2.68k
            Src1, Dst, BYTE_1, false, false,
626
2.68k
            Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
627
2.68k
            Opcode != AMDGPU::V_LSHRREV_B16_e64);
628
2.68k
    }
629
0
    break;
630
0
  }
631
0
632
10.1k
  case AMDGPU::V_BFE_I32:
633
10.1k
  case AMDGPU::V_BFE_U32: {
634
10.1k
    // e.g.:
635
10.1k
    // from: v_bfe_u32 v1, v0, 8, 8
636
10.1k
    // to SDWA src:v0 src_sel:BYTE_1
637
10.1k
638
10.1k
    // offset | width | src_sel
639
10.1k
    // ------------------------
640
10.1k
    // 0      | 8     | BYTE_0
641
10.1k
    // 0      | 16    | WORD_0
642
10.1k
    // 0      | 32    | DWORD ?
643
10.1k
    // 8      | 8     | BYTE_1
644
10.1k
    // 16     | 8     | BYTE_2
645
10.1k
    // 16     | 16    | WORD_1
646
10.1k
    // 24     | 8     | BYTE_3
647
10.1k
648
10.1k
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
649
10.1k
    auto Offset = foldToImm(*Src1);
650
10.1k
    if (!Offset)
651
26
      break;
652
10.1k
653
10.1k
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
654
10.1k
    auto Width = foldToImm(*Src2);
655
10.1k
    if (!Width)
656
30
      break;
657
10.1k
658
10.1k
    SdwaSel SrcSel = DWORD;
659
10.1k
660
10.1k
    if (*Offset == 0 && 
*Width == 88.53k
)
661
2.07k
      SrcSel = BYTE_0;
662
8.05k
    else if (*Offset == 0 && 
*Width == 166.46k
)
663
3.58k
      SrcSel = WORD_0;
664
4.47k
    else if (*Offset == 0 && 
*Width == 322.87k
)
665
0
      SrcSel = DWORD;
666
4.47k
    else if (*Offset == 8 && 
*Width == 8356
)
667
266
      SrcSel = BYTE_1;
668
4.20k
    else if (*Offset == 16 && 
*Width == 81.07k
)
669
1.02k
      SrcSel = BYTE_2;
670
3.18k
    else if (*Offset == 16 && 
*Width == 1658
)
671
4
      SrcSel = WORD_1;
672
3.18k
    else if (*Offset == 24 && 
*Width == 856
)
673
0
      SrcSel = BYTE_3;
674
3.18k
    else
675
3.18k
      break;
676
6.94k
677
6.94k
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
678
6.94k
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
679
6.94k
680
6.94k
    if (TRI->isPhysicalRegister(Src0->getReg()) ||
681
6.94k
        TRI->isPhysicalRegister(Dst->getReg()))
682
0
      break;
683
6.94k
684
6.94k
    return make_unique<SDWASrcOperand>(
685
6.94k
          Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
686
6.94k
  }
687
6.94k
688
11.1k
  case AMDGPU::V_AND_B32_e32:
689
11.1k
  case AMDGPU::V_AND_B32_e64: {
690
11.1k
    // e.g.:
691
11.1k
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
692
11.1k
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0
693
11.1k
694
11.1k
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
695
11.1k
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
696
11.1k
    auto ValSrc = Src1;
697
11.1k
    auto Imm = foldToImm(*Src0);
698
11.1k
699
11.1k
    if (!Imm) {
700
7.53k
      Imm = foldToImm(*Src1);
701
7.53k
      ValSrc = Src0;
702
7.53k
    }
703
11.1k
704
11.1k
    if (!Imm || 
(10.9k
*Imm != 0x0000ffff10.9k
&&
*Imm != 0x000000ff5.99k
))
705
4.16k
      break;
706
7.03k
707
7.03k
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
708
7.03k
709
7.03k
    if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
710
7.03k
        TRI->isPhysicalRegister(Dst->getReg()))
711
0
      break;
712
7.03k
713
7.03k
    return make_unique<SDWASrcOperand>(
714
7.03k
        ValSrc, Dst, *Imm == 0x0000ffff ? 
WORD_04.91k
:
BYTE_02.12k
);
715
7.03k
  }
716
7.03k
717
7.03k
  case AMDGPU::V_OR_B32_e32:
718
5.34k
  case AMDGPU::V_OR_B32_e64: {
719
5.34k
    // Patterns for dst_unused:UNUSED_PRESERVE.
720
5.34k
    // e.g., from:
721
5.34k
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
722
5.34k
    //                           src1_sel:WORD_1 src2_sel:WORD1
723
5.34k
    // v_add_f16_e32 v3, v1, v2
724
5.34k
    // v_or_b32_e32 v4, v0, v3
725
5.34k
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
726
5.34k
727
5.34k
    // Check if one of operands of v_or_b32 is SDWA instruction
728
5.34k
    using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
729
5.34k
    auto CheckOROperandsForSDWA =
730
10.4k
      [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
731
10.4k
        if (!Op1 || !Op1->isReg() || 
!Op29.93k
||
!Op2->isReg()9.93k
)
732
984
          return CheckRetType(None);
733
9.44k
734
9.44k
        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
735
9.44k
        if (!Op1Def)
736
6
          return CheckRetType(None);
737
9.44k
738
9.44k
        MachineInstr *Op1Inst = Op1Def->getParent();
739
9.44k
        if (!TII->isSDWA(*Op1Inst))
740
8.58k
          return CheckRetType(None);
741
856
742
856
        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
743
856
        if (!Op2Def)
744
0
          return CheckRetType(None);
745
856
746
856
        return CheckRetType(std::make_pair(Op1Def, Op2Def));
747
856
      };
748
5.34k
749
5.34k
    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
750
5.34k
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
751
5.34k
    assert(OrSDWA && OrOther);
752
5.34k
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
753
5.34k
    if (!Res) {
754
5.08k
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
755
5.08k
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
756
5.08k
      assert(OrSDWA && OrOther);
757
5.08k
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
758
5.08k
      if (!Res)
759
4.49k
        break;
760
856
    }
761
856
762
856
    MachineOperand *OrSDWADef = Res->first;
763
856
    MachineOperand *OrOtherDef = Res->second;
764
856
    assert(OrSDWADef && OrOtherDef);
765
856
766
856
    MachineInstr *SDWAInst = OrSDWADef->getParent();
767
856
    MachineInstr *OtherInst = OrOtherDef->getParent();
768
856
769
856
    // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
770
856
    // destination patterns don't overlap. Compatible instruction can be either
771
856
    // regular instruction with compatible bitness or SDWA instruction with
772
856
    // correct dst_sel
773
856
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
774
856
    // -----------------------------------------------------
775
856
    // DWORD    | no                    / no
776
856
    // WORD_0   | no                    / BYTE_2/3, WORD_1
777
856
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
778
856
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
779
856
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
780
856
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3. WORD_0
781
856
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
782
856
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
783
856
    // but v_add_f32 is not.
784
856
785
856
    // TODO: add support for non-SDWA instructions as OtherInst.
786
856
    // For now this only works with SDWA instructions. For regular instructions
787
856
    // there is no way to determine if the instruction writes only 8/16/24-bit
788
856
    // out of full register size and all registers are at min 32-bit wide.
789
856
    if (!TII->isSDWA(*OtherInst))
790
846
      break;
791
10
792
10
    SdwaSel DstSel = static_cast<SdwaSel>(
793
10
      TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));;
794
10
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
795
10
      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
796
10
797
10
    bool DstSelAgree = false;
798
10
    switch (DstSel) {
799
10
    
case WORD_0: DstSelAgree = (0
(OtherDstSel == BYTE_2)0
||
800
0
                                (OtherDstSel == BYTE_3) ||
801
0
                                (OtherDstSel == WORD_1));
802
0
      break;
803
10
    
case WORD_1: DstSelAgree = (0
(OtherDstSel == BYTE_0)0
||
804
0
                                (OtherDstSel == BYTE_1) ||
805
0
                                (OtherDstSel == WORD_0));
806
0
      break;
807
10
    
case BYTE_0: DstSelAgree = (0
(OtherDstSel == BYTE_1)0
||
808
0
                                (OtherDstSel == BYTE_2) ||
809
0
                                (OtherDstSel == BYTE_3) ||
810
0
                                (OtherDstSel == WORD_1));
811
0
      break;
812
10
    
case BYTE_1: DstSelAgree = (4
(OtherDstSel == BYTE_0)4
||
813
4
                                (OtherDstSel == BYTE_2) ||
814
4
                                (OtherDstSel == BYTE_3) ||
815
4
                                (OtherDstSel == WORD_1));
816
4
      break;
817
10
    
case BYTE_2: DstSelAgree = (0
(OtherDstSel == BYTE_0)0
||
818
0
                                (OtherDstSel == BYTE_1) ||
819
0
                                (OtherDstSel == BYTE_3) ||
820
0
                                (OtherDstSel == WORD_0));
821
0
      break;
822
10
    
case BYTE_3: DstSelAgree = (0
(OtherDstSel == BYTE_0)0
||
823
0
                                (OtherDstSel == BYTE_1) ||
824
0
                                (OtherDstSel == BYTE_2) ||
825
0
                                (OtherDstSel == WORD_0));
826
0
      break;
827
10
    
default: DstSelAgree = false6
;
828
10
    }
829
10
830
10
    if (!DstSelAgree)
831
6
      break;
832
4
833
4
    // Also OtherInst dst_unused should be UNUSED_PAD
834
4
    DstUnused OtherDstUnused = static_cast<DstUnused>(
835
4
      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
836
4
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
837
0
      break;
838
4
839
4
    // Create DstPreserveOperand
840
4
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
841
4
    assert(OrDst && OrDst->isReg());
842
4
843
4
    return make_unique<SDWADstPreserveOperand>(
844
4
      OrDst, OrSDWADef, OrOtherDef, DstSel);
845
4
846
4
  }
847
664k
  }
848
664k
849
664k
  return std::unique_ptr<SDWAOperand>(nullptr);
850
664k
}
851
852
36.2k
// Walk every instruction of \p MBB and record those matching one of the
// SDWA source/destination patterns in the SDWAOperands map.
void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    auto Operand = matchSDWAOperand(MI);
    if (!Operand)
      continue;
    LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
    SDWAOperands[&MI] = std::move(Operand);
    ++NumSDWAPatternsFound;
  }
}
861
862
// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA
// to perform its transformation on V_ADD_I32_e32 into V_ADD_I32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_I32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//  %48:vgpr_32 = V_ADDC_U32_e32
//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
         "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction: the carry-out (sdst) of MI must have
  // exactly one register use, which is the matching ADDC/SUBB.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();
  // Can the successor be shrunk?
  if (!TII->canShrink(MISucc, *MRI))
    return;
  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
  // Make sure the carry in/out are subsequently unused: the e32 forms route
  // the carry through VCC, so the explicit carry registers must die here.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI, since the e32 forms
  // implicitly define $vcc (see the transformation example above).
  // The 25 bounds how many neighboring instructions the liveness query scans.
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in the open range (MI,MISucc); a clobber in
  // between would corrupt the carry passed through VCC.
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }
  // Make the two new e32 instruction variants.
  // Replace MI with V_{SUB|ADD}_I32_e32
  auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
  MI.eraseFromParent();
  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
  auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
  MISucc.eraseFromParent();
}
936
937
bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
938
11.5k
                                         const GCNSubtarget &ST) const {
939
11.5k
  // Check if this is already an SDWA instruction
940
11.5k
  unsigned Opc = MI.getOpcode();
941
11.5k
  if (TII->isSDWA(Opc))
942
89
    return true;
943
11.4k
944
11.4k
  // Check if this instruction has opcode that supports SDWA
945
11.4k
  if (AMDGPU::getSDWAOp(Opc) == -1)
946
10.7k
    Opc = AMDGPU::getVOPe32(Opc);
947
11.4k
948
11.4k
  if (AMDGPU::getSDWAOp(Opc) == -1)
949
7.23k
    return false;
950
4.24k
951
4.24k
  if (!ST.hasSDWAOmod() && 
TII->hasModifiersSet(MI, AMDGPU::OpName::omod)3.02k
)
952
14
    return false;
953
4.23k
954
4.23k
  if (TII->isVOPC(Opc)) {
955
86
    if (!ST.hasSDWASdst()) {
956
62
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
957
62
      if (SDst && 
(58
SDst->getReg() != AMDGPU::VCC58
&&
958
58
                   
SDst->getReg() != AMDGPU::VCC_LO49
))
959
49
        return false;
960
37
    }
961
37
962
37
    if (!ST.hasSDWAOutModsVOPC() &&
963
37
        
(24
TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)24
||
964
24
         
TII->hasModifiersSet(MI, AMDGPU::OpName::omod)18
))
965
6
      return false;
966
4.14k
967
4.14k
  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
968
4.14k
             
!TII->getNamedOperand(MI, AMDGPU::OpName::vdst)4.13k
) {
969
10
    return false;
970
10
  }
971
4.16k
972
4.16k
  if (!ST.hasSDWAMac() && 
(1.21k
Opc == AMDGPU::V_FMAC_F16_e321.21k
||
973
1.21k
                           
Opc == AMDGPU::V_FMAC_F32_e321.18k
||
974
1.21k
                           
Opc == AMDGPU::V_MAC_F16_e321.16k
||
975
1.21k
                           
Opc == AMDGPU::V_MAC_F32_e321.13k
))
976
94
    return false;
977
4.07k
978
4.07k
  // Check if target supports this SDWA opcode
979
4.07k
  if (TII->pseudoToMCOpcode(Opc) == -1)
980
210
    return false;
981
3.86k
982
3.86k
  // FIXME: has SDWA but require handling of implicit VCC use
983
3.86k
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
984
121
    return false;
985
3.74k
986
3.74k
  return true;
987
3.74k
}
988
989
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
990
3.27k
                                   const SDWAOperandsVector &SDWAOperands) {
991
3.27k
992
3.27k
  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
993
3.27k
994
3.27k
  // Convert to sdwa
995
3.27k
  int SDWAOpcode;
996
3.27k
  unsigned Opcode = MI.getOpcode();
997
3.27k
  if (TII->isSDWA(Opcode)) {
998
74
    SDWAOpcode = Opcode;
999
3.20k
  } else {
1000
3.20k
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
1001
3.20k
    if (SDWAOpcode == -1)
1002
2.49k
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
1003
3.20k
  }
1004
3.27k
  assert(SDWAOpcode != -1);
1005
3.27k
1006
3.27k
  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
1007
3.27k
1008
3.27k
  // Create SDWA version of instruction MI and initialize its operands
1009
3.27k
  MachineInstrBuilder SDWAInst =
1010
3.27k
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
1011
3.27k
1012
3.27k
  // Copy dst, if it is present in original then should also be present in SDWA
1013
3.27k
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1014
3.27k
  if (Dst) {
1015
3.24k
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
1016
3.24k
    SDWAInst.add(*Dst);
1017
3.24k
  } else 
if (31
(Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))31
) {
1018
23
    assert(Dst &&
1019
23
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
1020
23
    SDWAInst.add(*Dst);
1021
23
  } else {
1022
8
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
1023
8
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
1024
8
  }
1025
3.27k
1026
3.27k
  // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
1027
3.27k
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
1028
3.27k
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1029
3.27k
  assert(
1030
3.27k
    Src0 &&
1031
3.27k
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
1032
3.27k
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
1033
3.27k
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
1034
519
    SDWAInst.addImm(Mod->getImm());
1035
2.75k
  else
1036
2.75k
    SDWAInst.addImm(0);
1037
3.27k
  SDWAInst.add(*Src0);
1038
3.27k
1039
3.27k
  // Copy src1 if present, initialize src1_modifiers.
1040
3.27k
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1041
3.27k
  if (Src1) {
1042
3.01k
    assert(
1043
3.01k
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
1044
3.01k
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
1045
3.01k
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
1046
334
      SDWAInst.addImm(Mod->getImm());
1047
2.68k
    else
1048
2.68k
      SDWAInst.addImm(0);
1049
3.01k
    SDWAInst.add(*Src1);
1050
3.01k
  }
1051
3.27k
1052
3.27k
  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
1053
3.27k
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
1054
3.27k
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
1055
3.27k
      
SDWAOpcode == AMDGPU::V_MAC_F32_sdwa3.25k
) {
1056
31
    // v_mac_f16/32 has additional src2 operand tied to vdst
1057
31
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
1058
31
    assert(Src2);
1059
31
    SDWAInst.add(*Src2);
1060
31
  }
1061
3.27k
1062
3.27k
  // Copy clamp if present, initialize otherwise
1063
3.27k
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
1064
3.27k
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
1065
3.27k
  if (Clamp) {
1066
626
    SDWAInst.add(*Clamp);
1067
2.65k
  } else {
1068
2.65k
    SDWAInst.addImm(0);
1069
2.65k
  }
1070
3.27k
1071
3.27k
  // Copy omod if present, initialize otherwise if needed
1072
3.27k
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
1073
549
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
1074
549
    if (OMod) {
1075
491
      SDWAInst.add(*OMod);
1076
491
    } else {
1077
58
      SDWAInst.addImm(0);
1078
58
    }
1079
549
  }
1080
3.27k
1081
3.27k
  // Copy dst_sel if present, initialize otherwise if needed
1082
3.27k
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
1083
3.24k
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
1084
3.24k
    if (DstSel) {
1085
74
      SDWAInst.add(*DstSel);
1086
3.17k
    } else {
1087
3.17k
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1088
3.17k
    }
1089
3.24k
  }
1090
3.27k
1091
3.27k
  // Copy dst_unused if present, initialize otherwise if needed
1092
3.27k
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
1093
3.24k
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
1094
3.24k
    if (DstUnused) {
1095
74
      SDWAInst.add(*DstUnused);
1096
3.17k
    } else {
1097
3.17k
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
1098
3.17k
    }
1099
3.24k
  }
1100
3.27k
1101
3.27k
  // Copy src0_sel if present, initialize otherwise
1102
3.27k
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
1103
3.27k
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
1104
3.27k
  if (Src0Sel) {
1105
74
    SDWAInst.add(*Src0Sel);
1106
3.20k
  } else {
1107
3.20k
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1108
3.20k
  }
1109
3.27k
1110
3.27k
  // Copy src1_sel if present, initialize otherwise if needed
1111
3.27k
  if (Src1) {
1112
3.01k
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
1113
3.01k
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
1114
3.01k
    if (Src1Sel) {
1115
70
      SDWAInst.add(*Src1Sel);
1116
2.94k
    } else {
1117
2.94k
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1118
2.94k
    }
1119
3.01k
  }
1120
3.27k
1121
3.27k
  // Check for a preserved register that needs to be copied.
1122
3.27k
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
1123
3.27k
  if (DstUnused &&
1124
3.27k
      
DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE74
) {
1125
4
    // We expect, if we are here, that the instruction was already in it's SDWA form,
1126
4
    // with a tied operand.
1127
4
    assert(Dst && Dst->isTied());
1128
4
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
1129
4
    // We also expect a vdst, since sdst can't preserve.
1130
4
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
1131
4
    assert(PreserveDstIdx != -1);
1132
4
1133
4
    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
1134
4
    auto Tied = MI.getOperand(TiedIdx);
1135
4
1136
4
    SDWAInst.add(Tied);
1137
4
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
1138
4
  }
1139
3.27k
1140
3.27k
  // Apply all sdwa operand patterns.
1141
3.27k
  bool Converted = false;
1142
3.83k
  for (auto &Operand : SDWAOperands) {
1143
3.83k
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
1144
3.83k
    // There should be no intesection between SDWA operands and potential MIs
1145
3.83k
    // e.g.:
1146
3.83k
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
1147
3.83k
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
1148
3.83k
    // v_add_u32 v3, v4, v2
1149
3.83k
    //
1150
3.83k
    // In that example it is possible that we would fold 2nd instruction into 3rd
1151
3.83k
    // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
1152
3.83k
    // already destroyed). So if SDWAOperand is also a potential MI then do not
1153
3.83k
    // apply it.
1154
3.83k
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
1155
2.83k
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
1156
3.83k
  }
1157
3.27k
  if (Converted) {
1158
2.29k
    ConvertedInstructions.push_back(SDWAInst);
1159
2.29k
  } else {
1160
986
    SDWAInst->eraseFromParent();
1161
986
    return false;
1162
986
  }
1163
2.29k
1164
2.29k
  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
1165
2.29k
  ++NumSDWAInstructionsPeepholed;
1166
2.29k
1167
2.29k
  MI.eraseFromParent();
1168
2.29k
  return true;
1169
2.29k
}
1170
1171
// If an instruction was converted to SDWA it should not have immediates or SGPR
1172
// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
1173
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
1174
2.29k
                                            const GCNSubtarget &ST) const {
1175
2.29k
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
1176
2.29k
  unsigned ConstantBusCount = 0;
1177
20.3k
  for (MachineOperand &Op : MI.explicit_uses()) {
1178
20.3k
    if (!Op.isImm() && 
!(3.73k
Op.isReg()3.73k
&&
!TRI->isVGPR(*MRI, Op.getReg())3.73k
))
1179
3.36k
      continue;
1180
16.9k
1181
16.9k
    unsigned I = MI.getOperandNo(&Op);
1182
16.9k
    if (Desc.OpInfo[I].RegClass == -1 ||
1183
16.9k
       
!TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass))982
)
1184
15.9k
      continue;
1185
982
1186
982
    if (ST.hasSDWAScalar() && 
ConstantBusCount == 0307
&&
Op.isReg()302
&&
1187
982
        
TRI->isSGPRReg(*MRI, Op.getReg())145
) {
1188
145
      ++ConstantBusCount;
1189
145
      continue;
1190
145
    }
1191
837
1192
837
    unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1193
837
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1194
837
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
1195
837
    if (Op.isImm())
1196
606
      Copy.addImm(Op.getImm());
1197
231
    else if (Op.isReg())
1198
231
      Copy.addReg(Op.getReg(), Op.isKill() ? 
RegState::Kill20
:
0211
,
1199
231
                  Op.getSubReg());
1200
837
    Op.ChangeToRegister(VGPR, false);
1201
837
  }
1202
2.29k
}
1203
1204
25.2k
// Pass entry point: repeatedly match SDWA operand patterns per basic block
// and fold them, iterating to a fixed point since each conversion can expose
// further opportunities. Returns true if any instruction was changed.
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Bail out on subtargets without SDWA, or when optimization is disabled
  // for this function.
  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI &&
           (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
            PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      // Discard the matches used for preprocessing; the shrinking above may
      // have erased/replaced some of the matched instructions.
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      // Fold each group of matched operands into its candidate instruction.
      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      // Newly built SDWA instructions may still carry immediates or SGPR
      // operands the encoding cannot accept; copy those into VGPRs.
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}