Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file
10
/// \brief Try to reassign registers on GFX10+ to reduce register bank
11
/// conflicts.
12
///
13
/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
14
/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
15
/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1,
16
/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
17
///
18
/// The shader can read one dword from each of these banks once per cycle.
19
/// If an instruction has to read more register operands from the same bank
20
/// an additional cycle is needed. HW attempts to pre-load registers through
21
/// input operand gathering, but a stall cycle may occur if that fails. For
22
/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
23
/// potentially incuring 2 stall cycles.
24
///
25
/// The pass tries to reassign registers to reduce bank conflicts.
26
///
27
/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
28
/// that 4 has to be subtracted from an SGPR bank number to get the real value.
29
/// This also corresponds to bit numbers in bank masks used in the pass.
30
///
31
//===----------------------------------------------------------------------===//
32
33
#include "AMDGPU.h"
34
#include "AMDGPUSubtarget.h"
35
#include "SIInstrInfo.h"
36
#include "SIMachineFunctionInfo.h"
37
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
38
#include "llvm/ADT/SmallSet.h"
39
#include "llvm/ADT/Statistic.h"
40
#include "llvm/CodeGen/LiveInterval.h"
41
#include "llvm/CodeGen/LiveIntervals.h"
42
#include "llvm/CodeGen/LiveRegMatrix.h"
43
#include "llvm/CodeGen/MachineFunctionPass.h"
44
#include "llvm/CodeGen/MachineLoopInfo.h"
45
#include "llvm/CodeGen/VirtRegMap.h"
46
#include "llvm/Support/MathExtras.h"
47
48
using namespace llvm;
49
50
static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
51
  cl::desc("Verify stall cycles in the regbanks reassign pass"),
52
  cl::value_desc("0|1|2"),
53
  cl::init(0), cl::Hidden);
54
55
#define DEBUG_TYPE "amdgpu-regbanks-reassign"
56
57
109k
#define NUM_VGPR_BANKS 4
58
41.3k
#define NUM_SGPR_BANKS 8
59
4.03k
#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
60
37.6k
#define SGPR_BANK_OFFSET NUM_VGPR_BANKS
61
22.1k
#define VGPR_BANK_MASK 0xf
62
18.5k
#define SGPR_BANK_MASK 0xff0
63
18.5k
#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
64
65
STATISTIC(NumStallsDetected,
66
          "Number of operand read stalls detected");
67
STATISTIC(NumStallsRecovered,
68
          "Number of operand read stalls recovered");
69
70
namespace {
71
72
class GCNRegBankReassign : public MachineFunctionPass {
73
74
  class OperandMask {
75
  public:
76
    OperandMask(unsigned r, unsigned s, unsigned m)
77
38.9k
      : Reg(r), SubReg(s), Mask(m) {}
78
    unsigned Reg;
79
    unsigned SubReg;
80
    unsigned Mask;
81
  };
82
83
  class Candidate {
84
  public:
85
    Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
86
              unsigned weight)
87
534
      : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
88
89
2.08k
    bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
90
91
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
92
    void dump(const GCNRegBankReassign *P) const {
93
      MI->dump();
94
      dbgs() << P->printReg(Reg) << " to banks ";
95
      dumpFreeBanks(FreeBanks);
96
      dbgs() << " weight " << Weight << '\n';
97
    }
98
#endif
99
100
    MachineInstr *MI;
101
    unsigned Reg;
102
    unsigned FreeBanks;
103
    unsigned Weight;
104
  };
105
106
  class CandidateList : public std::list<Candidate> {
107
  public:
108
    // Speedup subsequent sort.
109
534
    void push(const Candidate&& C) {
110
534
      if (C.Weight) 
push_back(C)210
;
111
324
      else push_front(C);
112
534
    }
113
  };
114
115
public:
116
  static char ID;
117
118
public:
119
2.39k
  GCNRegBankReassign() : MachineFunctionPass(ID) {
120
2.39k
    initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
121
2.39k
  }
122
123
  bool runOnMachineFunction(MachineFunction &MF) override;
124
125
27.6k
  StringRef getPassName() const override { return "GCN RegBank Reassign"; }
126
127
2.37k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
128
2.37k
    AU.addRequired<MachineLoopInfo>();
129
2.37k
    AU.addRequired<LiveIntervals>();
130
2.37k
    AU.addRequired<VirtRegMap>();
131
2.37k
    AU.addRequired<LiveRegMatrix>();
132
2.37k
    AU.setPreservesAll();
133
2.37k
    MachineFunctionPass::getAnalysisUsage(AU);
134
2.37k
  }
135
136
private:
137
  const GCNSubtarget *ST;
138
139
  const MachineRegisterInfo *MRI;
140
141
  const SIRegisterInfo *TRI;
142
143
  MachineLoopInfo *MLI;
144
145
  VirtRegMap *VRM;
146
147
  LiveRegMatrix *LRM;
148
149
  LiveIntervals *LIS;
150
151
  unsigned MaxNumVGPRs;
152
153
  unsigned MaxNumSGPRs;
154
155
  BitVector RegsUsed;
156
157
  SmallVector<OperandMask, 8> OperandMasks;
158
159
  CandidateList Candidates;
160
161
  const MCPhysReg *CSRegs;
162
163
  // Returns bank for a phys reg.
164
  unsigned getPhysRegBank(unsigned Reg) const;
165
166
  // Return a bit set for each register bank used. 4 banks for VGPRs and
167
  // 8 banks for SGPRs.
168
  // Registers already processed and recorded in RegsUsed are excluded.
169
  // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
170
  unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
171
172
  // Return number of stalls in the instructions.
173
  // UsedBanks has bits set for the banks used by all operands.
174
  // If Reg and Bank provided substitute the Reg with the Bank.
175
  unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks,
176
                       unsigned Reg = AMDGPU::NoRegister, int Bank = -1);
177
178
  // Return true if register is regular VGPR or SGPR or their tuples.
179
  // Returns false for special registers like m0, vcc etc.
180
  bool isReassignable(unsigned Reg) const;
181
182
  // Check if registers' defs are old and may be pre-loaded.
183
  // Returns 0 if both registers are old enough, 1 or 2 if one or both
184
  // registers will not likely be pre-loaded.
185
  unsigned getOperandGatherWeight(const MachineInstr& MI,
186
                                  unsigned Reg1,
187
                                  unsigned Reg2,
188
                                  unsigned StallCycles) const;
189
190
191
  // Find all bank bits in UsedBanks where Mask can be relocated to.
192
  unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
193
194
  // Find all bank bits in UsedBanks where Mask can be relocated to.
195
  // Bank is relative to the register and not its subregister component.
196
  // Returns 0 is a register is not reassignable.
197
  unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask,
198
                        unsigned UsedBanks) const;
199
200
  // Add cadidate instruction to the work list.
201
  void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
202
                         unsigned StallCycles);
203
204
  // Collect cadidate instructions across function. Returns a number stall
205
  // cycles detected. Only counts stalls if Collect is false.
206
  unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
207
208
  // Remove all candidates that read specified register.
209
  void removeCandidates(unsigned Reg);
210
211
  // Compute stalls within the uses of SrcReg replaced by a register from
212
  // Bank. If Bank is -1 does not perform substitution. If Collect is set
213
  // candidates are collected and added to work list.
214
  unsigned computeStallCycles(unsigned SrcReg,
215
                              unsigned Reg = AMDGPU::NoRegister,
216
                              int Bank = -1, bool Collect = false);
217
218
  // Search for a register in Bank unused within LI.
219
  // Returns phys reg or NoRegister.
220
  unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
221
222
  // Try to reassign candidate. Returns number or stall cycles saved.
223
  unsigned tryReassign(Candidate &C);
224
225
  bool verifyCycles(MachineFunction &MF,
226
                    unsigned OriginalCycles, unsigned CyclesSaved);
227
228
229
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
230
public:
231
  Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
232
    return Printable([Reg, SubReg, this](raw_ostream &OS) {
233
      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
234
        OS << llvm::printReg(Reg, TRI);
235
        return;
236
      }
237
      if (!VRM->isAssignedReg(Reg))
238
        OS << "<unassigned> " << llvm::printReg(Reg, TRI);
239
      else
240
        OS << llvm::printReg(Reg, TRI) << '('
241
           << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
242
      if (SubReg)
243
        OS << ':' << TRI->getSubRegIndexName(SubReg);
244
    });
245
  }
246
247
  static Printable printBank(unsigned Bank) {
248
    return Printable([Bank](raw_ostream &OS) {
249
      OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
250
    });
251
  }
252
253
  static void dumpFreeBanks(unsigned FreeBanks) {
254
    for (unsigned L = 0; L < NUM_BANKS; ++L)
255
      if (FreeBanks & (1 << L))
256
        dbgs() << printBank(L) << ' ';
257
  }
258
#endif
259
};
260
261
} // End anonymous namespace.
262
263
101k
INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
264
101k
                      false, false)
265
101k
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
266
101k
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
267
101k
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
268
101k
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
269
101k
INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
270
                    false, false)
271
272
273
char GCNRegBankReassign::ID = 0;
274
275
char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
276
277
23.5k
unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
278
23.5k
  assert (TargetRegisterInfo::isPhysicalRegister(Reg));
279
23.5k
280
23.5k
  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
281
23.5k
  unsigned Size = TRI->getRegSizeInBits(*RC);
282
23.5k
  if (Size > 32)
283
20.6k
    Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
284
23.5k
285
23.5k
  if (TRI->hasVGPRs(RC)) {
286
23.4k
    Reg -= AMDGPU::VGPR0;
287
23.4k
    return Reg % NUM_VGPR_BANKS;
288
23.4k
  }
289
121
290
121
  Reg = TRI->getEncodingValue(Reg) / 2;
291
121
  return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
292
121
}
293
294
unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
295
38.9k
                                            int Bank) {
296
38.9k
  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
297
29.5k
    if (!VRM->isAssignedReg(Reg))
298
0
      return 0;
299
29.5k
300
29.5k
    Reg = VRM->getPhys(Reg);
301
29.5k
    if (!Reg)
302
0
      return 0;
303
29.5k
    if (SubReg)
304
6.27k
      Reg = TRI->getSubReg(Reg, SubReg);
305
29.5k
  }
306
38.9k
307
38.9k
  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
308
38.9k
  unsigned Size = TRI->getRegSizeInBits(*RC) / 32;
309
38.9k
  if (Size > 1)
310
11.8k
    Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
311
38.9k
312
38.9k
  if (TRI->hasVGPRs(RC)) {
313
19.9k
    // VGPRs have 4 banks assigned in a round-robin fashion.
314
19.9k
    Reg -= AMDGPU::VGPR0;
315
19.9k
    unsigned Mask = (1 << Size) - 1;
316
19.9k
    unsigned Used = 0;
317
19.9k
    // Bitmask lacks an extract method
318
44.3k
    for (unsigned I = 0; I < Size; 
++I24.3k
)
319
24.3k
      if (RegsUsed.test(Reg + I))
320
164
        Used |= 1 << I;
321
19.9k
    RegsUsed.set(Reg, Reg + Size);
322
19.9k
    Mask &= ~Used;
323
19.9k
    Mask <<= (Bank == -1) ? 
Reg % 18.5k
NUM_VGPR_BANKS18.5k
:
unsigned(Bank)1.46k
;
324
19.9k
    return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
325
19.9k
  }
326
18.9k
327
18.9k
  // SGPRs have 8 banks holding 2 consequitive registers each.
328
18.9k
  Reg = TRI->getEncodingValue(Reg) / 2;
329
18.9k
  unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
330
18.9k
  if (Reg + StartBit >= RegsUsed.size())
331
565
    return 0;
332
18.3k
333
18.3k
  if (Size > 1)
334
8.01k
    Size /= 2;
335
18.3k
  unsigned Mask = (1 << Size) - 1;
336
18.3k
  unsigned Used = 0;
337
39.1k
  for (unsigned I = 0; I < Size; 
++I20.8k
)
338
20.8k
    if (RegsUsed.test(StartBit + Reg + I))
339
583
      Used |= 1 << I;
340
18.3k
  RegsUsed.set(StartBit + Reg, StartBit + Reg + Size);
341
18.3k
  Mask &= ~Used;
342
18.3k
  Mask <<= (Bank == -1) ? 
Reg % 18.3k
NUM_SGPR_BANKS18.3k
343
18.3k
                        : 
unsigned(Bank - 56
SGPR_BANK_OFFSET56
);
344
18.3k
  Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
345
18.3k
  // Reserve 4 bank ids for VGPRs.
346
18.3k
  return Mask << SGPR_BANK_OFFSET;
347
18.3k
}
348
349
unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
350
                                         unsigned& UsedBanks,
351
                                         unsigned Reg,
352
31.3k
                                         int Bank) {
353
31.3k
  unsigned StallCycles = 0;
354
31.3k
  UsedBanks = 0;
355
31.3k
356
31.3k
  if (MI.isDebugValue())
357
0
    return 0;
358
31.3k
359
31.3k
  RegsUsed.reset();
360
31.3k
  OperandMasks.clear();
361
83.7k
  for (const auto& Op : MI.explicit_uses()) {
362
83.7k
    // Undef can be assigned to any register, so two vregs can be assigned
363
83.7k
    // the same phys reg within the same instruction.
364
83.7k
    if (!Op.isReg() || 
Op.isUndef()39.0k
)
365
44.8k
      continue;
366
38.9k
367
38.9k
    unsigned R = Op.getReg();
368
38.9k
    if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
369
0
      continue;
370
38.9k
371
38.9k
    unsigned ShiftedBank = Bank;
372
38.9k
373
38.9k
    if (Bank != -1 && 
R == Reg3.30k
&&
Op.getSubReg()1.52k
) {
374
408
      unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger();
375
408
      if (!(LM & 1) && 
(Bank < 230
NUM_VGPR_BANKS230
)) {
376
230
        // If a register spans all banks we cannot shift it to avoid conflict.
377
230
        if (countPopulation(LM) >= NUM_VGPR_BANKS)
378
230
          
continue0
;
379
230
        ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS;
380
230
      } else 
if (178
!(LM & 3)178
&&
(Bank >= 0
SGPR_BANK_OFFSET0
)) {
381
0
        // If a register spans all banks we cannot shift it to avoid conflict.
382
0
        if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS)
383
0
          continue;
384
0
        ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET +
385
0
                                          (countTrailingZeros(LM) >> 1)) %
386
0
                                             NUM_SGPR_BANKS;
387
0
      }
388
408
    }
389
38.9k
390
38.9k
    unsigned Mask = getRegBankMask(R, Op.getSubReg(),
391
38.9k
                                   (Reg == R) ? 
ShiftedBank1.52k
:
-137.3k
);
392
38.9k
    StallCycles += countPopulation(UsedBanks & Mask);
393
38.9k
    UsedBanks |= Mask;
394
38.9k
    OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
395
38.9k
  }
396
31.3k
397
31.3k
  return StallCycles;
398
31.3k
}
399
400
unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
401
                                                    unsigned Reg1,
402
                                                    unsigned Reg2,
403
                                                    unsigned StallCycles) const
404
1.09k
{
405
1.09k
  unsigned Defs = 0;
406
1.09k
  MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
407
1.09k
  MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
408
3.96k
  for (unsigned S = StallCycles; S && 
Def != B2.92k
&&
Defs != 32.92k
;
--S2.86k
) {
409
2.86k
    if (MI.isDebugInstr())
410
0
      continue;
411
2.86k
    --Def;
412
2.86k
    if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
413
70
      continue;
414
2.79k
    if (Def->modifiesRegister(Reg1, TRI))
415
109
      Defs |= 1;
416
2.79k
    if (Def->modifiesRegister(Reg2, TRI))
417
261
      Defs |= 2;
418
2.79k
  }
419
1.09k
  return countPopulation(Defs);
420
1.09k
}
421
422
2.19k
bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
423
2.19k
  if (TargetRegisterInfo::isPhysicalRegister(Reg) || 
!VRM->isAssignedReg(Reg)784
)
424
1.40k
    return false;
425
784
426
784
  const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
427
784
428
784
  unsigned PhysReg = VRM->getPhys(Reg);
429
784
430
784
  if (Def && 
Def->isCopy()592
&&
Def->getOperand(1).getReg() == PhysReg226
)
431
188
    return false;
432
596
433
3.19k
  
for (auto U : MRI->use_nodbg_operands(Reg))596
{
434
3.19k
    if (U.isImplicit())
435
6
      return false;
436
3.19k
    const MachineInstr *UseInst = U.getParent();
437
3.19k
    if (UseInst->isCopy() && 
UseInst->getOperand(0).getReg() == PhysReg51
)
438
3
      return false;
439
3.19k
  }
440
596
441
596
  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
442
587
  if (TRI->hasVGPRs(RC))
443
575
    return true;
444
12
445
12
  unsigned Size = TRI->getRegSizeInBits(*RC);
446
12
  if (Size > 32)
447
5
    PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
448
12
449
12
  return AMDGPU::SGPR_32RegClass.contains(PhysReg);
450
12
}
451
452
unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
453
587
                                          unsigned UsedBanks) const {
454
587
  unsigned Size = countPopulation(Mask);
455
587
  unsigned FreeBanks = 0;
456
587
  unsigned Bank = findFirstSet(Mask);
457
587
458
587
  UsedBanks &= ~Mask;
459
587
460
587
  // Find free VGPR banks
461
587
  if ((Mask & VGPR_BANK_MASK) && 
(Size < 575
NUM_VGPR_BANKS575
)) {
462
2.48k
    for (unsigned I = 0; I < NUM_VGPR_BANKS; 
++I1.98k
) {
463
1.98k
      if (Bank == I)
464
496
        continue;
465
1.48k
      unsigned NewMask = ((1 << Size) - 1) << I;
466
1.48k
      NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
467
1.48k
      if (!(UsedBanks & NewMask))
468
1.18k
        FreeBanks |= 1 << I;
469
1.48k
    }
470
496
    return FreeBanks;
471
496
  }
472
91
473
91
  // Find free SGPR banks
474
91
  // SGPR tuples must be aligned, so step is size in banks it
475
91
  // crosses.
476
91
  Bank -= SGPR_BANK_OFFSET;
477
327
  for (unsigned I = 0; I < NUM_SGPR_BANKS; 
I += Size236
) {
478
236
    if (Bank == I)
479
12
      continue;
480
224
    unsigned NewMask = ((1 << Size) - 1) << I;
481
224
    NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
482
224
    if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
483
166
      FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
484
224
  }
485
91
486
91
  return FreeBanks;
487
91
}
488
489
unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg,
490
                                          unsigned SubReg,
491
                                          unsigned Mask,
492
2.19k
                                          unsigned UsedBanks) const {
493
2.19k
  if (!isReassignable(Reg))
494
1.60k
    return 0;
495
587
496
587
  unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
497
587
498
587
  unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger();
499
587
  if (!(LM & 1) && 
(Mask & 42
VGPR_BANK_MASK42
)) {
500
42
    unsigned Shift = countTrailingZeros(LM);
501
42
    if (Shift >= NUM_VGPR_BANKS)
502
42
      
return 03
;
503
39
    unsigned VB = FreeBanks & VGPR_BANK_MASK;
504
39
    FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
505
39
                VGPR_BANK_MASK;
506
545
  } else if (!(LM & 3) && 
(Mask & 0
SGPR_BANK_MASK0
)) {
507
0
    unsigned Shift = countTrailingZeros(LM) >> 1;
508
0
    if (Shift >= NUM_SGPR_BANKS)
509
0
      return 0;
510
0
    unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
511
0
    FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
512
0
                SGPR_BANK_SHIFTED_MASK;
513
0
    FreeBanks <<= SGPR_BANK_OFFSET;
514
0
  }
515
587
516
587
  
LLVM_DEBUG584
(if (FreeBanks) {
517
584
          dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
518
584
                 << " to banks: "; dumpFreeBanks(FreeBanks);
519
584
          dbgs() << '\n'; });
520
584
521
584
  return FreeBanks;
522
587
}
523
524
void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
525
                                           unsigned UsedBanks,
526
29.2k
                                           unsigned StallCycles) {
527
29.2k
  LLVM_DEBUG(MI.dump());
528
29.2k
529
29.2k
  if (!StallCycles)
530
28.3k
    return;
531
863
532
863
  LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
533
863
534
2.61k
  for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; 
++I1.75k
) {
535
5.55k
    for (unsigned J = I + 1; J != E; 
++J3.79k
) {
536
3.79k
      if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
537
2.69k
        continue;
538
1.09k
539
1.09k
      unsigned Reg1 = OperandMasks[I].Reg;
540
1.09k
      unsigned Reg2 = OperandMasks[J].Reg;
541
1.09k
      unsigned SubReg1 = OperandMasks[I].SubReg;
542
1.09k
      unsigned SubReg2 = OperandMasks[J].SubReg;
543
1.09k
      unsigned Mask1 = OperandMasks[I].Mask;
544
1.09k
      unsigned Mask2 = OperandMasks[J].Mask;
545
1.09k
      unsigned Size1 = countPopulation(Mask1);
546
1.09k
      unsigned Size2 = countPopulation(Mask2);
547
1.09k
548
1.09k
      LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
549
1.09k
                      " and " << printReg(Reg2, SubReg2) << '\n');
550
1.09k
551
1.09k
      unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
552
1.09k
      Weight += MLI->getLoopDepth(MI.getParent()) * 10;
553
1.09k
554
1.09k
      LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
555
1.09k
556
1.09k
      unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
557
1.09k
      unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
558
1.09k
      if (FreeBanks1)
559
300
        Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight
560
300
                                    + ((Size2 > Size1) ? 
119
:
0281
)));
561
1.09k
      if (FreeBanks2)
562
234
        Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight
563
234
                                    + ((Size1 > Size2) ? 
161
:
0173
)));
564
1.09k
    }
565
1.75k
  }
566
863
}
567
568
unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
569
                                                unsigned Reg, int Bank,
570
1.21k
                                                bool Collect) {
571
1.21k
  unsigned TotalStallCycles = 0;
572
1.21k
  unsigned UsedBanks = 0;
573
1.21k
  SmallSet<const MachineInstr *, 16> Visited;
574
1.21k
575
2.50k
  for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
576
2.50k
    if (MI.isBundle())
577
16
      continue;
578
2.49k
    if (!Visited.insert(&MI).second)
579
0
      continue;
580
2.49k
    unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank);
581
2.49k
    TotalStallCycles += StallCycles;
582
2.49k
    if (Collect)
583
335
      collectCandidates(MI, UsedBanks, StallCycles);
584
2.49k
  }
585
1.21k
586
1.21k
  return TotalStallCycles;
587
1.21k
}
588
589
unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
590
287
                                         unsigned Bank) const {
591
287
  const TargetRegisterClass *RC = MRI->getRegClass(LI.reg);
592
287
  unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? 
MaxNumVGPRs199
593
287
                                                : 
MaxNumSGPRs88
;
594
287
  unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? 
AMDGPU::VGPR0199
595
287
                                                        : 
AMDGPU::SGPR088
);
596
287
597
23.5k
  for (unsigned Reg : RC->getRegisters()) {
598
23.5k
    // Check occupancy limit.
599
23.5k
    if (TRI->isSubRegisterEq(Reg, MaxReg))
600
0
      break;
601
23.5k
602
23.5k
    if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank)
603
22.6k
      continue;
604
898
605
64.2k
    
for (unsigned I = 0; 898
CSRegs[I];
++I63.3k
)
606
63.3k
      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
607
63.3k
          
!LRM->isPhysRegUsed(CSRegs[I])86
)
608
16
        return AMDGPU::NoRegister;
609
898
610
898
    
LLVM_DEBUG882
(dbgs() << "Trying register " << printReg(Reg) << '\n');
611
882
612
882
    if (!LRM->checkInterference(LI, Reg))
613
192
      return Reg;
614
882
  }
615
287
616
287
  
return AMDGPU::NoRegister79
;
617
287
}
618
619
310
unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
620
310
  if (!LIS->hasInterval(C.Reg))
621
0
    return 0;
622
310
623
310
  LiveInterval &LI = LIS->getInterval(C.Reg);
624
310
  LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
625
310
             LI.dump());
626
310
627
310
  // For each candidate bank walk all instructions in the range of live
628
310
  // interval and check if replacing the register with one belonging to
629
310
  // the candidate bank reduces conflicts.
630
310
631
310
  unsigned OrigStalls = computeStallCycles(C.Reg);
632
310
  LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
633
310
  if (!OrigStalls)
634
0
    return 0;
635
310
636
310
  struct BankStall {
637
587
    BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
638
320
    bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; }
639
310
    unsigned Bank;
640
310
    unsigned Stalls;
641
310
  };
642
310
  SmallVector<BankStall, 8> BankStalls;
643
310
644
4.03k
  for (int Bank = 0; Bank < NUM_BANKS; 
++Bank3.72k
) {
645
3.72k
    if (C.FreeBanks & (1 << Bank)) {
646
708
      LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
647
708
      unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank);
648
708
      if (Stalls < OrigStalls) {
649
587
        LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
650
587
                     << Stalls << '\n');
651
587
        BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
652
587
      }
653
708
    }
654
3.72k
  }
655
310
  std::sort(BankStalls.begin(), BankStalls.end());
656
310
657
310
  unsigned OrigReg = VRM->getPhys(C.Reg);
658
310
  LRM->unassign(LI);
659
405
  while (!BankStalls.empty()) {
660
287
    BankStall BS = BankStalls.pop_back_val();
661
287
    unsigned Reg = scavengeReg(LI, BS.Bank);
662
287
    if (Reg == AMDGPU::NoRegister) {
663
95
      LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
664
95
                   << '\n');
665
95
      continue;
666
95
    }
667
192
    LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
668
192
                 << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
669
192
                 << " in bank " << printBank(BS.Bank) << '\n');
670
192
671
192
    LRM->assign(LI, Reg);
672
192
673
192
    LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
674
192
675
192
    return OrigStalls - BS.Stalls;
676
192
  }
677
310
  LRM->assign(LI, OrigReg);
678
118
679
118
  return 0;
680
310
}
681
682
unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
683
2.11k
                                               bool Collect) {
684
2.11k
  unsigned TotalStallCycles = 0;
685
2.11k
686
2.52k
  for (MachineBasicBlock &MBB : MF) {
687
2.52k
688
2.52k
    LLVM_DEBUG(if (Collect) {
689
2.52k
            if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
690
2.52k
            else dbgs() << MBB.getName(); dbgs() << ":\n";
691
2.52k
          });
692
2.52k
693
28.9k
    for (MachineInstr &MI : MBB.instrs()) {
694
28.9k
      if (MI.isBundle())
695
30
          continue; // we analyze the instructions inside the bundle individually
696
28.8k
697
28.8k
      unsigned UsedBanks = 0;
698
28.8k
      unsigned StallCycles = analyzeInst(MI, UsedBanks);
699
28.8k
700
28.8k
      if (Collect)
701
28.8k
        collectCandidates(MI, UsedBanks, StallCycles);
702
28.8k
703
28.8k
      TotalStallCycles += StallCycles;
704
28.8k
    }
705
2.52k
706
2.52k
    LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
707
2.52k
  }
708
2.11k
709
2.11k
  return TotalStallCycles;
710
2.11k
}
711
712
192
void GCNRegBankReassign::removeCandidates(unsigned Reg) {
713
1.02k
  Candidates.remove_if([Reg, this](const Candidate& C) {
714
1.02k
    return C.MI->readsRegister(Reg, TRI);
715
1.02k
  });
716
192
}
717
718
bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
719
                                      unsigned OriginalCycles,
720
0
                                      unsigned CyclesSaved) {
721
0
  unsigned StallCycles = collectCandidates(MF, false);
722
0
  LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
723
0
               << " stall cycles left\n");
724
0
  return StallCycles + CyclesSaved == OriginalCycles;
725
0
}
726
727
25.2k
bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
728
25.2k
  ST = &MF.getSubtarget<GCNSubtarget>();
729
25.2k
  if (!ST->hasRegisterBanking() || 
skipFunction(MF.getFunction())2.12k
)
730
23.1k
    return false;
731
2.11k
732
2.11k
  MRI = &MF.getRegInfo();
733
2.11k
  TRI = ST->getRegisterInfo();
734
2.11k
  MLI = &getAnalysis<MachineLoopInfo>();
735
2.11k
  VRM = &getAnalysis<VirtRegMap>();
736
2.11k
  LRM = &getAnalysis<LiveRegMatrix>();
737
2.11k
  LIS = &getAnalysis<LiveIntervals>();
738
2.11k
739
2.11k
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
740
2.11k
  unsigned Occupancy = MFI->getOccupancy();
741
2.11k
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
742
2.11k
  MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
743
2.11k
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
744
2.11k
  MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
745
2.11k
746
2.11k
  CSRegs = MRI->getCalleeSavedRegs();
747
2.11k
748
2.11k
  RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() +
749
2.11k
                  TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1);
750
2.11k
751
2.11k
  LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
752
2.11k
               << '\n');
753
2.11k
754
2.11k
  unsigned StallCycles = collectCandidates(MF);
755
2.11k
  NumStallsDetected += StallCycles;
756
2.11k
757
2.11k
  LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
758
2.11k
                  "function " << MF.getName() << '\n');
759
2.11k
760
2.11k
  Candidates.sort();
761
2.11k
762
2.11k
  LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
763
2.11k
        for (auto C : Candidates) C.dump(this);
764
2.11k
        dbgs() << "\n\n");
765
2.11k
766
2.11k
  unsigned CyclesSaved = 0;
767
2.42k
  while (!Candidates.empty()) {
768
310
    Candidate C = Candidates.back();
769
310
    unsigned LocalCyclesSaved = tryReassign(C);
770
310
    CyclesSaved += LocalCyclesSaved;
771
310
772
310
    if (VerifyStallCycles > 1 && 
!verifyCycles(MF, StallCycles, CyclesSaved)0
)
773
0
      report_fatal_error("RegBank reassign stall cycles verification failed.");
774
310
775
310
    Candidates.pop_back();
776
310
    if (LocalCyclesSaved) {
777
192
      removeCandidates(C.Reg);
778
192
      computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true);
779
192
      Candidates.sort();
780
192
781
192
      LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
782
192
            for (auto C : Candidates)
783
192
              C.dump(this);
784
192
            dbgs() << "\n\n");
785
192
    }
786
310
  }
787
2.11k
  NumStallsRecovered += CyclesSaved;
788
2.11k
789
2.11k
  LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
790
2.11k
               << " cycles saved in function " << MF.getName() << '\n');
791
2.11k
792
2.11k
  Candidates.clear();
793
2.11k
794
2.11k
  if (VerifyStallCycles == 1 && 
!verifyCycles(MF, StallCycles, CyclesSaved)0
)
795
0
    report_fatal_error("RegBank reassign stall cycles verification failed.");
796
2.11k
797
2.11k
  RegsUsed.clear();
798
2.11k
799
2.11k
  return CyclesSaved > 0;
800
2.11k
}