/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | /// \file |
10 | | /// \brief Try to reassign registers on GFX10+ to reduce register bank |
11 | | /// conflicts. |
12 | | /// |
13 | | /// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in |
14 | | /// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to |
15 | | /// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1, |
16 | | /// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc. |
17 | | /// |
18 | | /// The shader can read one dword from each of these banks once per cycle. |
19 | | /// If an instruction has to read more register operands from the same bank |
20 | | /// an additional cycle is needed. HW attempts to pre-load registers through |
21 | | /// input operand gathering, but a stall cycle may occur if that fails. For |
22 | | /// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands, |
23 | | /// potentially incuring 2 stall cycles. |
24 | | /// |
25 | | /// The pass tries to reassign registers to reduce bank conflicts. |
26 | | /// |
27 | | /// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so |
28 | | /// that 4 has to be subtracted from an SGPR bank number to get the real value. |
29 | | /// This also corresponds to bit numbers in bank masks used in the pass. |
30 | | /// |
31 | | //===----------------------------------------------------------------------===// |
32 | | |
33 | | #include "AMDGPU.h" |
34 | | #include "AMDGPUSubtarget.h" |
35 | | #include "SIInstrInfo.h" |
36 | | #include "SIMachineFunctionInfo.h" |
37 | | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
38 | | #include "llvm/ADT/SmallSet.h" |
39 | | #include "llvm/ADT/Statistic.h" |
40 | | #include "llvm/CodeGen/LiveInterval.h" |
41 | | #include "llvm/CodeGen/LiveIntervals.h" |
42 | | #include "llvm/CodeGen/LiveRegMatrix.h" |
43 | | #include "llvm/CodeGen/MachineFunctionPass.h" |
44 | | #include "llvm/CodeGen/MachineLoopInfo.h" |
45 | | #include "llvm/CodeGen/VirtRegMap.h" |
46 | | #include "llvm/Support/MathExtras.h" |
47 | | |
48 | | using namespace llvm; |
49 | | |
50 | | static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign", |
51 | | cl::desc("Verify stall cycles in the regbanks reassign pass"), |
52 | | cl::value_desc("0|1|2"), |
53 | | cl::init(0), cl::Hidden); |
54 | | |
55 | | #define DEBUG_TYPE "amdgpu-regbanks-reassign" |
56 | | |
57 | 109k | #define NUM_VGPR_BANKS 4 |
58 | 41.3k | #define NUM_SGPR_BANKS 8 |
59 | 4.03k | #define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS) |
60 | 37.6k | #define SGPR_BANK_OFFSET NUM_VGPR_BANKS |
61 | 22.1k | #define VGPR_BANK_MASK 0xf |
62 | 18.5k | #define SGPR_BANK_MASK 0xff0 |
63 | 18.5k | #define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET) |
64 | | |
65 | | STATISTIC(NumStallsDetected, |
66 | | "Number of operand read stalls detected"); |
67 | | STATISTIC(NumStallsRecovered, |
68 | | "Number of operand read stalls recovered"); |
69 | | |
70 | | namespace { |
71 | | |
72 | | class GCNRegBankReassign : public MachineFunctionPass { |
73 | | |
74 | | class OperandMask { |
75 | | public: |
76 | | OperandMask(unsigned r, unsigned s, unsigned m) |
77 | 38.9k | : Reg(r), SubReg(s), Mask(m) {} |
78 | | unsigned Reg; |
79 | | unsigned SubReg; |
80 | | unsigned Mask; |
81 | | }; |
82 | | |
83 | | class Candidate { |
84 | | public: |
85 | | Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks, |
86 | | unsigned weight) |
87 | 534 | : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {} |
88 | | |
89 | 2.08k | bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; } |
90 | | |
91 | | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
92 | | void dump(const GCNRegBankReassign *P) const { |
93 | | MI->dump(); |
94 | | dbgs() << P->printReg(Reg) << " to banks "; |
95 | | dumpFreeBanks(FreeBanks); |
96 | | dbgs() << " weight " << Weight << '\n'; |
97 | | } |
98 | | #endif |
99 | | |
100 | | MachineInstr *MI; |
101 | | unsigned Reg; |
102 | | unsigned FreeBanks; |
103 | | unsigned Weight; |
104 | | }; |
105 | | |
106 | | class CandidateList : public std::list<Candidate> { |
107 | | public: |
108 | | // Speedup subsequent sort. |
109 | 534 | void push(const Candidate&& C) { |
110 | 534 | if (C.Weight) push_back(C)210 ; |
111 | 324 | else push_front(C); |
112 | 534 | } |
113 | | }; |
114 | | |
115 | | public: |
116 | | static char ID; |
117 | | |
118 | | public: |
119 | 2.39k | GCNRegBankReassign() : MachineFunctionPass(ID) { |
120 | 2.39k | initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry()); |
121 | 2.39k | } |
122 | | |
123 | | bool runOnMachineFunction(MachineFunction &MF) override; |
124 | | |
125 | 27.6k | StringRef getPassName() const override { return "GCN RegBank Reassign"; } |
126 | | |
127 | 2.37k | void getAnalysisUsage(AnalysisUsage &AU) const override { |
128 | 2.37k | AU.addRequired<MachineLoopInfo>(); |
129 | 2.37k | AU.addRequired<LiveIntervals>(); |
130 | 2.37k | AU.addRequired<VirtRegMap>(); |
131 | 2.37k | AU.addRequired<LiveRegMatrix>(); |
132 | 2.37k | AU.setPreservesAll(); |
133 | 2.37k | MachineFunctionPass::getAnalysisUsage(AU); |
134 | 2.37k | } |
135 | | |
136 | | private: |
137 | | const GCNSubtarget *ST; |
138 | | |
139 | | const MachineRegisterInfo *MRI; |
140 | | |
141 | | const SIRegisterInfo *TRI; |
142 | | |
143 | | MachineLoopInfo *MLI; |
144 | | |
145 | | VirtRegMap *VRM; |
146 | | |
147 | | LiveRegMatrix *LRM; |
148 | | |
149 | | LiveIntervals *LIS; |
150 | | |
151 | | unsigned MaxNumVGPRs; |
152 | | |
153 | | unsigned MaxNumSGPRs; |
154 | | |
155 | | BitVector RegsUsed; |
156 | | |
157 | | SmallVector<OperandMask, 8> OperandMasks; |
158 | | |
159 | | CandidateList Candidates; |
160 | | |
161 | | const MCPhysReg *CSRegs; |
162 | | |
163 | | // Returns bank for a phys reg. |
164 | | unsigned getPhysRegBank(unsigned Reg) const; |
165 | | |
166 | | // Return a bit set for each register bank used. 4 banks for VGPRs and |
167 | | // 8 banks for SGPRs. |
168 | | // Registers already processed and recorded in RegsUsed are excluded. |
169 | | // If Bank is not -1 assume Reg:SubReg to belong to that Bank. |
170 | | unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank); |
171 | | |
172 | | // Return number of stalls in the instructions. |
173 | | // UsedBanks has bits set for the banks used by all operands. |
174 | | // If Reg and Bank provided substitute the Reg with the Bank. |
175 | | unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks, |
176 | | unsigned Reg = AMDGPU::NoRegister, int Bank = -1); |
177 | | |
178 | | // Return true if register is regular VGPR or SGPR or their tuples. |
179 | | // Returns false for special registers like m0, vcc etc. |
180 | | bool isReassignable(unsigned Reg) const; |
181 | | |
182 | | // Check if registers' defs are old and may be pre-loaded. |
183 | | // Returns 0 if both registers are old enough, 1 or 2 if one or both |
184 | | // registers will not likely be pre-loaded. |
185 | | unsigned getOperandGatherWeight(const MachineInstr& MI, |
186 | | unsigned Reg1, |
187 | | unsigned Reg2, |
188 | | unsigned StallCycles) const; |
189 | | |
190 | | |
191 | | // Find all bank bits in UsedBanks where Mask can be relocated to. |
192 | | unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const; |
193 | | |
194 | | // Find all bank bits in UsedBanks where Mask can be relocated to. |
195 | | // Bank is relative to the register and not its subregister component. |
196 | | // Returns 0 is a register is not reassignable. |
197 | | unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask, |
198 | | unsigned UsedBanks) const; |
199 | | |
200 | | // Add cadidate instruction to the work list. |
201 | | void collectCandidates(MachineInstr& MI, unsigned UsedBanks, |
202 | | unsigned StallCycles); |
203 | | |
204 | | // Collect cadidate instructions across function. Returns a number stall |
205 | | // cycles detected. Only counts stalls if Collect is false. |
206 | | unsigned collectCandidates(MachineFunction &MF, bool Collect = true); |
207 | | |
208 | | // Remove all candidates that read specified register. |
209 | | void removeCandidates(unsigned Reg); |
210 | | |
211 | | // Compute stalls within the uses of SrcReg replaced by a register from |
212 | | // Bank. If Bank is -1 does not perform substitution. If Collect is set |
213 | | // candidates are collected and added to work list. |
214 | | unsigned computeStallCycles(unsigned SrcReg, |
215 | | unsigned Reg = AMDGPU::NoRegister, |
216 | | int Bank = -1, bool Collect = false); |
217 | | |
218 | | // Search for a register in Bank unused within LI. |
219 | | // Returns phys reg or NoRegister. |
220 | | unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const; |
221 | | |
222 | | // Try to reassign candidate. Returns number or stall cycles saved. |
223 | | unsigned tryReassign(Candidate &C); |
224 | | |
225 | | bool verifyCycles(MachineFunction &MF, |
226 | | unsigned OriginalCycles, unsigned CyclesSaved); |
227 | | |
228 | | |
229 | | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
230 | | public: |
231 | | Printable printReg(unsigned Reg, unsigned SubReg = 0) const { |
232 | | return Printable([Reg, SubReg, this](raw_ostream &OS) { |
233 | | if (TargetRegisterInfo::isPhysicalRegister(Reg)) { |
234 | | OS << llvm::printReg(Reg, TRI); |
235 | | return; |
236 | | } |
237 | | if (!VRM->isAssignedReg(Reg)) |
238 | | OS << "<unassigned> " << llvm::printReg(Reg, TRI); |
239 | | else |
240 | | OS << llvm::printReg(Reg, TRI) << '(' |
241 | | << llvm::printReg(VRM->getPhys(Reg), TRI) << ')'; |
242 | | if (SubReg) |
243 | | OS << ':' << TRI->getSubRegIndexName(SubReg); |
244 | | }); |
245 | | } |
246 | | |
247 | | static Printable printBank(unsigned Bank) { |
248 | | return Printable([Bank](raw_ostream &OS) { |
249 | | OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank); |
250 | | }); |
251 | | } |
252 | | |
253 | | static void dumpFreeBanks(unsigned FreeBanks) { |
254 | | for (unsigned L = 0; L < NUM_BANKS; ++L) |
255 | | if (FreeBanks & (1 << L)) |
256 | | dbgs() << printBank(L) << ' '; |
257 | | } |
258 | | #endif |
259 | | }; |
260 | | |
261 | | } // End anonymous namespace. |
262 | | |
263 | 101k | INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", |
264 | 101k | false, false) |
265 | 101k | INITIALIZE_PASS_DEPENDENCY(LiveIntervals) |
266 | 101k | INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) |
267 | 101k | INITIALIZE_PASS_DEPENDENCY(VirtRegMap) |
268 | 101k | INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) |
269 | 101k | INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", |
270 | | false, false) |
271 | | |
272 | | |
273 | | char GCNRegBankReassign::ID = 0; |
274 | | |
275 | | char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; |
276 | | |
277 | 23.5k | unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { |
278 | 23.5k | assert (TargetRegisterInfo::isPhysicalRegister(Reg)); |
279 | 23.5k | |
280 | 23.5k | const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); |
281 | 23.5k | unsigned Size = TRI->getRegSizeInBits(*RC); |
282 | 23.5k | if (Size > 32) |
283 | 20.6k | Reg = TRI->getSubReg(Reg, AMDGPU::sub0); |
284 | 23.5k | |
285 | 23.5k | if (TRI->hasVGPRs(RC)) { |
286 | 23.4k | Reg -= AMDGPU::VGPR0; |
287 | 23.4k | return Reg % NUM_VGPR_BANKS; |
288 | 23.4k | } |
289 | 121 | |
290 | 121 | Reg = TRI->getEncodingValue(Reg) / 2; |
291 | 121 | return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET; |
292 | 121 | } |
293 | | |
294 | | unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, |
295 | 38.9k | int Bank) { |
296 | 38.9k | if (TargetRegisterInfo::isVirtualRegister(Reg)) { |
297 | 29.5k | if (!VRM->isAssignedReg(Reg)) |
298 | 0 | return 0; |
299 | 29.5k | |
300 | 29.5k | Reg = VRM->getPhys(Reg); |
301 | 29.5k | if (!Reg) |
302 | 0 | return 0; |
303 | 29.5k | if (SubReg) |
304 | 6.27k | Reg = TRI->getSubReg(Reg, SubReg); |
305 | 29.5k | } |
306 | 38.9k | |
307 | 38.9k | const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); |
308 | 38.9k | unsigned Size = TRI->getRegSizeInBits(*RC) / 32; |
309 | 38.9k | if (Size > 1) |
310 | 11.8k | Reg = TRI->getSubReg(Reg, AMDGPU::sub0); |
311 | 38.9k | |
312 | 38.9k | if (TRI->hasVGPRs(RC)) { |
313 | 19.9k | // VGPRs have 4 banks assigned in a round-robin fashion. |
314 | 19.9k | Reg -= AMDGPU::VGPR0; |
315 | 19.9k | unsigned Mask = (1 << Size) - 1; |
316 | 19.9k | unsigned Used = 0; |
317 | 19.9k | // Bitmask lacks an extract method |
318 | 44.3k | for (unsigned I = 0; I < Size; ++I24.3k ) |
319 | 24.3k | if (RegsUsed.test(Reg + I)) |
320 | 164 | Used |= 1 << I; |
321 | 19.9k | RegsUsed.set(Reg, Reg + Size); |
322 | 19.9k | Mask &= ~Used; |
323 | 19.9k | Mask <<= (Bank == -1) ? Reg % 18.5k NUM_VGPR_BANKS18.5k : unsigned(Bank)1.46k ; |
324 | 19.9k | return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; |
325 | 19.9k | } |
326 | 18.9k | |
327 | 18.9k | // SGPRs have 8 banks holding 2 consequitive registers each. |
328 | 18.9k | Reg = TRI->getEncodingValue(Reg) / 2; |
329 | 18.9k | unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs(); |
330 | 18.9k | if (Reg + StartBit >= RegsUsed.size()) |
331 | 565 | return 0; |
332 | 18.3k | |
333 | 18.3k | if (Size > 1) |
334 | 8.01k | Size /= 2; |
335 | 18.3k | unsigned Mask = (1 << Size) - 1; |
336 | 18.3k | unsigned Used = 0; |
337 | 39.1k | for (unsigned I = 0; I < Size; ++I20.8k ) |
338 | 20.8k | if (RegsUsed.test(StartBit + Reg + I)) |
339 | 583 | Used |= 1 << I; |
340 | 18.3k | RegsUsed.set(StartBit + Reg, StartBit + Reg + Size); |
341 | 18.3k | Mask &= ~Used; |
342 | 18.3k | Mask <<= (Bank == -1) ? Reg % 18.3k NUM_SGPR_BANKS18.3k |
343 | 18.3k | : unsigned(Bank - 56 SGPR_BANK_OFFSET56 ); |
344 | 18.3k | Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; |
345 | 18.3k | // Reserve 4 bank ids for VGPRs. |
346 | 18.3k | return Mask << SGPR_BANK_OFFSET; |
347 | 18.3k | } |
348 | | |
349 | | unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI, |
350 | | unsigned& UsedBanks, |
351 | | unsigned Reg, |
352 | 31.3k | int Bank) { |
353 | 31.3k | unsigned StallCycles = 0; |
354 | 31.3k | UsedBanks = 0; |
355 | 31.3k | |
356 | 31.3k | if (MI.isDebugValue()) |
357 | 0 | return 0; |
358 | 31.3k | |
359 | 31.3k | RegsUsed.reset(); |
360 | 31.3k | OperandMasks.clear(); |
361 | 83.7k | for (const auto& Op : MI.explicit_uses()) { |
362 | 83.7k | // Undef can be assigned to any register, so two vregs can be assigned |
363 | 83.7k | // the same phys reg within the same instruction. |
364 | 83.7k | if (!Op.isReg() || Op.isUndef()39.0k ) |
365 | 44.8k | continue; |
366 | 38.9k | |
367 | 38.9k | unsigned R = Op.getReg(); |
368 | 38.9k | if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R))) |
369 | 0 | continue; |
370 | 38.9k | |
371 | 38.9k | unsigned ShiftedBank = Bank; |
372 | 38.9k | |
373 | 38.9k | if (Bank != -1 && R == Reg3.30k && Op.getSubReg()1.52k ) { |
374 | 408 | unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger(); |
375 | 408 | if (!(LM & 1) && (Bank < 230 NUM_VGPR_BANKS230 )) { |
376 | 230 | // If a register spans all banks we cannot shift it to avoid conflict. |
377 | 230 | if (countPopulation(LM) >= NUM_VGPR_BANKS) |
378 | 230 | continue0 ; |
379 | 230 | ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS; |
380 | 230 | } else if (178 !(LM & 3)178 && (Bank >= 0 SGPR_BANK_OFFSET0 )) { |
381 | 0 | // If a register spans all banks we cannot shift it to avoid conflict. |
382 | 0 | if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS) |
383 | 0 | continue; |
384 | 0 | ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET + |
385 | 0 | (countTrailingZeros(LM) >> 1)) % |
386 | 0 | NUM_SGPR_BANKS; |
387 | 0 | } |
388 | 408 | } |
389 | 38.9k | |
390 | 38.9k | unsigned Mask = getRegBankMask(R, Op.getSubReg(), |
391 | 38.9k | (Reg == R) ? ShiftedBank1.52k : -137.3k ); |
392 | 38.9k | StallCycles += countPopulation(UsedBanks & Mask); |
393 | 38.9k | UsedBanks |= Mask; |
394 | 38.9k | OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); |
395 | 38.9k | } |
396 | 31.3k | |
397 | 31.3k | return StallCycles; |
398 | 31.3k | } |
399 | | |
400 | | unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, |
401 | | unsigned Reg1, |
402 | | unsigned Reg2, |
403 | | unsigned StallCycles) const |
404 | 1.09k | { |
405 | 1.09k | unsigned Defs = 0; |
406 | 1.09k | MachineBasicBlock::const_instr_iterator Def(MI.getIterator()); |
407 | 1.09k | MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin()); |
408 | 3.96k | for (unsigned S = StallCycles; S && Def != B2.92k && Defs != 32.92k ; --S2.86k ) { |
409 | 2.86k | if (MI.isDebugInstr()) |
410 | 0 | continue; |
411 | 2.86k | --Def; |
412 | 2.86k | if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) |
413 | 70 | continue; |
414 | 2.79k | if (Def->modifiesRegister(Reg1, TRI)) |
415 | 109 | Defs |= 1; |
416 | 2.79k | if (Def->modifiesRegister(Reg2, TRI)) |
417 | 261 | Defs |= 2; |
418 | 2.79k | } |
419 | 1.09k | return countPopulation(Defs); |
420 | 1.09k | } |
421 | | |
422 | 2.19k | bool GCNRegBankReassign::isReassignable(unsigned Reg) const { |
423 | 2.19k | if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)784 ) |
424 | 1.40k | return false; |
425 | 784 | |
426 | 784 | const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); |
427 | 784 | |
428 | 784 | unsigned PhysReg = VRM->getPhys(Reg); |
429 | 784 | |
430 | 784 | if (Def && Def->isCopy()592 && Def->getOperand(1).getReg() == PhysReg226 ) |
431 | 188 | return false; |
432 | 596 | |
433 | 3.19k | for (auto U : MRI->use_nodbg_operands(Reg))596 { |
434 | 3.19k | if (U.isImplicit()) |
435 | 6 | return false; |
436 | 3.19k | const MachineInstr *UseInst = U.getParent(); |
437 | 3.19k | if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg51 ) |
438 | 3 | return false; |
439 | 3.19k | } |
440 | 596 | |
441 | 596 | const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); |
442 | 587 | if (TRI->hasVGPRs(RC)) |
443 | 575 | return true; |
444 | 12 | |
445 | 12 | unsigned Size = TRI->getRegSizeInBits(*RC); |
446 | 12 | if (Size > 32) |
447 | 5 | PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); |
448 | 12 | |
449 | 12 | return AMDGPU::SGPR_32RegClass.contains(PhysReg); |
450 | 12 | } |
451 | | |
452 | | unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask, |
453 | 587 | unsigned UsedBanks) const { |
454 | 587 | unsigned Size = countPopulation(Mask); |
455 | 587 | unsigned FreeBanks = 0; |
456 | 587 | unsigned Bank = findFirstSet(Mask); |
457 | 587 | |
458 | 587 | UsedBanks &= ~Mask; |
459 | 587 | |
460 | 587 | // Find free VGPR banks |
461 | 587 | if ((Mask & VGPR_BANK_MASK) && (Size < 575 NUM_VGPR_BANKS575 )) { |
462 | 2.48k | for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I1.98k ) { |
463 | 1.98k | if (Bank == I) |
464 | 496 | continue; |
465 | 1.48k | unsigned NewMask = ((1 << Size) - 1) << I; |
466 | 1.48k | NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; |
467 | 1.48k | if (!(UsedBanks & NewMask)) |
468 | 1.18k | FreeBanks |= 1 << I; |
469 | 1.48k | } |
470 | 496 | return FreeBanks; |
471 | 496 | } |
472 | 91 | |
473 | 91 | // Find free SGPR banks |
474 | 91 | // SGPR tuples must be aligned, so step is size in banks it |
475 | 91 | // crosses. |
476 | 91 | Bank -= SGPR_BANK_OFFSET; |
477 | 327 | for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size236 ) { |
478 | 236 | if (Bank == I) |
479 | 12 | continue; |
480 | 224 | unsigned NewMask = ((1 << Size) - 1) << I; |
481 | 224 | NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; |
482 | 224 | if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET))) |
483 | 166 | FreeBanks |= (1 << SGPR_BANK_OFFSET) << I; |
484 | 224 | } |
485 | 91 | |
486 | 91 | return FreeBanks; |
487 | 91 | } |
488 | | |
489 | | unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg, |
490 | | unsigned SubReg, |
491 | | unsigned Mask, |
492 | 2.19k | unsigned UsedBanks) const { |
493 | 2.19k | if (!isReassignable(Reg)) |
494 | 1.60k | return 0; |
495 | 587 | |
496 | 587 | unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); |
497 | 587 | |
498 | 587 | unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger(); |
499 | 587 | if (!(LM & 1) && (Mask & 42 VGPR_BANK_MASK42 )) { |
500 | 42 | unsigned Shift = countTrailingZeros(LM); |
501 | 42 | if (Shift >= NUM_VGPR_BANKS) |
502 | 42 | return 03 ; |
503 | 39 | unsigned VB = FreeBanks & VGPR_BANK_MASK; |
504 | 39 | FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & |
505 | 39 | VGPR_BANK_MASK; |
506 | 545 | } else if (!(LM & 3) && (Mask & 0 SGPR_BANK_MASK0 )) { |
507 | 0 | unsigned Shift = countTrailingZeros(LM) >> 1; |
508 | 0 | if (Shift >= NUM_SGPR_BANKS) |
509 | 0 | return 0; |
510 | 0 | unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; |
511 | 0 | FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) & |
512 | 0 | SGPR_BANK_SHIFTED_MASK; |
513 | 0 | FreeBanks <<= SGPR_BANK_OFFSET; |
514 | 0 | } |
515 | 587 | |
516 | 587 | LLVM_DEBUG584 (if (FreeBanks) { |
517 | 584 | dbgs() << "Potential reassignments of " << printReg(Reg, SubReg) |
518 | 584 | << " to banks: "; dumpFreeBanks(FreeBanks); |
519 | 584 | dbgs() << '\n'; }); |
520 | 584 | |
521 | 584 | return FreeBanks; |
522 | 587 | } |
523 | | |
524 | | void GCNRegBankReassign::collectCandidates(MachineInstr& MI, |
525 | | unsigned UsedBanks, |
526 | 29.2k | unsigned StallCycles) { |
527 | 29.2k | LLVM_DEBUG(MI.dump()); |
528 | 29.2k | |
529 | 29.2k | if (!StallCycles) |
530 | 28.3k | return; |
531 | 863 | |
532 | 863 | LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n'); |
533 | 863 | |
534 | 2.61k | for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I1.75k ) { |
535 | 5.55k | for (unsigned J = I + 1; J != E; ++J3.79k ) { |
536 | 3.79k | if (!(OperandMasks[I].Mask & OperandMasks[J].Mask)) |
537 | 2.69k | continue; |
538 | 1.09k | |
539 | 1.09k | unsigned Reg1 = OperandMasks[I].Reg; |
540 | 1.09k | unsigned Reg2 = OperandMasks[J].Reg; |
541 | 1.09k | unsigned SubReg1 = OperandMasks[I].SubReg; |
542 | 1.09k | unsigned SubReg2 = OperandMasks[J].SubReg; |
543 | 1.09k | unsigned Mask1 = OperandMasks[I].Mask; |
544 | 1.09k | unsigned Mask2 = OperandMasks[J].Mask; |
545 | 1.09k | unsigned Size1 = countPopulation(Mask1); |
546 | 1.09k | unsigned Size2 = countPopulation(Mask2); |
547 | 1.09k | |
548 | 1.09k | LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) << |
549 | 1.09k | " and " << printReg(Reg2, SubReg2) << '\n'); |
550 | 1.09k | |
551 | 1.09k | unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles); |
552 | 1.09k | Weight += MLI->getLoopDepth(MI.getParent()) * 10; |
553 | 1.09k | |
554 | 1.09k | LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n'); |
555 | 1.09k | |
556 | 1.09k | unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); |
557 | 1.09k | unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); |
558 | 1.09k | if (FreeBanks1) |
559 | 300 | Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight |
560 | 300 | + ((Size2 > Size1) ? 119 : 0281 ))); |
561 | 1.09k | if (FreeBanks2) |
562 | 234 | Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight |
563 | 234 | + ((Size1 > Size2) ? 161 : 0173 ))); |
564 | 1.09k | } |
565 | 1.75k | } |
566 | 863 | } |
567 | | |
568 | | unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, |
569 | | unsigned Reg, int Bank, |
570 | 1.21k | bool Collect) { |
571 | 1.21k | unsigned TotalStallCycles = 0; |
572 | 1.21k | unsigned UsedBanks = 0; |
573 | 1.21k | SmallSet<const MachineInstr *, 16> Visited; |
574 | 1.21k | |
575 | 2.50k | for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { |
576 | 2.50k | if (MI.isBundle()) |
577 | 16 | continue; |
578 | 2.49k | if (!Visited.insert(&MI).second) |
579 | 0 | continue; |
580 | 2.49k | unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank); |
581 | 2.49k | TotalStallCycles += StallCycles; |
582 | 2.49k | if (Collect) |
583 | 335 | collectCandidates(MI, UsedBanks, StallCycles); |
584 | 2.49k | } |
585 | 1.21k | |
586 | 1.21k | return TotalStallCycles; |
587 | 1.21k | } |
588 | | |
589 | | unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, |
590 | 287 | unsigned Bank) const { |
591 | 287 | const TargetRegisterClass *RC = MRI->getRegClass(LI.reg); |
592 | 287 | unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs199 |
593 | 287 | : MaxNumSGPRs88 ; |
594 | 287 | unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0199 |
595 | 287 | : AMDGPU::SGPR088 ); |
596 | 287 | |
597 | 23.5k | for (unsigned Reg : RC->getRegisters()) { |
598 | 23.5k | // Check occupancy limit. |
599 | 23.5k | if (TRI->isSubRegisterEq(Reg, MaxReg)) |
600 | 0 | break; |
601 | 23.5k | |
602 | 23.5k | if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank) |
603 | 22.6k | continue; |
604 | 898 | |
605 | 64.2k | for (unsigned I = 0; 898 CSRegs[I]; ++I63.3k ) |
606 | 63.3k | if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && |
607 | 63.3k | !LRM->isPhysRegUsed(CSRegs[I])86 ) |
608 | 16 | return AMDGPU::NoRegister; |
609 | 898 | |
610 | 898 | LLVM_DEBUG882 (dbgs() << "Trying register " << printReg(Reg) << '\n'); |
611 | 882 | |
612 | 882 | if (!LRM->checkInterference(LI, Reg)) |
613 | 192 | return Reg; |
614 | 882 | } |
615 | 287 | |
616 | 287 | return AMDGPU::NoRegister79 ; |
617 | 287 | } |
618 | | |
619 | 310 | unsigned GCNRegBankReassign::tryReassign(Candidate &C) { |
620 | 310 | if (!LIS->hasInterval(C.Reg)) |
621 | 0 | return 0; |
622 | 310 | |
623 | 310 | LiveInterval &LI = LIS->getInterval(C.Reg); |
624 | 310 | LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump(); |
625 | 310 | LI.dump()); |
626 | 310 | |
627 | 310 | // For each candidate bank walk all instructions in the range of live |
628 | 310 | // interval and check if replacing the register with one belonging to |
629 | 310 | // the candidate bank reduces conflicts. |
630 | 310 | |
631 | 310 | unsigned OrigStalls = computeStallCycles(C.Reg); |
632 | 310 | LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n'); |
633 | 310 | if (!OrigStalls) |
634 | 0 | return 0; |
635 | 310 | |
636 | 310 | struct BankStall { |
637 | 587 | BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; |
638 | 320 | bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; } |
639 | 310 | unsigned Bank; |
640 | 310 | unsigned Stalls; |
641 | 310 | }; |
642 | 310 | SmallVector<BankStall, 8> BankStalls; |
643 | 310 | |
644 | 4.03k | for (int Bank = 0; Bank < NUM_BANKS; ++Bank3.72k ) { |
645 | 3.72k | if (C.FreeBanks & (1 << Bank)) { |
646 | 708 | LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); |
647 | 708 | unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank); |
648 | 708 | if (Stalls < OrigStalls) { |
649 | 587 | LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " |
650 | 587 | << Stalls << '\n'); |
651 | 587 | BankStalls.push_back(BankStall((unsigned)Bank, Stalls)); |
652 | 587 | } |
653 | 708 | } |
654 | 3.72k | } |
655 | 310 | std::sort(BankStalls.begin(), BankStalls.end()); |
656 | 310 | |
657 | 310 | unsigned OrigReg = VRM->getPhys(C.Reg); |
658 | 310 | LRM->unassign(LI); |
659 | 405 | while (!BankStalls.empty()) { |
660 | 287 | BankStall BS = BankStalls.pop_back_val(); |
661 | 287 | unsigned Reg = scavengeReg(LI, BS.Bank); |
662 | 287 | if (Reg == AMDGPU::NoRegister) { |
663 | 95 | LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) |
664 | 95 | << '\n'); |
665 | 95 | continue; |
666 | 95 | } |
667 | 192 | LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg) |
668 | 192 | << (LRM->isPhysRegUsed(Reg) ? "" : " (new)") |
669 | 192 | << " in bank " << printBank(BS.Bank) << '\n'); |
670 | 192 | |
671 | 192 | LRM->assign(LI, Reg); |
672 | 192 | |
673 | 192 | LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n'); |
674 | 192 | |
675 | 192 | return OrigStalls - BS.Stalls; |
676 | 192 | } |
677 | 310 | LRM->assign(LI, OrigReg); |
678 | 118 | |
679 | 118 | return 0; |
680 | 310 | } |
681 | | |
682 | | unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, |
683 | 2.11k | bool Collect) { |
684 | 2.11k | unsigned TotalStallCycles = 0; |
685 | 2.11k | |
686 | 2.52k | for (MachineBasicBlock &MBB : MF) { |
687 | 2.52k | |
688 | 2.52k | LLVM_DEBUG(if (Collect) { |
689 | 2.52k | if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber(); |
690 | 2.52k | else dbgs() << MBB.getName(); dbgs() << ":\n"; |
691 | 2.52k | }); |
692 | 2.52k | |
693 | 28.9k | for (MachineInstr &MI : MBB.instrs()) { |
694 | 28.9k | if (MI.isBundle()) |
695 | 30 | continue; // we analyze the instructions inside the bundle individually |
696 | 28.8k | |
697 | 28.8k | unsigned UsedBanks = 0; |
698 | 28.8k | unsigned StallCycles = analyzeInst(MI, UsedBanks); |
699 | 28.8k | |
700 | 28.8k | if (Collect) |
701 | 28.8k | collectCandidates(MI, UsedBanks, StallCycles); |
702 | 28.8k | |
703 | 28.8k | TotalStallCycles += StallCycles; |
704 | 28.8k | } |
705 | 2.52k | |
706 | 2.52k | LLVM_DEBUG(if (Collect) { dbgs() << '\n'; }); |
707 | 2.52k | } |
708 | 2.11k | |
709 | 2.11k | return TotalStallCycles; |
710 | 2.11k | } |
711 | | |
712 | 192 | void GCNRegBankReassign::removeCandidates(unsigned Reg) { |
713 | 1.02k | Candidates.remove_if([Reg, this](const Candidate& C) { |
714 | 1.02k | return C.MI->readsRegister(Reg, TRI); |
715 | 1.02k | }); |
716 | 192 | } |
717 | | |
718 | | bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, |
719 | | unsigned OriginalCycles, |
720 | 0 | unsigned CyclesSaved) { |
721 | 0 | unsigned StallCycles = collectCandidates(MF, false); |
722 | 0 | LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles |
723 | 0 | << " stall cycles left\n"); |
724 | 0 | return StallCycles + CyclesSaved == OriginalCycles; |
725 | 0 | } |
726 | | |
727 | 25.2k | bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { |
728 | 25.2k | ST = &MF.getSubtarget<GCNSubtarget>(); |
729 | 25.2k | if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction())2.12k ) |
730 | 23.1k | return false; |
731 | 2.11k | |
732 | 2.11k | MRI = &MF.getRegInfo(); |
733 | 2.11k | TRI = ST->getRegisterInfo(); |
734 | 2.11k | MLI = &getAnalysis<MachineLoopInfo>(); |
735 | 2.11k | VRM = &getAnalysis<VirtRegMap>(); |
736 | 2.11k | LRM = &getAnalysis<LiveRegMatrix>(); |
737 | 2.11k | LIS = &getAnalysis<LiveIntervals>(); |
738 | 2.11k | |
739 | 2.11k | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
740 | 2.11k | unsigned Occupancy = MFI->getOccupancy(); |
741 | 2.11k | MaxNumVGPRs = ST->getMaxNumVGPRs(MF); |
742 | 2.11k | MaxNumSGPRs = ST->getMaxNumSGPRs(MF); |
743 | 2.11k | MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs); |
744 | 2.11k | MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs); |
745 | 2.11k | |
746 | 2.11k | CSRegs = MRI->getCalleeSavedRegs(); |
747 | 2.11k | |
748 | 2.11k | RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() + |
749 | 2.11k | TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1); |
750 | 2.11k | |
751 | 2.11k | LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName() |
752 | 2.11k | << '\n'); |
753 | 2.11k | |
754 | 2.11k | unsigned StallCycles = collectCandidates(MF); |
755 | 2.11k | NumStallsDetected += StallCycles; |
756 | 2.11k | |
757 | 2.11k | LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in " |
758 | 2.11k | "function " << MF.getName() << '\n'); |
759 | 2.11k | |
760 | 2.11k | Candidates.sort(); |
761 | 2.11k | |
762 | 2.11k | LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; |
763 | 2.11k | for (auto C : Candidates) C.dump(this); |
764 | 2.11k | dbgs() << "\n\n"); |
765 | 2.11k | |
766 | 2.11k | unsigned CyclesSaved = 0; |
767 | 2.42k | while (!Candidates.empty()) { |
768 | 310 | Candidate C = Candidates.back(); |
769 | 310 | unsigned LocalCyclesSaved = tryReassign(C); |
770 | 310 | CyclesSaved += LocalCyclesSaved; |
771 | 310 | |
772 | 310 | if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved)0 ) |
773 | 0 | report_fatal_error("RegBank reassign stall cycles verification failed."); |
774 | 310 | |
775 | 310 | Candidates.pop_back(); |
776 | 310 | if (LocalCyclesSaved) { |
777 | 192 | removeCandidates(C.Reg); |
778 | 192 | computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true); |
779 | 192 | Candidates.sort(); |
780 | 192 | |
781 | 192 | LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; |
782 | 192 | for (auto C : Candidates) |
783 | 192 | C.dump(this); |
784 | 192 | dbgs() << "\n\n"); |
785 | 192 | } |
786 | 310 | } |
787 | 2.11k | NumStallsRecovered += CyclesSaved; |
788 | 2.11k | |
789 | 2.11k | LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved |
790 | 2.11k | << " cycles saved in function " << MF.getName() << '\n'); |
791 | 2.11k | |
792 | 2.11k | Candidates.clear(); |
793 | 2.11k | |
794 | 2.11k | if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved)0 ) |
795 | 0 | report_fatal_error("RegBank reassign stall cycles verification failed."); |
796 | 2.11k | |
797 | 2.11k | RegsUsed.clear(); |
798 | 2.11k | |
799 | 2.11k | return CyclesSaved > 0; |
800 | 2.11k | } |