/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | /// \file |
10 | | /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential |
11 | | /// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA |
12 | | /// with sequential versions where possible. |
13 | | /// |
14 | | //===----------------------------------------------------------------------===// |
15 | | |
16 | | #include "AMDGPU.h" |
17 | | #include "AMDGPUSubtarget.h" |
18 | | #include "SIInstrInfo.h" |
19 | | #include "SIMachineFunctionInfo.h" |
20 | | #include "llvm/ADT/Statistic.h" |
21 | | #include "llvm/CodeGen/LiveInterval.h" |
22 | | #include "llvm/CodeGen/LiveIntervals.h" |
23 | | #include "llvm/CodeGen/LiveRegMatrix.h" |
24 | | #include "llvm/CodeGen/MachineFunctionPass.h" |
25 | | #include "llvm/CodeGen/VirtRegMap.h" |
26 | | #include "llvm/Support/MathExtras.h" |
27 | | #include <algorithm> |
28 | | |
29 | | using namespace llvm; |
30 | | |
#define DEBUG_TYPE "amdgpu-nsa-reassign"

// Pass statistics, emitted with -stats. NumNSAInstructions counts candidates
// found; NumNSAConverted counts those successfully made sequential.
STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");
37 | | |
38 | | namespace { |
39 | | |
/// Machine function pass that tries to re-allocate the address VGPRs of
/// GFX10 NSA image instructions into sequential registers so that the later
/// SIShrinkInstructions pass can use the shorter non-NSA encoding.
class GCNNSAReassign : public MachineFunctionPass {
public:
  static char ID;

  GCNNSAReassign() : MachineFunctionPass(ID) {
    initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN NSA Reassign"; }

  // Runs after register allocation: requires live intervals plus the virtual
  // register map and live register matrix, and edits assignments in place
  // while preserving all analyses.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addRequired<VirtRegMap>();
    AU.addRequired<LiveRegMatrix>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  // Classification of an instruction's NSA address operands. The enumerator
  // order matters: runOnMachineFunction compares with '<' against CONTIGUOUS.
  typedef enum {
    NOT_NSA,        // Not an NSA instruction
    FIXED,          // NSA which we cannot modify
    NON_CONTIGUOUS, // NSA with non-sequential address which we can try
                    // to optimize.
    CONTIGUOUS      // NSA with all sequential address registers
  } NSA_Status;

  // Cached per-function state, initialized at the top of runOnMachineFunction.
  const GCNSubtarget *ST;

  const MachineRegisterInfo *MRI;

  const SIRegisterInfo *TRI;

  VirtRegMap *VRM;

  LiveRegMatrix *LRM;

  LiveIntervals *LIS;

  // Upper bound on usable VGPRs (limited by both the function and occupancy).
  unsigned MaxNumVGPRs;

  // Null-terminated callee-saved register list; consulted by canAssign.
  const MCPhysReg *CSRegs;

  // Classify MI. With Fast set, skip the expensive per-operand legality
  // checks and only test whether the assigned physregs are sequential.
  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  // Try to place all Intervals at StartReg, StartReg+1, ... Returns true on
  // success; on failure the intervals are left unassigned for the caller to
  // restore.
  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  // Whether the NumRegs-wide range starting at StartReg is allocatable and
  // avoids unused callee-saved registers.
  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  // Search the VGPR file for a sequential range that can hold all Intervals.
  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};
94 | | |
95 | | } // End anonymous namespace. |
96 | | |
// Register the pass and declare its analysis dependencies with the legacy
// pass manager.
INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                    false, false)


char GCNNSAReassign::ID = 0;

char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
109 | | |
110 | | bool |
111 | | GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals, |
112 | 57 | unsigned StartReg) const { |
113 | 57 | unsigned NumRegs = Intervals.size(); |
114 | 57 | |
115 | 234 | for (unsigned N = 0; N < NumRegs; ++N177 ) |
116 | 177 | if (VRM->hasPhys(Intervals[N]->reg)) |
117 | 31 | LRM->unassign(*Intervals[N]); |
118 | 57 | |
119 | 89 | for (unsigned N = 0; N < NumRegs; ++N32 ) |
120 | 79 | if (LRM->checkInterference(*Intervals[N], StartReg + N)) |
121 | 47 | return false; |
122 | 57 | |
123 | 57 | for (unsigned N = 0; 10 N < NumRegs41 ; ++N31 ) |
124 | 31 | LRM->assign(*Intervals[N], StartReg + N); |
125 | 10 | |
126 | 10 | return true; |
127 | 57 | } |
128 | | |
129 | 57 | bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const { |
130 | 234 | for (unsigned N = 0; N < NumRegs; ++N177 ) { |
131 | 177 | unsigned Reg = StartReg + N; |
132 | 177 | if (!MRI->isAllocatable(Reg)) |
133 | 0 | return false; |
134 | 177 | |
135 | 177 | for (unsigned I = 0; CSRegs[I]; ++I0 ) |
136 | 0 | if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && |
137 | 0 | !LRM->isPhysRegUsed(CSRegs[I])) |
138 | 0 | return false; |
139 | 177 | } |
140 | 57 | |
141 | 57 | return true; |
142 | 57 | } |
143 | | |
144 | | bool |
145 | 10 | GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const { |
146 | 10 | unsigned NumRegs = Intervals.size(); |
147 | 10 | |
148 | 10 | if (NumRegs > MaxNumVGPRs) |
149 | 0 | return false; |
150 | 10 | unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0; |
151 | 10 | |
152 | 57 | for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg47 ) { |
153 | 57 | if (!canAssign(Reg, NumRegs)) |
154 | 0 | continue; |
155 | 57 | |
156 | 57 | if (tryAssignRegisters(Intervals, Reg)) |
157 | 10 | return true; |
158 | 57 | } |
159 | 10 | |
160 | 10 | return false0 ; |
161 | 10 | } |
162 | | |
// Classify MI: not an NSA instruction, an NSA we must not touch (FIXED), or
// an NSA whose address registers are/aren't already sequential. With Fast
// set, only the physreg sequencing is checked, not reassignability.
GCNNSAReassign::NSA_Status
GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return NSA_Status::NOT_NSA;

  int VAddr0Idx =
    AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

  unsigned VgprBase = 0;
  bool NSA = false;
  // Walk every address dword operand, checking each against the physreg the
  // first one was assigned to.
  for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    unsigned Reg = Op.getReg();
    // Physical or unallocated operands cannot be re-homed by this pass.
    if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    unsigned PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // Bail if address is not a VGPR32. That should be possible to extend the
      // optimization to work with subregs of a wider register tuples, but the
      // logic to find free registers will be much more complicated with much
      // less chances for success. That seems reasonable to assume that in most
      // cases a tuple is used because a vector variable contains different
      // parts of an address and it is either already consecutive or cannot
      // be reassigned if not. If needed it is better to rely on register
      // coalescer to process such address tuples.
      if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
        return NSA_Status::FIXED;

      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

      // A copy from the same physreg pins this vreg: moving it would just
      // reintroduce the copy, so treat as fixed.
      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      // Likewise a use that copies back into the same physreg.
      for (auto U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      // Without a live interval we cannot re-run interference checks.
      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

    // Sequential means the I-th operand sits exactly at VgprBase + I.
    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}
222 | | |
// Entry point: collect NSA candidates, then for each non-contiguous one try
// to scavenge a sequential VGPR range, rolling back on any failure.
bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  // NSA encoding only exists on GFX10+.
  if (ST->getGeneration() < GCNSubtarget::GFX10)
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervals>();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // Respect both the per-function VGPR budget and the occupancy target.
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

  // A candidate is (instruction, done-flag). The flag is true for
  // instructions that are already (or have become) contiguous.
  using Candidate = std::pair<const MachineInstr*, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    // An earlier reassignment may have made this instruction contiguous.
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // Already happens to be fixed.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    // Collect one live interval per address operand, remember the original
    // physregs for rollback, and track the combined slot-index span.
    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<unsigned, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      unsigned Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::find(Intervals, LI) != Intervals.end()) {
        // Same register used, unable to make sequential
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for(auto *LI : Intervals)
                 dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      // Scan candidates overlapping [MinInd, MaxInd) to see whether any
      // previously-contiguous one was broken by our reassignment.
      auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
                                [this](const Candidate &C, SlotIndex I) {
                                  return LIS->getInstructionIndex(*C.first) < I;
                                });
      for (auto E = Candidates.end(); Success && I != E &&
              LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

    if (!Success) {
      // Roll back: drop whatever partial assignment exists and restore the
      // original physregs recorded in OrigRegs.
      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        if (VRM->hasPhys(Intervals[I]->reg))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t ["
                 << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI)
                 << " : "
                 << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI)
                 << "]\n");
    Changed = true;
  }

  return Changed;
}