/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This file implements hazard recognizers for scheduling on GCN processors. |
10 | | // |
11 | | //===----------------------------------------------------------------------===// |
12 | | |
13 | | #include "GCNHazardRecognizer.h" |
14 | | #include "AMDGPUSubtarget.h" |
15 | | #include "SIDefines.h" |
16 | | #include "SIInstrInfo.h" |
17 | | #include "SIRegisterInfo.h" |
18 | | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
19 | | #include "Utils/AMDGPUBaseInfo.h" |
20 | | #include "llvm/ADT/iterator_range.h" |
21 | | #include "llvm/CodeGen/MachineFunction.h" |
22 | | #include "llvm/CodeGen/MachineInstr.h" |
23 | | #include "llvm/CodeGen/MachineInstrBuilder.h" |
24 | | #include "llvm/CodeGen/MachineOperand.h" |
25 | | #include "llvm/CodeGen/ScheduleDAG.h" |
26 | | #include "llvm/MC/MCInstrDesc.h" |
27 | | #include "llvm/Support/ErrorHandling.h" |
28 | | #include <algorithm> |
29 | | #include <cassert> |
30 | | #include <limits> |
31 | | #include <set> |
32 | | #include <vector> |
33 | | |
34 | | using namespace llvm; |
35 | | |
36 | | //===----------------------------------------------------------------------===// |
// Hazard Recognizer Implementation
38 | | //===----------------------------------------------------------------------===// |
39 | | |
// Construct a hazard recognizer for \p MF.
// The lookahead depth (how many prior instructions/wait-state slots we track)
// is larger when AGPRs are in use, since MAI hazards require looking further
// back than the ordinary data hazards do.
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  // NOTE(review): the constants below were garbled by the coverage rendering;
  // 18 (AGPRs used) vs 5 (otherwise) matches upstream LLVM — confirm against
  // the original source.
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
  TSchedModel.init(&ST);
}
52 | | |
// Scheduler callback: record the SUnit's instruction as issued this cycle.
// Simply forwards to the MachineInstr overload.
void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}
56 | | |
// Record \p MI as the instruction issued this cycle. AdvanceCycle() later
// folds it (plus its extra wait states) into the EmittedInstrs history.
void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}
60 | | |
61 | 859k | static bool isDivFMas(unsigned Opcode) { |
62 | 859k | return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64859k ; |
63 | 859k | } |
64 | | |
// True if \p Opcode is the scalar hardware-register read (s_getreg_b32).
static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}
68 | | |
69 | 858k | static bool isSSetReg(unsigned Opcode) { |
70 | 858k | return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32858k ; |
71 | 858k | } |
72 | | |
// True for the lane read/write instructions (v_readlane/v_writelane).
static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}
76 | | |
// True if \p Opcode is the return-from-exception instruction.
static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}
80 | | |
81 | 183k | static bool isSMovRel(unsigned Opcode) { |
82 | 183k | switch (Opcode) { |
83 | 183k | case AMDGPU::S_MOVRELS_B32: |
84 | 4 | case AMDGPU::S_MOVRELS_B64: |
85 | 4 | case AMDGPU::S_MOVRELD_B32: |
86 | 4 | case AMDGPU::S_MOVRELD_B64: |
87 | 4 | return true; |
88 | 183k | default: |
89 | 183k | return false; |
90 | 183k | } |
91 | 183k | } |
92 | | |
// True if \p MI is s_sendmsg/s_sendmsghalt, s_ttracedata, or any DS
// instruction operating on GDS — the instruction classes that read M0 on
// subtargets with the read-M0 send-msg hazard.
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    // Any other DS instruction: check its gds operand bit.
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}
118 | | |
// True if \p MI is one of the permlane cross-lane data-share instructions.
static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32;
}
124 | | |
// Extract the hardware-register ID from the simm16 operand of an
// s_getreg/s_setreg instruction (masking off the offset/size fields).
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}
130 | | |
// Scheduler query: classify the hazard (if any) of issuing \p SU now.
// Each check*Hazards helper returns the number of wait states still needed;
// any positive value means the scheduler must emit noops (NoopHazard).
// The order of checks mirrors PreEmitNoopsCommon.
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // Bundle headers carry no hazards themselves; members are checked when
  // the bundle is processed.
  if (MI->isBundle())
   return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return NoopHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return NoopHazard;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return NoopHazard;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return NoopHazard;

  // Subtargets without data-dependence hazards can skip everything below.
  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return NoopHazard;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return NoopHazard;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return NoopHazard;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return NoopHazard;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return NoopHazard;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return NoopHazard;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return NoopHazard;

  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return NoopHazard;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return NoopHazard;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return NoopHazard;

  if ((MI->mayLoad() || MI->mayStore()) && checkMAILdStHazards(MI) > 0)
    return NoopHazard;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return NoopHazard;

  if (checkAnyInstHazards(MI) > 0)
    return NoopHazard;

  return NoHazard;
}
199 | | |
200 | 26 | static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) { |
201 | 26 | BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) |
202 | 26 | .addImm(0); |
203 | 26 | } |
204 | | |
// Walk every instruction inside the bundle headed by CurrCycleInstr,
// computing required wait states, fixing hazards (in recognizer mode),
// inserting noops inside the bundle, and updating the emitted-instruction
// history as each member is "issued".
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode)
      fixHazards(CurrCycleInstr);

    for (unsigned i = 0; i < WaitStates; ++i)
      insertNoopInBundle(CurrCycleInstr, TII);

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}
230 | | |
// Scheduler-mode entry: how many noops must precede \p SU's instruction?
// Uses the EmittedInstrs history rather than walking the function.
unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
  IsHazardRecognizerMode = false;
  return PreEmitNoopsCommon(SU->getInstr());
}
235 | | |
236 | 434k | unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { |
237 | 434k | IsHazardRecognizerMode = true; |
238 | 434k | CurrCycleInstr = MI; |
239 | 434k | unsigned W = PreEmitNoopsCommon(MI); |
240 | 434k | fixHazards(MI); |
241 | 434k | CurrCycleInstr = nullptr; |
242 | 434k | return W; |
243 | 434k | } |
244 | | |
// Shared hazard-counting logic for both modes: return the number of wait
// states that must precede \p MI. Mutually exclusive instruction classes
// return early; overlapping ones accumulate via std::max. The check order
// mirrors getHazardType.
unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  // Bundle headers are handled member-by-member in processBundle().
  if (MI->isBundle())
    return 0;

  int WaitStates = std::max(0, checkAnyInstHazards(MI));

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  // Subtargets without data-dependence hazards can skip everything below.
  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
                                           isSMovRel(MI->getOpcode())))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (MI->mayLoad() || MI->mayStore())
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}
304 | | |
// Scheduler callback for an issued noop: it occupies one wait-state slot,
// recorded as a null entry in the history.
void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}
308 | | |
// Advance one issue cycle: fold CurrCycleInstr (and nulls for its extra
// wait states) into the bounded EmittedInstrs history.
void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr)
    return;

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
      CurrCycleInstr->isKill())
    return;

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}
347 | | |
// Bottom-up scheduling is not supported by this recognizer.
void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}
351 | | |
352 | | //===----------------------------------------------------------------------===// |
353 | | // Helper Functions |
354 | | //===----------------------------------------------------------------------===// |
355 | | |
// Predicate telling the backwards walk when to give up: returns true once
// enough wait states have elapsed that earlier instructions cannot matter.
typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
357 | | |
// Returns a minimum wait states since \p I walking all predecessors.
// Only scans until \p IsExpired does not return true.
// Can only be run in a hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineBasicBlock *MBB,
                              MachineBasicBlock::reverse_instr_iterator I,
                              int WaitStates,
                              IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  // Scan backwards through the current block first.
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(&*I))
      return WaitStates;

    // These contribute no wait states.
    if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    // max() signals "no hazard found within the window" to callers.
    if (IsExpired(&*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  // Block exhausted: recurse into predecessors and take the minimum,
  // using Visited to terminate on CFG cycles.
  int MinWaitStates = WaitStates;
  bool Found = false;
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    if (W == std::numeric_limits<int>::max())
      continue;

    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
    if (IsExpired(nullptr, MinWaitStates))
      return MinWaitStates;

    Found = true;
  }

  if (Found)
    return MinWaitStates;

  return std::numeric_limits<int>::max();
}
408 | | |
// Convenience entry point: start the backwards walk at the instruction
// just before \p MI, with a fresh visited set and zero accumulated states.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineInstr *MI,
                              IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}
417 | | |
// Number of wait states since the most recent instruction satisfying
// \p IsHazard, looking back at most \p Limit states; int max if none found.
// In recognizer mode this walks the CFG; in scheduler mode it scans the
// bounded EmittedInstrs history.
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    // A null slot represents one wait state with no instruction.
    if (MI) {
      if (IsHazard(MI))
        return WaitStates;

      // Inline asm contributes no wait states.
      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}
442 | | |
443 | | int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, |
444 | | IsHazardFn IsHazardDef, |
445 | 564k | int Limit) { |
446 | 564k | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
447 | 564k | |
448 | 1.50M | auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) { |
449 | 1.50M | return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI)412k ; |
450 | 1.50M | }; |
451 | 564k | |
452 | 564k | return getWaitStatesSince(IsHazardFn, Limit); |
453 | 564k | } |
454 | | |
455 | | int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, |
456 | 584 | int Limit) { |
457 | 1.06k | auto IsHazardFn = [IsHazard] (MachineInstr *MI) { |
458 | 1.06k | return isSSetReg(MI->getOpcode()) && IsHazard(MI)36 ; |
459 | 1.06k | }; |
460 | 584 | |
461 | 584 | return getWaitStatesSince(IsHazardFn, Limit); |
462 | 584 | } |
463 | | |
464 | | //===----------------------------------------------------------------------===// |
465 | | // No-op Hazard Detection |
466 | | //===----------------------------------------------------------------------===// |
467 | | |
468 | | static void addRegUnits(const SIRegisterInfo &TRI, |
469 | 88.5k | BitVector &BV, unsigned Reg) { |
470 | 333k | for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI245k ) |
471 | 245k | BV.set(*RUI); |
472 | 88.5k | } |
473 | | |
474 | | static void addRegsToSet(const SIRegisterInfo &TRI, |
475 | | iterator_range<MachineInstr::const_mop_iterator> Ops, |
476 | 57.8k | BitVector &Set) { |
477 | 172k | for (const MachineOperand &Op : Ops) { |
478 | 172k | if (Op.isReg()) |
479 | 88.5k | addRegUnits(TRI, Set, Op.getReg()); |
480 | 172k | } |
481 | 57.8k | } |
482 | | |
// Record \p MI's defs and uses in the running soft-clause tracking sets.
void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}
488 | | |
// Return 1 if adding \p MEM to the preceding run of same-kind memory
// instructions would create an invalid soft clause (def/use overlap, or a
// store joining a clause); 0 otherwise.
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clause are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situaion, we need to break the
  // clause by inserting a non SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD != SIInstrInfo::isSMRD(*MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
536 | | |
// Wait states needed before issuing the SMRD instruction \p SMRD:
// soft-clause breaks plus (on SI only) VALU/SALU-def-to-SGPR-read hazards.
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}
580 | | |
// Wait states needed before issuing the VMEM instruction \p VMEM:
// soft-clause breaks plus the VALU-def-to-SGPR-read hazard.
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU Instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  for (const MachineOperand &Use : VMEM->uses()) {
    // Only SGPR reads are affected; skip non-register and VGPR operands.
    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}
602 | | |
// Wait states needed before issuing the DPP instruction \p DPP:
// VGPR-read-after-any-write (2) and EXEC-write-by-VALU (5) hazards.
int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    // Any prior def of the VGPR counts, hence the always-true predicate.
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                              [](MachineInstr *) { return true; },
                              DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}
630 | | |
631 | 407 | int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { |
632 | 407 | const SIInstrInfo *TII = ST.getInstrInfo(); |
633 | 407 | |
634 | 407 | // v_div_fmas requires 4 wait states after a write to vcc from a VALU |
635 | 407 | // instruction. |
636 | 407 | const int DivFMasWaitStates = 4; |
637 | 1.38k | auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; |
638 | 407 | int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn, |
639 | 407 | DivFMasWaitStates); |
640 | 407 | |
641 | 407 | return DivFMasWaitStates - WaitStatesNeeded; |
642 | 407 | } |
643 | | |
644 | 325 | int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) { |
645 | 325 | const SIInstrInfo *TII = ST.getInstrInfo(); |
646 | 325 | unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr); |
647 | 325 | |
648 | 325 | const int GetRegWaitStates = 2; |
649 | 325 | auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) { |
650 | 16 | return GetRegHWReg == getHWReg(TII, *MI); |
651 | 16 | }; |
652 | 325 | int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates); |
653 | 325 | |
654 | 325 | return GetRegWaitStates - WaitStatesNeeded; |
655 | 325 | } |
656 | | |
657 | 255 | int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) { |
658 | 255 | const SIInstrInfo *TII = ST.getInstrInfo(); |
659 | 255 | unsigned HWReg = getHWReg(TII, *SetRegInstr); |
660 | 255 | |
661 | 255 | const int SetRegWaitStates = ST.getSetRegWaitStates(); |
662 | 255 | auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) { |
663 | 16 | return HWReg == getHWReg(TII, *MI); |
664 | 16 | }; |
665 | 255 | int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates); |
666 | 255 | return SetRegWaitStates - WaitStatesNeeded; |
667 | 255 | } |
668 | | |
// If \p MI is a wide (>64-bit data) VMEM/FLAT store whose store-data
// register could be clobbered by a following VALU write, return the operand
// index of that data register; otherwise return -1.
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}
717 | | |
int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                                const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction. Returns the wait states still needed for the single VGPR
  // def \p Def.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  // Only VGPR defs can clobber in-flight store data.
  if (!TRI->isVGPR(MRI, Def.getReg()))
    return WaitStatesNeeded;
  unsigned Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
    // A prior wide store is a hazard only if its data register overlaps Reg.
    int DataIdx = createsVALUHazard(*MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}
741 | | |
742 | 407k | int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { |
743 | 407k | // This checks for the hazard where VMEM instructions that store more than |
744 | 407k | // 8 bytes can have there store data over written by the next instruction. |
745 | 407k | if (!ST.has12DWordStoreHazard()) |
746 | 65.6k | return 0; |
747 | 341k | |
748 | 341k | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
749 | 341k | int WaitStatesNeeded = 0; |
750 | 341k | |
751 | 361k | for (const MachineOperand &Def : VALU->defs()) { |
752 | 361k | WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI)); |
753 | 361k | } |
754 | 341k | |
755 | 341k | return WaitStatesNeeded; |
756 | 341k | } |
757 | | |
758 | 3.29k | int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { |
759 | 3.29k | // This checks for hazards associated with inline asm statements. |
760 | 3.29k | // Since inline asms can contain just about anything, we use this |
761 | 3.29k | // to call/leverage other check*Hazard routines. Note that |
762 | 3.29k | // this function doesn't attempt to address all possible inline asm |
763 | 3.29k | // hazards (good luck), but is a collection of what has been |
764 | 3.29k | // problematic thus far. |
765 | 3.29k | |
766 | 3.29k | // see checkVALUHazards() |
767 | 3.29k | if (!ST.has12DWordStoreHazard()) |
768 | 652 | return 0; |
769 | 2.63k | |
770 | 2.63k | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
771 | 2.63k | int WaitStatesNeeded = 0; |
772 | 2.63k | |
773 | 2.63k | for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands(); |
774 | 13.3k | I != E; ++I10.6k ) { |
775 | 10.6k | const MachineOperand &Op = IA->getOperand(I); |
776 | 10.6k | if (Op.isReg() && Op.isDef()5.33k ) { |
777 | 3.69k | WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); |
778 | 3.69k | } |
779 | 10.6k | } |
780 | 2.63k | |
781 | 2.63k | return WaitStatesNeeded; |
782 | 2.63k | } |
783 | | |
784 | 187 | int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { |
785 | 187 | const SIInstrInfo *TII = ST.getInstrInfo(); |
786 | 187 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
787 | 187 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
788 | 187 | |
789 | 187 | const MachineOperand *LaneSelectOp = |
790 | 187 | TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1); |
791 | 187 | |
792 | 187 | if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg())70 ) |
793 | 117 | return 0; |
794 | 70 | |
795 | 70 | unsigned LaneSelectReg = LaneSelectOp->getReg(); |
796 | 158 | auto IsHazardFn = [TII] (MachineInstr *MI) { |
797 | 158 | return TII->isVALU(*MI); |
798 | 158 | }; |
799 | 70 | |
800 | 70 | const int RWLaneWaitStates = 4; |
801 | 70 | int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn, |
802 | 70 | RWLaneWaitStates); |
803 | 70 | return RWLaneWaitStates - WaitStatesSince; |
804 | 70 | } |
805 | | |
806 | 8 | int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { |
807 | 8 | if (!ST.hasRFEHazards()) |
808 | 4 | return 0; |
809 | 4 | |
810 | 4 | const SIInstrInfo *TII = ST.getInstrInfo(); |
811 | 4 | |
812 | 4 | const int RFEWaitStates = 1; |
813 | 4 | |
814 | 4 | auto IsHazardFn = [TII] (MachineInstr *MI) { |
815 | 4 | return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS; |
816 | 4 | }; |
817 | 4 | int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates); |
818 | 4 | return RFEWaitStates - WaitStatesNeeded; |
819 | 4 | } |
820 | | |
821 | 957k | int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) { |
822 | 957k | if (MI->isDebugInstr()) |
823 | 101 | return 0; |
824 | 957k | |
825 | 957k | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
826 | 957k | if (!ST.hasSMovFedHazard()) |
827 | 766k | return 0; |
828 | 190k | |
829 | 190k | // Check for any instruction reading an SGPR after a write from |
830 | 190k | // s_mov_fed_b32. |
831 | 190k | int MovFedWaitStates = 1; |
832 | 190k | int WaitStatesNeeded = 0; |
833 | 190k | |
834 | 764k | for (const MachineOperand &Use : MI->uses()) { |
835 | 764k | if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg())433k ) |
836 | 463k | continue; |
837 | 300k | auto IsHazardFn = [] (MachineInstr *MI) { |
838 | 284k | return MI->getOpcode() == AMDGPU::S_MOV_FED_B32; |
839 | 284k | }; |
840 | 300k | int WaitStatesNeededForUse = |
841 | 300k | MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn, |
842 | 300k | MovFedWaitStates); |
843 | 300k | WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); |
844 | 300k | } |
845 | 190k | |
846 | 190k | return WaitStatesNeeded; |
847 | 190k | } |
848 | | |
849 | 517 | int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { |
850 | 517 | const SIInstrInfo *TII = ST.getInstrInfo(); |
851 | 517 | const int SMovRelWaitStates = 1; |
852 | 517 | auto IsHazardFn = [TII] (MachineInstr *MI) { |
853 | 492 | return TII->isSALU(*MI); |
854 | 492 | }; |
855 | 517 | return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, |
856 | 517 | SMovRelWaitStates); |
857 | 517 | } |
858 | | |
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  // Run each fix* routine in turn; each inspects the instructions preceding
  // MI and, when its hazard applies, inserts a mitigating instruction
  // (v_nop, s_mov, s_waitcnt variants) immediately before MI.
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}
866 | | |
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  // Hazard: a v_permlane* too soon after a VOPC (v_cmpx-style) instruction
  // needs an intervening VALU. Returns true if a fix was inserted.
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVOPC(*MI);
  };

  // Any VALU except a v_nop (which SQ discards, see below) breaks the
  // hazard chain.
  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    unsigned Opc = MI->getOpcode();
    return SIInstrInfo::isVALU(*MI) &&
           Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 &&
           Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  unsigned Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}
903 | | |
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  // Hazard: a SALU/SMRD write following a VMEM/DS/FLAT access that reads the
  // same register. Mitigated by inserting a v_nop. Returns true if a fix
  // was inserted.
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Hazard if a preceding VMEM/DS/FLAT instruction uses any register that
  // this scalar instruction defines.
  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
        !SIInstrInfo::isFLAT(*I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  // A VALU or an s_waitcnt 0 in between mitigates the hazard.
  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    return MI && (SIInstrInfo::isVALU(*MI) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
                   !MI->getOperand(0).getImm()));
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  return true;
}
944 | | |
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  // Hazard: a VALU writing an SGPR (sdst, or vdst for v_readlane /
  // v_readfirstlane) while an in-flight SMEM reads the same register.
  // Mitigated by inserting "s_mov_b32 null, 0". Returns true if a fix was
  // inserted.
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  // v_readlane/v_readfirstlane write their SGPR result through vdst; all
  // other VALUs expose it as sdst.
  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    // No named SGPR dst; fall back to the first implicit SGPR def, if any.
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const unsigned SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
    if (MI) {
      if (TII->isSALU(*MI)) {
        switch (MI->getOpcode()) {
        case AMDGPU::S_SETVSKIP:
        case AMDGPU::S_VERSION:
        case AMDGPU::S_WAITCNT_VSCNT:
        case AMDGPU::S_WAITCNT_VMCNT:
        case AMDGPU::S_WAITCNT_EXPCNT:
          // These instructions cannot mitigate the hazard.
          return false;
        case AMDGPU::S_WAITCNT_LGKMCNT:
          // Reducing lgkmcnt count to 0 always mitigates the hazard.
          return (MI->getOperand(1).getImm() == 0) &&
                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
        case AMDGPU::S_WAITCNT: {
          const int64_t Imm = MI->getOperand(0).getImm();
          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
          return (Decoded.LgkmCnt == 0);
        }
        default:
          // SOPP instructions cannot mitigate the hazard.
          if (TII->isSOPP(*MI))
            return false;
          // At this point the SALU can be assumed to mitigate the hazard
          // because either:
          // (a) it is independent of the at risk SMEM (breaking chain),
          // or
          // (b) it is dependent on the SMEM, in which case an appropriate
          //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
          //     SMEM instruction.
          return true;
        }
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
    .addImm(0);
  return true;
}
1031 | | |
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  // Hazard: a VALU that writes EXEC (v_cmpx) too soon after a non-VALU read
  // of EXEC (write-after-read). Mitigated with s_waitcnt_depctr 0xfffe.
  // Returns true if a fix was inserted.
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  // Hazard if a preceding non-VALU instruction reads EXEC.
  auto IsHazardFn = [TRI] (MachineInstr *I) {
    if (SIInstrInfo::isVALU(*I))
      return false;
    return I->readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  // A VALU writing an SGPR (explicit sdst or implicit SGPR def), or an
  // s_waitcnt_depctr with all the relevant bits set, mitigates the hazard.
  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    if (SIInstrInfo::isVALU(*MI)) {
      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI->implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
    .addImm(0xfffe);
  return true;
}
1072 | | |
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  // Hazard: LDS access, branch, then VMEM access (or the reverse) without an
  // intervening "s_waitcnt_vscnt null, 0". Mitigated by inserting one.
  // Returns true if a fix was inserted.
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Classify an instruction: 1 = LDS (DS), 2 = VMEM/segment-specific FLAT,
  // 0 = neither. The hazard exists between the two non-zero classes.
  auto IsHazardInst = [] (const MachineInstr *MI) {
    if (SIInstrInfo::isDS(*MI))
      return 1;
    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(MI);
  if (!InstType)
    return false;

  // The outer search expires at another LDS/VMEM access or at an existing
  // "s_waitcnt_vscnt null, 0".
  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
    return I && (IsHazardInst(I) ||
                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                  !I->getOperand(1).getImm()));
  };

  // Hazard if a preceding branch is itself preceded (searching backwards
  // from the branch) by an access of the *other* class, with no mitigating
  // s_waitcnt_vscnt in between. Note the inner lambdas intentionally shadow
  // the outer IsHazardFn/IsExpiredFn names.
  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
    if (!I->isBranch())
      return false;

    // Inner hazard: an LDS/VMEM access of the opposite class.
    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    // Inner expiry: same-class access, or a "s_waitcnt_vscnt null, 0".
    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
      if (!I)
        return false;

      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I->getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
    .addImm(0);

  return true;
}
1134 | | |
1135 | 63.0k | int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { |
1136 | 63.0k | int NSAtoVMEMWaitStates = 1; |
1137 | 63.0k | |
1138 | 63.0k | if (!ST.hasNSAtoVMEMBug()) |
1139 | 0 | return 0; |
1140 | 63.0k | |
1141 | 63.0k | if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI)58.8k ) |
1142 | 58.6k | return 0; |
1143 | 4.39k | |
1144 | 4.39k | const SIInstrInfo *TII = ST.getInstrInfo(); |
1145 | 4.39k | const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset); |
1146 | 4.39k | if (!Offset || (Offset->getImm() & 6) == 03.85k ) |
1147 | 2.84k | return 0; |
1148 | 1.54k | |
1149 | 1.54k | auto IsHazardFn = [TII] (MachineInstr *I) { |
1150 | 1.51k | if (!SIInstrInfo::isMIMG(*I)) |
1151 | 1.51k | return false; |
1152 | 3 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode()); |
1153 | 3 | return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA && |
1154 | 3 | TII->getInstSizeInBytes(*I) >= 162 ; |
1155 | 3 | }; |
1156 | 1.54k | |
1157 | 1.54k | return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1); |
1158 | 1.54k | } |
1159 | | |
1160 | 925k | int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { |
1161 | 925k | int FPAtomicToDenormModeWaitStates = 3; |
1162 | 925k | |
1163 | 925k | if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) |
1164 | 925k | return 0; |
1165 | 32 | |
1166 | 32 | auto IsHazardFn = [] (MachineInstr *I) { |
1167 | 32 | if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I)) |
1168 | 2 | return false; |
1169 | 30 | return SIInstrInfo::isFPAtomic(*I); |
1170 | 30 | }; |
1171 | 32 | |
1172 | 32 | auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) { |
1173 | 2 | if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI)) |
1174 | 1 | return true; |
1175 | 1 | |
1176 | 1 | switch (MI->getOpcode()) { |
1177 | 1 | case AMDGPU::S_WAITCNT: |
1178 | 1 | case AMDGPU::S_WAITCNT_VSCNT: |
1179 | 1 | case AMDGPU::S_WAITCNT_VMCNT: |
1180 | 1 | case AMDGPU::S_WAITCNT_EXPCNT: |
1181 | 1 | case AMDGPU::S_WAITCNT_LGKMCNT: |
1182 | 1 | case AMDGPU::S_WAITCNT_IDLE: |
1183 | 1 | return true; |
1184 | 1 | default: |
1185 | 0 | break; |
1186 | 0 | } |
1187 | 0 | |
1188 | 0 | return false; |
1189 | 0 | }; |
1190 | 32 | |
1191 | 32 | |
1192 | 32 | return FPAtomicToDenormModeWaitStates - |
1193 | 32 | ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn); |
1194 | 32 | } |
1195 | | |
int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  // Compute the wait states an MAI instruction (MFMA, v_accvgpr_read,
  // v_accvgpr_write) needs after preceding VALU, MFMA, and accvgpr-write
  // instructions. The per-case constants below encode hardware latencies
  // keyed by the producing MFMA's size (4x4 / 16x16 / 32x32).
  assert(SIInstrInfo::isMAI(*MI));

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isVALU(*MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    // Wait after a VALU writes EXEC.
    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      // Wait after a VALU writes any VGPR this instruction reads.
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  // A "real" MFMA: MAI but not the accvgpr read/write pseudos.
  auto IsMFMAFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isMAI(*MI) &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
  };

  // For each AGPR operand, account for hazards against earlier MFMA defs
  // and earlier v_accvgpr_write defs of overlapping registers.
  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    // AGPR defs only matter for v_accvgpr_write; skip other defs.
    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    unsigned Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    // Matches an MFMA whose dst overlaps (but is not identical to) Reg;
    // records the producer's latency as a side effect for the switches
    // below.
    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
                              (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      unsigned DstReg = MI->getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
      // Latency 2/8/16 corresponds to 4x4 / 16x16 / 32x32 MFMA producers.
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    // Also account for a preceding v_accvgpr_write to an overlapping AGPR.
    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
        return false;
      unsigned DstReg = MI->getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  // v_accvgpr_write also hazards against an MFMA that reads its dst as srcC.
  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    unsigned DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    // Matches an MFMA whose srcC overlaps our dst; records its latency.
    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
                        (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
1364 | | |
int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  // Compute wait states a load/store needs when one of its VGPR uses was
  // recently written by v_accvgpr_read (directly, or by an accvgpr_read that
  // itself depends on a recent non-MAI VALU write).
  if (!ST.hasMAIInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    unsigned Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    // Wait after v_accvgpr_read writes this register.
    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    // Also hazard if a v_accvgpr_read wrote Reg and was itself preceded
    // (within 2 states) by a non-MAI VALU def of Reg.
    auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
        return false;
      auto IsVALUFn = [] (MachineInstr *MI) {
        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}