/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
Line | Count | Source (jump to first uncovered line) |
//===- SIInsertWaits.cpp - Insert Wait Instructions -----------------------===//
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | /// \file |
11 | | /// Insert wait instructions for memory reads and writes. |
12 | | /// |
13 | | /// Memory reads and writes are issued asynchronously, so we need to insert |
14 | | /// S_WAITCNT instructions when we want to access any of their results or |
15 | | /// overwrite any register that's used asynchronously. |
16 | | // |
17 | | //===----------------------------------------------------------------------===// |
18 | | |
19 | | #include "AMDGPU.h" |
20 | | #include "AMDGPUSubtarget.h" |
21 | | #include "SIDefines.h" |
22 | | #include "SIInstrInfo.h" |
23 | | #include "SIMachineFunctionInfo.h" |
24 | | #include "SIRegisterInfo.h" |
25 | | #include "Utils/AMDGPUBaseInfo.h" |
26 | | #include "llvm/ADT/SmallVector.h" |
27 | | #include "llvm/ADT/StringRef.h" |
28 | | #include "llvm/CodeGen/MachineBasicBlock.h" |
29 | | #include "llvm/CodeGen/MachineFunction.h" |
30 | | #include "llvm/CodeGen/MachineFunctionPass.h" |
31 | | #include "llvm/CodeGen/MachineInstr.h" |
32 | | #include "llvm/CodeGen/MachineInstrBuilder.h" |
33 | | #include "llvm/CodeGen/MachineOperand.h" |
34 | | #include "llvm/CodeGen/MachineRegisterInfo.h" |
35 | | #include "llvm/IR/DebugLoc.h" |
36 | | #include "llvm/MC/MCInstrDesc.h" |
37 | | #include "llvm/Pass.h" |
38 | | #include "llvm/Support/Debug.h" |
39 | | #include "llvm/Support/raw_ostream.h" |
40 | | #include <algorithm> |
41 | | #include <cassert> |
42 | | #include <cstdint> |
43 | | #include <cstring> |
44 | | #include <utility> |
45 | | |
46 | | #define DEBUG_TYPE "si-insert-waits" |
47 | | |
48 | | using namespace llvm; |
49 | | |
50 | | namespace { |
51 | | |
/// One variable for each of the hardware counters.
///
/// The union lets the three counters be addressed either by name
/// (Named.VM / Named.EXP / Named.LGKM) or uniformly by index
/// (Array[0] = VM, Array[1] = EXP, Array[2] = LGKM); the per-index
/// loops throughout this file rely on that layout.
using Counters = union {
  struct {
    unsigned VM;   // vm_cnt
    unsigned EXP;  // exp_cnt
    unsigned LGKM; // lgkm_cnt
  } Named;
  unsigned Array[3];
};

/// Rough classification of the last instruction issued, used to detect
/// back-to-back VMEM/SMEM clauses that must be broken with an S_NOP.
using InstType = enum {
  OTHER,
  SMEM,
  VMEM
};

// Counter snapshot per register slot; indexed by the register's hardware
// encoding value (see getRegInterval).
using RegCounters = Counters[512];
// Half-open range [first, second) of 32-bit register encoding slots.
using RegInterval = std::pair<unsigned, unsigned>;
70 | | |
71 | | class SIInsertWaits : public MachineFunctionPass { |
72 | | private: |
73 | | const SISubtarget *ST = nullptr; |
74 | | const SIInstrInfo *TII = nullptr; |
75 | | const SIRegisterInfo *TRI = nullptr; |
76 | | const MachineRegisterInfo *MRI; |
77 | | AMDGPU::IsaInfo::IsaVersion ISA; |
78 | | |
79 | | /// Constant zero value |
80 | | static const Counters ZeroCounts; |
81 | | |
82 | | /// Hardware limits |
83 | | Counters HardwareLimits; |
84 | | |
85 | | /// Counter values we have already waited on. |
86 | | Counters WaitedOn; |
87 | | |
88 | | /// Counter values that we must wait on before the next counter |
89 | | /// increase. |
90 | | Counters DelayedWaitOn; |
91 | | |
92 | | /// Counter values for last instruction issued. |
93 | | Counters LastIssued; |
94 | | |
95 | | /// Registers used by async instructions. |
96 | | RegCounters UsedRegs; |
97 | | |
98 | | /// Registers defined by async instructions. |
99 | | RegCounters DefinedRegs; |
100 | | |
101 | | /// Different export instruction types seen since last wait. |
102 | | unsigned ExpInstrTypesSeen = 0; |
103 | | |
104 | | /// Type of the last opcode. |
105 | | InstType LastOpcodeType; |
106 | | |
107 | | bool LastInstWritesM0; |
108 | | |
109 | | /// Whether or not we have flat operations outstanding. |
110 | | bool IsFlatOutstanding; |
111 | | |
112 | | /// Whether the machine function returns void |
113 | | bool ReturnsVoid; |
114 | | |
115 | | /// Whether the VCCZ bit is possibly corrupt |
116 | | bool VCCZCorrupt = false; |
117 | | |
118 | | /// Get increment/decrement amount for this instruction. |
119 | | Counters getHwCounts(MachineInstr &MI); |
120 | | |
121 | | /// Is operand relevant for async execution? |
122 | | bool isOpRelevant(MachineOperand &Op); |
123 | | |
124 | | /// Get register interval an operand affects. |
125 | | RegInterval getRegInterval(const TargetRegisterClass *RC, |
126 | | const MachineOperand &Reg) const; |
127 | | |
128 | | /// Handle instructions async components |
129 | | void pushInstruction(MachineBasicBlock &MBB, |
130 | | MachineBasicBlock::iterator I, |
131 | | const Counters& Increment); |
132 | | |
133 | | /// Insert the actual wait instruction |
134 | | bool insertWait(MachineBasicBlock &MBB, |
135 | | MachineBasicBlock::iterator I, |
136 | | const Counters &Counts); |
137 | | |
138 | | /// Handle existing wait instructions (from intrinsics) |
139 | | void handleExistingWait(MachineBasicBlock::iterator I); |
140 | | |
141 | | /// Do we need def2def checks? |
142 | | bool unorderedDefines(MachineInstr &MI); |
143 | | |
144 | | /// Resolve all operand dependencies to counter requirements |
145 | | Counters handleOperands(MachineInstr &MI); |
146 | | |
147 | | /// Insert S_NOP between an instruction writing M0 and S_SENDMSG. |
148 | | void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); |
149 | | |
150 | | /// Return true if there are LGKM instrucitons that haven't been waited on |
151 | | /// yet. |
152 | | bool hasOutstandingLGKM() const; |
153 | | |
154 | | public: |
155 | | static char ID; |
156 | | |
157 | 2 | SIInsertWaits() : MachineFunctionPass(ID) {} |
158 | | |
159 | | bool runOnMachineFunction(MachineFunction &MF) override; |
160 | | |
161 | 2 | StringRef getPassName() const override { |
162 | 2 | return "SI insert wait instructions"; |
163 | 2 | } |
164 | | |
165 | 2 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
166 | 2 | AU.setPreservesCFG(); |
167 | 2 | MachineFunctionPass::getAnalysisUsage(AU); |
168 | 2 | } |
169 | | }; |
170 | | |
171 | | } // end anonymous namespace |
172 | | |
// Register the pass with the LLVM pass registry under DEBUG_TYPE.
INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
                      "SI Insert Waits", false, false)
INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
                    "SI Insert Waits", false, false)

char SIInsertWaits::ID = 0;

// Stable handle used by the AMDGPU target to identify this pass in the
// pipeline.
char &llvm::SIInsertWaitsID = SIInsertWaits::ID;

/// Factory used by the AMDGPU target machine to create this pass.
FunctionPass *llvm::createSIInsertWaitsPass() {
  return new SIInsertWaits();
}

// All-zero counter tuple used to (re)initialize counter state.
const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
187 | | |
/// Return true if \p MI is a conditional branch on VCCZ
/// (S_CBRANCH_VCCNZ / S_CBRANCH_VCCZ) that actually reads the bit,
/// i.e. operand 1 (presumably the implicit VCC use) is not marked undef.
static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}
193 | | |
/// True while LGKM events have been issued that we have not yet waited on.
bool SIInsertWaits::hasOutstandingLGKM() const {
  return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
}
197 | | |
/// Compute how much \p MI increments each hardware counter when issued.
/// VM and EXP increment by at most 1; LGKM can increment by more for
/// wide SMRD loads (one event per 32-bit word? — here modeled as 2 for
/// >32-bit results, 1 otherwise).
Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
  uint64_t TSFlags = MI.getDesc().TSFlags;
  Counters Result = { { 0, 0, 0 } };

  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);

  // Only consider stores or EXP for EXP_CNT
  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();

  // LGKM may use larger values
  if (TSFlags & SIInstrFlags::LGKM_CNT) {

    if (TII->isSMRD(MI)) {

      if (MI.getNumOperands() != 0) {
        assert(MI.getOperand(0).isReg() &&
               "First LGKM operand must be a register!");

        // XXX - What if this is a write into a super register?
        const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
        unsigned Size = TRI->getRegSizeInBits(*RC);
        Result.Named.LGKM = Size > 32 ? 2 : 1;
      } else {
        // s_dcache_inv etc. do not have a destination register. Assume we
        // want a wait on these.
        // XXX - What is the right value?
        Result.Named.LGKM = 1;
      }
    } else {
      // DS
      Result.Named.LGKM = 1;
    }

  } else {
    Result.Named.LGKM = 0;
  }

  return Result;
}
237 | | |
/// Decide whether operand \p Op participates in asynchronous execution
/// and therefore must be tracked in UsedRegs/DefinedRegs: all allocatable
/// register defs, every register of an export, and the stored value of a
/// store instruction.
bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
  // Constants are always irrelevant
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
    return false;

  // Defines are always relevant
  if (Op.isDef())
    return true;

  // For exports all registers are relevant.
  // TODO: Skip undef/disabled registers.
  MachineInstr &MI = *Op.getParent();
  if (TII->isEXP(MI))
    return true;

  // For stores the stored value is also relevant
  if (!MI.getDesc().mayStore())
    return false;

  // Check if this operand is the value being stored.
  // Special case for DS/FLAT instructions, since the address
  // operand comes before the value operand and it may have
  // multiple data operands.

  if (TII->isDS(MI)) {
    MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
    if (Data0 && Op.isIdenticalTo(*Data0))
      return true;

    MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
    return Data1 && Op.isIdenticalTo(*Data1);
  }

  if (TII->isFLAT(MI)) {
    MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
    if (Data && Op.isIdenticalTo(*Data))
      return true;
  }

  // NOTE: This assumes that the value operand is before the
  // address operand, and that there is only one value operand.
  for (MachineInstr::mop_iterator I = MI.operands_begin(),
       E = MI.operands_end(); I != E; ++I) {

    if (I->isReg() && I->isUse())
      return Op.isIdenticalTo(*I);
  }

  return false;
}
288 | | |
289 | | RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, |
290 | 135 | const MachineOperand &Reg) const { |
291 | 135 | unsigned Size = TRI->getRegSizeInBits(*RC); |
292 | 135 | assert(Size >= 32); |
293 | 135 | |
294 | 135 | RegInterval Result; |
295 | 135 | Result.first = TRI->getEncodingValue(Reg.getReg()); |
296 | 135 | Result.second = Result.first + Size / 32; |
297 | 135 | |
298 | 135 | return Result; |
299 | 135 | } |
300 | | |
/// Account for instruction \p I being issued: bump LastIssued by
/// \p Increment, record the resulting counter snapshot for every relevant
/// register the instruction defines or uses, track clause/export state,
/// and note outstanding flat operations.
void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const Counters &Increment) {
  // Get the hardware counter increments and sum them up
  Counters Limit = ZeroCounts;
  unsigned Sum = 0;

  if (TII->mayAccessFlatAddressSpace(*I))
    IsFlatOutstanding = true;

  for (unsigned i = 0; i < 3; ++i) {
    LastIssued.Array[i] += Increment.Array[i];
    if (Increment.Array[i])
      Limit.Array[i] = LastIssued.Array[i];
    Sum += Increment.Array[i];
  }

  // If we don't increase anything then that's it
  if (Sum == 0) {
    LastOpcodeType = OTHER;
    return;
  }

  if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
    // or SMEM clause, respectively.
    //
    // The temporary workaround is to break the clauses with S_NOP.
    //
    // The proper solution would be to allocate registers such that all source
    // and destination registers don't overlap, e.g. this is illegal:
    //   r0 = load r2
    //   r2 = load r0
    if (LastOpcodeType == VMEM && Increment.Named.VM) {
      // Insert a NOP to break the clause.
      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
          .addImm(0);
      LastInstWritesM0 = false;
    }

    if (TII->isSMRD(*I))
      LastOpcodeType = SMEM;
    else if (Increment.Named.VM)
      LastOpcodeType = VMEM;
  }

  // Remember which export instruction types we have seen
  // (bit 1 = EXP, bit 2 = VM-write); both together mean EXP_CNT is
  // unordered (see insertWait).
  if (Increment.Named.EXP) {
    ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
  }

  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
    MachineOperand &Op = I->getOperand(i);
    if (!isOpRelevant(Op))
      continue;

    const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
    RegInterval Interval = getRegInterval(RC, Op);
    for (unsigned j = Interval.first; j < Interval.second; ++j) {

      // Remember which registers we define
      if (Op.isDef())
        DefinedRegs[j] = Limit;

      // and which one we are using
      if (Op.isUse())
        UsedRegs[j] = Limit;
    }
  }
}
371 | | |
/// Emit an S_WAITCNT before \p I if any of the \p Required counter values
/// have not already been waited on. Returns true if an instruction was
/// inserted. For ordered counters the wait can be partial (wait until only
/// LastIssued - Required events remain); unordered counters must drain to 0.
bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const Counters &Required) {
  // End of program? No need to wait on anything
  // A function not returning void needs to wait, because other bytecode will
  // be appended after it and we don't know what it will be.
  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
    return false;

  // Figure out if the async instructions execute in order
  bool Ordered[3];

  // VM_CNT is always ordered except when there are flat instructions, which
  // can return out of order.
  Ordered[0] = !IsFlatOutstanding;

  // EXP_CNT is unordered if we have both EXP & VM-writes
  Ordered[1] = ExpInstrTypesSeen == 3;

  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
  Ordered[2] = false;

  // The values we are going to put into the S_WAITCNT instruction
  Counters Counts = HardwareLimits;

  // Do we really need to wait?
  bool NeedWait = false;

  for (unsigned i = 0; i < 3; ++i) {
    if (Required.Array[i] <= WaitedOn.Array[i])
      continue;

    NeedWait = true;

    if (Ordered[i]) {
      unsigned Value = LastIssued.Array[i] - Required.Array[i];

      // Adjust the value to the real hardware possibilities.
      Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
    } else
      Counts.Array[i] = 0;

    // Remember on what we have waited on.
    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
  }

  if (!NeedWait)
    return false;

  // Reset EXP_CNT instruction types
  if (Counts.Named.EXP == 0)
    ExpInstrTypesSeen = 0;

  // Build the wait instruction
  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
          .addImm(AMDGPU::encodeWaitcnt(ISA,
                                        Counts.Named.VM,
                                        Counts.Named.EXP,
                                        Counts.Named.LGKM));

  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  IsFlatOutstanding = false;
  return true;
}
437 | | |
438 | | /// helper function for handleOperands |
439 | 282 | static void increaseCounters(Counters &Dst, const Counters &Src) { |
440 | 1.12k | for (unsigned i = 0; i < 3; ++i846 ) |
441 | 846 | Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); |
442 | 282 | } |
443 | | |
444 | | /// check whether any of the counters is non-zero |
445 | 91 | static bool countersNonZero(const Counters &Counter) { |
446 | 285 | for (unsigned i = 0; i < 3; ++i194 ) |
447 | 225 | if (Counter.Array[i]) |
448 | 31 | return true; |
449 | 91 | return false60 ; |
450 | 91 | } |
451 | | |
/// Record a pre-existing S_WAITCNT (e.g. from an intrinsic) at \p I.
/// Decode its immediate into per-counter values and fold the implied
/// wait targets into DelayedWaitOn, so the pass can honor it even though
/// the caller removes the original instruction.
void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
  assert(I->getOpcode() == AMDGPU::S_WAITCNT);

  unsigned Imm = I->getOperand(0).getImm();
  Counters Counts, WaitOn;

  Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
  Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
  Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);

  for (unsigned i = 0; i < 3; ++i) {
    // An S_WAITCNT immediate of N means "wait until at most N events are
    // outstanding", i.e. wait on counter value LastIssued - N (clamped at 0).
    if (Counts.Array[i] <= LastIssued.Array[i])
      WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
    else
      WaitOn.Array[i] = 0;
  }

  increaseCounters(DelayedWaitOn, WaitOn);
}
471 | | |
/// Resolve all register dependencies of \p MI into the counter values we
/// must wait on before issuing it: for each register slot it uses, wait on
/// outstanding defs of that slot; for each slot it defines, also wait on
/// outstanding uses (write-after-read) and defs (write-after-write).
Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
  Counters Result = ZeroCounts;

  // For each register affected by this instruction increase the result
  // sequence.
  //
  // TODO: We could probably just look at explicit operands if we removed VCC /
  // EXEC from SMRD dest reg classes.
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    MachineOperand &Op = MI.getOperand(i);
    if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
      continue;

    const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
    RegInterval Interval = getRegInterval(RC, Op);
    for (unsigned j = Interval.first; j < Interval.second; ++j) {
      if (Op.isDef()) {
        increaseCounters(Result, UsedRegs[j]);
        increaseCounters(Result, DefinedRegs[j]);
      }

      if (Op.isUse())
        increaseCounters(Result, DefinedRegs[j]);
    }
  }

  return Result;
}
500 | | |
/// On VI+, insert an "S_NOP 0" between an instruction writing M0 and an
/// S_SENDMSG/S_SENDMSGHALT (hardware workaround), and track in
/// LastInstWritesM0 whether \p I itself writes M0.
void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I) {
  if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
    return;

  // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
  if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG ||
                           I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
    LastInstWritesM0 = false;
    return;
  }

  // Set whether this instruction sets M0
  LastInstWritesM0 = false;

  unsigned NumOperands = I->getNumOperands();
  for (unsigned i = 0; i < NumOperands; i++) {
    const MachineOperand &Op = I->getOperand(i);

    if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
      LastInstWritesM0 = true;
  }
}
524 | | |
525 | | /// Return true if \p MBB has one successor immediately following, and is its |
526 | | /// only predecessor |
527 | 16 | static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) { |
528 | 16 | if (MBB.succ_size() != 1) |
529 | 8 | return false; |
530 | 8 | |
531 | 8 | const MachineBasicBlock *Succ = *MBB.succ_begin(); |
532 | 8 | return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ)4 ; |
533 | 8 | } |
534 | | |
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
// around other non-memory instructions.

/// Main driver: walk every instruction of \p MF, tracking outstanding
/// counter events, and insert S_WAITCNT instructions wherever an operand
/// dependency, barrier/sendmsg, block boundary, or hardware workaround
/// requires one. Also removes pre-existing S_WAITCNTs (folding them into
/// the tracked state), applies the SI/CI vccz-corruption workaround,
/// flushes the scalar cache before program end when scalar stores were
/// seen, and inserts a full wait at entry of non-entry functions.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
  bool Changes = false;

  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
  HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
  HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);

  // Reset all per-function state (the pass object is reused across
  // functions).
  WaitedOn = ZeroCounts;
  DelayedWaitOn = ZeroCounts;
  LastIssued = ZeroCounts;
  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  IsFlatOutstanding = false;
  ReturnsVoid = MFI->returnsVoid();

  memset(&UsedRegs, 0, sizeof(UsedRegs));
  memset(&DefinedRegs, 0, sizeof(DefinedRegs));

  SmallVector<MachineInstr *, 4> RemoveMI;
  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
        // vccz bit, so when we detect that an instruction may read from a
        // corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
        //    complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.

        if (TII->isSMRD(I->getOpcode())) {
          VCCZCorrupt = true;
        } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
          // FIXME: We only care about SMRD instructions here, not LDS or GDS.
          // Whenever we store a value in vcc, the correct value of vccz is
          // restored.
          VCCZCorrupt = false;
        }

        // Check if we need to apply the bug work-around
        if (VCCZCorrupt && readsVCCZ(*I)) {
          DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');

          // Wait on everything, not just LGKM.  vccz reads usually come from
          // terminators, and we always wait on everything at the end of the
          // block, so if we only wait on LGKM here, we might end up with
          // another s_waitcnt inserted right after this if there are non-LGKM
          // instructions still outstanding.
          insertWait(MBB, I, LastIssued);

          // Restore the vccz bit.  Any time a value is written to vcc, the vcc
          // bit is updated, so we can restore the bit by reading the value of
          // vcc and then writing it back to the register.
          BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
                  AMDGPU::VCC)
            .addReg(AMDGPU::VCC);
        }
      }

      // Record pre-existing, explicitly requested waits
      if (I->getOpcode() == AMDGPU::S_WAITCNT) {
        handleExistingWait(*I);
        RemoveMI.push_back(&*I);
        continue;
      }

      Counters Required;

      // Wait for everything before a barrier.
      //
      // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
      // but we also want to wait for any other outstanding transfers before
      // signalling other hardware blocks
      if ((I->getOpcode() == AMDGPU::S_BARRIER &&
               !ST->hasAutoWaitcntBeforeBarrier()) ||
           I->getOpcode() == AMDGPU::S_SENDMSG ||
           I->getOpcode() == AMDGPU::S_SENDMSGHALT)
        Required = LastIssued;
      else
        Required = handleOperands(*I);

      Counters Increment = getHwCounts(*I);

      if (countersNonZero(Required) || countersNonZero(Increment))
        increaseCounters(Required, DelayedWaitOn);

      Changes |= insertWait(MBB, I, Required);

      pushInstruction(MBB, I, Increment);
      handleSendMsg(MBB, I);

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }

    // Wait for everything at the end of the MBB. If there is only one
    // successor, we can defer this until the uses there.
    if (!hasTrivialSuccessor(MBB))
      Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores but an explicitly requested flush is probably very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
           I != E; ++I) {
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
          Changes = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  for (MachineInstr *I : RemoveMI)
    I->eraseFromParent();

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
      .addImm(0);

    Changes = true;
  }

  return Changes;
}