Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Line
Count
Source
1
//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file
10
/// Insert wait instructions for memory reads and writes.
11
///
12
/// Memory reads and writes are issued asynchronously, so we need to insert
13
/// S_WAITCNT instructions when we want to access any of their results or
14
/// overwrite any register that's used asynchronously.
15
///
16
/// TODO: This pass currently keeps one timeline per hardware counter. A more
17
/// finely-grained approach that keeps one timeline per event type could
18
/// sometimes get away with generating weaker s_waitcnt instructions. For
19
/// example, when both SMEM and LDS are in flight and we need to wait for
20
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21
/// but the pass will currently generate a conservative lgkmcnt(0) because
22
/// multiple event types are in flight.
23
//
24
//===----------------------------------------------------------------------===//
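Aside (not part of the original file): the per-counter timeline described in the header comment can be modeled with a tiny standalone sketch. All names below are hypothetical; it only illustrates why, with one in-order stream per counter, waiting for an operation with score S needs a count of UB - S.

#include <cstdint>
#include <iostream>

// Minimal model: operations issued on one counter complete in order, so the
// number still outstanding when we need the op with score "Score" is UB - Score.
struct CounterTimeline {
  uint32_t LB = 0; // all ops with score <= LB are known complete
  uint32_t UB = 0; // score handed to the most recently issued op
  uint32_t issue() { return ++UB; }
  uint32_t neededCount(uint32_t Score) const {
    return Score > LB ? UB - Score : UINT32_MAX; // UINT32_MAX: no wait needed
  }
};

int main() {
  CounterTimeline Lgkm;
  uint32_t First = Lgkm.issue(); // e.g. an LDS read whose result we use later
  Lgkm.issue();
  Lgkm.issue();
  // Two newer operations may still be outstanding, so lgkmcnt(2) is enough.
  std::cout << "s_waitcnt lgkmcnt(" << Lgkm.neededCount(First) << ")\n";
  return 0;
}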
25
26
#include "AMDGPU.h"
27
#include "AMDGPUSubtarget.h"
28
#include "SIDefines.h"
29
#include "SIInstrInfo.h"
30
#include "SIMachineFunctionInfo.h"
31
#include "SIRegisterInfo.h"
32
#include "Utils/AMDGPUBaseInfo.h"
33
#include "llvm/ADT/DenseMap.h"
34
#include "llvm/ADT/DenseSet.h"
35
#include "llvm/ADT/PostOrderIterator.h"
36
#include "llvm/ADT/STLExtras.h"
37
#include "llvm/ADT/SmallVector.h"
38
#include "llvm/CodeGen/MachineBasicBlock.h"
39
#include "llvm/CodeGen/MachineFunction.h"
40
#include "llvm/CodeGen/MachineFunctionPass.h"
41
#include "llvm/CodeGen/MachineInstr.h"
42
#include "llvm/CodeGen/MachineInstrBuilder.h"
43
#include "llvm/CodeGen/MachineMemOperand.h"
44
#include "llvm/CodeGen/MachineOperand.h"
45
#include "llvm/CodeGen/MachineRegisterInfo.h"
46
#include "llvm/IR/DebugLoc.h"
47
#include "llvm/Pass.h"
48
#include "llvm/Support/Debug.h"
49
#include "llvm/Support/DebugCounter.h"
50
#include "llvm/Support/ErrorHandling.h"
51
#include "llvm/Support/raw_ostream.h"
52
#include <algorithm>
53
#include <cassert>
54
#include <cstdint>
55
#include <cstring>
56
#include <memory>
57
#include <utility>
58
#include <vector>
59
60
using namespace llvm;
61
62
#define DEBUG_TYPE "si-insert-waitcnts"
63
64
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
65
              "Force emit s_waitcnt expcnt(0) instrs");
66
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
67
              "Force emit s_waitcnt lgkmcnt(0) instrs");
68
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
69
              "Force emit s_waitcnt vmcnt(0) instrs");
70
71
static cl::opt<bool> ForceEmitZeroFlag(
72
  "amdgpu-waitcnt-forcezero",
73
  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
74
  cl::init(false), cl::Hidden);
75
76
namespace {
77
78
template <typename EnumT>
79
class enum_iterator
80
    : public iterator_facade_base<enum_iterator<EnumT>,
81
                                  std::forward_iterator_tag, const EnumT> {
82
  EnumT Value;
83
public:
84
  enum_iterator() = default;
85
890k
  enum_iterator(EnumT Value) : Value(Value) {}
86
87
1.78M
  enum_iterator &operator++() {
88
1.78M
    Value = static_cast<EnumT>(Value + 1);
89
1.78M
    return *this;
90
1.78M
  }
91
92
2.22M
  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
93
94
1.78M
  EnumT operator*() const { return Value; }
95
};
96
97
// Class of object that encapsulates latest instruction counter score
98
// associated with the operand.  Used for determining whether
99
// s_waitcnt instruction needs to be emitted.
100
101
#define CNT_MASK(t) (1u << (t))
102
103
enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
104
105
445k
iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
106
445k
  return make_range(enum_iterator<InstCounterType>(VM_CNT),
107
445k
                    enum_iterator<InstCounterType>(NUM_INST_CNTS));
108
445k
}
109
110
using RegInterval = std::pair<signed, signed>;
111
112
struct {
113
  uint32_t VmcntMax;
114
  uint32_t ExpcntMax;
115
  uint32_t LgkmcntMax;
116
  uint32_t VscntMax;
117
  int32_t NumVGPRsMax;
118
  int32_t NumSGPRsMax;
119
} HardwareLimits;
120
121
struct {
122
  unsigned VGPR0;
123
  unsigned VGPRL;
124
  unsigned SGPR0;
125
  unsigned SGPRL;
126
} RegisterEncoding;
127
128
enum WaitEventType {
129
  VMEM_ACCESS,      // vector-memory read & write
130
  VMEM_READ_ACCESS, // vector-memory read
131
  VMEM_WRITE_ACCESS,// vector-memory write
132
  LDS_ACCESS,       // lds read & write
133
  GDS_ACCESS,       // gds read & write
134
  SQ_MESSAGE,       // send message
135
  SMEM_ACCESS,      // scalar-memory read & write
136
  EXP_GPR_LOCK,     // export holding on its data src
137
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
138
  EXP_POS_ACCESS,   // write to export position
139
  EXP_PARAM_ACCESS, // write to export parameter
140
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
141
  NUM_WAIT_EVENTS,
142
};
143
144
static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
145
  (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
146
  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
147
      (1 << SQ_MESSAGE),
148
  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
149
      (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
150
  (1 << VMEM_WRITE_ACCESS)
151
};
152
153
// The mapping is:
154
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
155
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
156
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
157
// We reserve a fixed number of VGPR slots in the scoring tables for
158
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
159
enum RegisterMapping {
160
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
161
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
162
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
163
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
164
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
165
};
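Aside (illustrative only): the slot layout described above can be restated as a small hypothetical helper; the constants mirror the enum, but nothing here is part of the pass.

#include <cassert>

// Hypothetical mapping from an encoded register number to its scoreboard slot,
// following the layout in the comment above: VGPRs first, then one extra
// VGPR-like slot for LDS, then SGPRs.
constexpr int kMaxVGPRs = 256, kExtraVGPRs = 1;
constexpr int kNumAllVGPRs = kMaxVGPRs + kExtraVGPRs;

int scoreboardSlot(bool IsVGPR, int EncodedRegNo) {
  assert(EncodedRegNo >= 0 && EncodedRegNo < kMaxVGPRs);
  return IsVGPR ? EncodedRegNo : kNumAllVGPRs + EncodedRegNo;
}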
166
167
114k
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
168
114k
  switch (T) {
169
114k
  case VM_CNT:
170
59.9k
    Wait.VmCnt = std::min(Wait.VmCnt, Count);
171
59.9k
    break;
172
114k
  case EXP_CNT:
173
4.35k
    Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
174
4.35k
    break;
175
114k
  case LGKM_CNT:
176
49.7k
    Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
177
49.7k
    break;
178
114k
  case VS_CNT:
179
0
    Wait.VsCnt = std::min(Wait.VsCnt, Count);
180
0
    break;
181
114k
  default:
182
0
    llvm_unreachable("bad InstCounterType");
183
114k
  }
184
114k
}
185
186
// This object maintains the current score brackets of each wait counter, and
187
// a per-register scoreboard for each wait counter.
188
//
189
// We also maintain the latest score for every event type that can change the
190
// waitcnt in order to know if there are multiple types of events within
191
// the brackets. When multiple types of events happen in the bracket,
192
// wait counts may get decremented out of order; therefore, we need to put in
193
// "s_waitcnt 0" before use.
194
class WaitcntBrackets {
195
public:
196
25.4k
  WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
197
25.4k
    for (auto T : inst_counter_types())
198
101k
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
199
25.4k
  }
200
201
11.0k
  static uint32_t getWaitCountMax(InstCounterType T) {
202
11.0k
    switch (T) {
203
11.0k
    case VM_CNT:
204
0
      return HardwareLimits.VmcntMax;
205
11.0k
    case LGKM_CNT:
206
0
      return HardwareLimits.LgkmcntMax;
207
11.0k
    case EXP_CNT:
208
11.0k
      return HardwareLimits.ExpcntMax;
209
11.0k
    case VS_CNT:
210
0
      return HardwareLimits.VscntMax;
211
11.0k
    default:
212
0
      break;
213
0
    }
214
0
    return 0;
215
0
  }
216
217
5.03M
  uint32_t getScoreLB(InstCounterType T) const {
218
5.03M
    assert(T < NUM_INST_CNTS);
219
5.03M
    if (T >= NUM_INST_CNTS)
220
0
      return 0;
221
5.03M
    return ScoreLBs[T];
222
5.03M
  }
223
224
5.35M
  uint32_t getScoreUB(InstCounterType T) const {
225
5.35M
    assert(T < NUM_INST_CNTS);
226
5.35M
    if (T >= NUM_INST_CNTS)
227
0
      return 0;
228
5.35M
    return ScoreUBs[T];
229
5.35M
  }
230
231
  // Mapping from event to counter.
232
127k
  InstCounterType eventCounter(WaitEventType E) {
233
127k
    if (WaitEventMaskForInst[VM_CNT] & (1 << E))
234
58.8k
      return VM_CNT;
235
68.8k
    if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
236
55.4k
      return LGKM_CNT;
237
13.4k
    if (WaitEventMaskForInst[VS_CNT] & (1 << E))
238
2.41k
      return VS_CNT;
239
11.0k
    assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
240
11.0k
    return EXP_CNT;
241
11.0k
  }
242
243
3.46M
  uint32_t getRegScore(int GprNo, InstCounterType T) {
244
3.46M
    if (GprNo < NUM_ALL_VGPRS) {
245
1.89M
      return VgprScores[T][GprNo];
246
1.89M
    }
247
1.57M
    assert(T == LGKM_CNT);
248
1.57M
    return SgprScores[GprNo - NUM_ALL_VGPRS];
249
1.57M
  }
250
251
1.80k
  void clear() {
252
1.80k
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
253
1.80k
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
254
1.80k
    PendingEvents = 0;
255
1.80k
    memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
256
1.80k
    for (auto T : inst_counter_types())
257
7.22k
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
258
1.80k
    memset(SgprScores, 0, sizeof(SgprScores));
259
1.80k
  }
260
261
  bool merge(const WaitcntBrackets &Other);
262
263
  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
264
                             const MachineRegisterInfo *MRI,
265
                             const SIRegisterInfo *TRI, unsigned OpNo,
266
                             bool Def) const;
267
268
6.88k
  int32_t getMaxVGPR() const { return VgprUB; }
269
2.75k
  int32_t getMaxSGPR() const { return SgprUB; }
270
271
  bool counterOutOfOrder(InstCounterType T) const;
272
  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
273
  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
274
  void determineWait(InstCounterType T, uint32_t ScoreToWait,
275
                     AMDGPU::Waitcnt &Wait) const;
276
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
277
  void applyWaitcnt(InstCounterType T, unsigned Count);
278
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
279
                     const MachineRegisterInfo *MRI, WaitEventType E,
280
                     MachineInstr &MI);
281
282
29.0k
  bool hasPending() const { return PendingEvents != 0; }
283
195k
  bool hasPendingEvent(WaitEventType E) const {
284
195k
    return PendingEvents & (1 << E);
285
195k
  }
286
287
109k
  bool hasPendingFlat() const {
288
109k
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
289
109k
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
290
109k
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
291
107k
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
292
109k
  }
293
294
3.39k
  void setPendingFlat() {
295
3.39k
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
296
3.39k
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
297
3.39k
  }
298
299
  void print(raw_ostream &);
300
0
  void dump() { print(dbgs()); }
301
302
private:
303
  struct MergeInfo {
304
    uint32_t OldLB;
305
    uint32_t OtherLB;
306
    uint32_t MyShift;
307
    uint32_t OtherShift;
308
  };
309
  static bool mergeScore(const MergeInfo &M, uint32_t &Score,
310
                         uint32_t OtherScore);
311
312
52.1k
  void setScoreLB(InstCounterType T, uint32_t Val) {
313
52.1k
    assert(T < NUM_INST_CNTS);
314
52.1k
    if (T >= NUM_INST_CNTS)
315
0
      return;
316
52.1k
    ScoreLBs[T] = Val;
317
52.1k
  }
318
319
127k
  void setScoreUB(InstCounterType T, uint32_t Val) {
320
127k
    assert(T < NUM_INST_CNTS);
321
127k
    if (T >= NUM_INST_CNTS)
322
0
      return;
323
127k
    ScoreUBs[T] = Val;
324
127k
    if (T == EXP_CNT) {
325
11.0k
      uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
326
11.0k
      if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
327
1.10k
        ScoreLBs[T] = UB;
328
11.0k
    }
329
127k
  }
330
331
181k
  void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
332
181k
    if (GprNo < NUM_ALL_VGPRS) {
333
96.4k
      if (GprNo > VgprUB) {
334
30.9k
        VgprUB = GprNo;
335
30.9k
      }
336
96.4k
      VgprScores[T][GprNo] = Val;
337
96.4k
    } else {
338
84.5k
      assert(T == LGKM_CNT);
339
84.5k
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
340
55.5k
        SgprUB = GprNo - NUM_ALL_VGPRS;
341
55.5k
      }
342
84.5k
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
343
84.5k
    }
344
181k
  }
345
346
  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
347
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
348
                   unsigned OpNo, uint32_t Val);
349
350
  const GCNSubtarget *ST = nullptr;
351
  uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
352
  uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
353
  uint32_t PendingEvents = 0;
354
  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
355
  // Remember the last flat memory operation.
356
  uint32_t LastFlat[NUM_INST_CNTS] = {0};
357
  // wait_cnt scores for every vgpr.
358
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
359
  int32_t VgprUB = 0;
360
  int32_t SgprUB = 0;
361
  uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
362
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
363
  uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
364
};
365
366
class SIInsertWaitcnts : public MachineFunctionPass {
367
private:
368
  const GCNSubtarget *ST = nullptr;
369
  const SIInstrInfo *TII = nullptr;
370
  const SIRegisterInfo *TRI = nullptr;
371
  const MachineRegisterInfo *MRI = nullptr;
372
  AMDGPU::IsaVersion IV;
373
374
  DenseSet<MachineInstr *> TrackedWaitcntSet;
375
  DenseSet<MachineInstr *> VCCZBugHandledSet;
376
377
  struct BlockInfo {
378
    MachineBasicBlock *MBB;
379
    std::unique_ptr<WaitcntBrackets> Incoming;
380
    bool Dirty = true;
381
382
28.8k
    explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
383
  };
384
385
  std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
386
  DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;
387
388
  // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
389
  // because of amdgpu-waitcnt-forcezero flag
390
  bool ForceEmitZeroWaitcnts;
391
  bool ForceEmitWaitcnt[NUM_INST_CNTS];
392
393
public:
394
  static char ID;
395
396
2.45k
  SIInsertWaitcnts() : MachineFunctionPass(ID) {
397
2.45k
    (void)ForceExpCounter;
398
2.45k
    (void)ForceLgkmCounter;
399
2.45k
    (void)ForceVMCounter;
400
2.45k
  }
401
402
  bool runOnMachineFunction(MachineFunction &MF) override;
403
404
27.8k
  StringRef getPassName() const override {
405
27.8k
    return "SI insert wait instructions";
406
27.8k
  }
407
408
2.42k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
409
2.42k
    AU.setPreservesCFG();
410
2.42k
    MachineFunctionPass::getAnalysisUsage(AU);
411
2.42k
  }
412
413
391k
  bool isForceEmitWaitcnt() const {
414
391k
    for (auto T : inst_counter_types())
415
1.56M
      if (ForceEmitWaitcnt[T])
416
0
        return true;
417
391k
    return false;
418
391k
  }
419
420
391k
  void setForceEmitWaitcnt() {
421
391k
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
422
391k
// For debug builds, get the debug counter info and adjust if need be
423
#ifndef NDEBUG
424
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
425
        DebugCounter::shouldExecute(ForceExpCounter)) {
426
      ForceEmitWaitcnt[EXP_CNT] = true;
427
    } else {
428
      ForceEmitWaitcnt[EXP_CNT] = false;
429
    }
430
431
    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
432
         DebugCounter::shouldExecute(ForceLgkmCounter)) {
433
      ForceEmitWaitcnt[LGKM_CNT] = true;
434
    } else {
435
      ForceEmitWaitcnt[LGKM_CNT] = false;
436
    }
437
438
    if (DebugCounter::isCounterSet(ForceVMCounter) &&
439
        DebugCounter::shouldExecute(ForceVMCounter)) {
440
      ForceEmitWaitcnt[VM_CNT] = true;
441
    } else {
442
      ForceEmitWaitcnt[VM_CNT] = false;
443
    }
444
#endif // NDEBUG
445
  }
446
447
  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
448
  bool generateWaitcntInstBefore(MachineInstr &MI,
449
                                 WaitcntBrackets &ScoreBrackets,
450
                                 MachineInstr *OldWaitcntInstr);
451
  void updateEventWaitcntAfter(MachineInstr &Inst,
452
                               WaitcntBrackets *ScoreBrackets);
453
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
454
                            WaitcntBrackets &ScoreBrackets);
455
};
456
457
} // end anonymous namespace
458
459
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
460
                                            const SIInstrInfo *TII,
461
                                            const MachineRegisterInfo *MRI,
462
                                            const SIRegisterInfo *TRI,
463
4.52M
                                            unsigned OpNo, bool Def) const {
464
4.52M
  const MachineOperand &Op = MI->getOperand(OpNo);
465
4.52M
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
466
4.52M
      (Def && !Op.isDef()) || TRI->isAGPR(*MRI, Op.getReg()))
467
2.96M
    return {-1, -1};
468
1.55M
469
1.55M
  // A use via a PW operand does not need a waitcnt.
470
1.55M
  // A partial write is not a WAW.
471
1.55M
  assert(!Op.getSubReg() || !Op.isUndef());
472
1.55M
473
1.55M
  RegInterval Result;
474
1.55M
  const MachineRegisterInfo &MRIA = *MRI;
475
1.55M
476
1.55M
  unsigned Reg = TRI->getEncodingValue(Op.getReg());
477
1.55M
478
1.55M
  if (TRI->isVGPR(MRIA, Op.getReg())) {
479
663k
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
480
663k
    Result.first = Reg - RegisterEncoding.VGPR0;
481
663k
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
482
893k
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
483
893k
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
484
893k
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
485
893k
    assert(Result.first >= NUM_ALL_VGPRS &&
486
893k
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
487
893k
  }
488
0
  // TODO: Handle TTMP
489
0
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
490
0
  else
491
0
    return {-1, -1};
492
1.55M
493
1.55M
  const MachineInstr &MIA = *MI;
494
1.55M
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
495
1.55M
  unsigned Size = TRI->getRegSizeInBits(*RC);
496
1.55M
  Result.second = Result.first + (Size / 32);
497
1.55M
498
1.55M
  return Result;
499
1.55M
}
500
501
void WaitcntBrackets::setExpScore(const MachineInstr *MI,
502
                                  const SIInstrInfo *TII,
503
                                  const SIRegisterInfo *TRI,
504
                                  const MachineRegisterInfo *MRI, unsigned OpNo,
505
12.7k
                                  uint32_t Val) {
506
12.7k
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
507
12.7k
  LLVM_DEBUG({
508
12.7k
    const MachineOperand &Opnd = MI->getOperand(OpNo);
509
12.7k
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
510
12.7k
  });
511
33.8k
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
512
21.1k
    setRegScore(RegNo, EXP_CNT, Val);
513
21.1k
  }
514
12.7k
}
515
516
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
517
                                    const SIRegisterInfo *TRI,
518
                                    const MachineRegisterInfo *MRI,
519
127k
                                    WaitEventType E, MachineInstr &Inst) {
520
127k
  const MachineRegisterInfo &MRIA = *MRI;
521
127k
  InstCounterType T = eventCounter(E);
522
127k
  uint32_t CurrScore = getScoreUB(T) + 1;
523
127k
  if (CurrScore == 0)
524
0
    report_fatal_error("InsertWaitcnt score wraparound");
525
127k
  // PendingEvents and ScoreUB need to be update regardless if this event
526
127k
  // changes the score of a register or not.
527
127k
  // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
528
127k
  if (!hasPendingEvent(E)) {
529
73.8k
    if (PendingEvents & WaitEventMaskForInst[T])
530
256
      MixedPendingEvents[T] = true;
531
73.8k
    PendingEvents |= 1 << E;
532
73.8k
  }
533
127k
  setScoreUB(T, CurrScore);
534
127k
535
127k
  if (T == EXP_CNT) {
536
11.0k
    // Put score on the source vgprs. If this is a store, just use those
537
11.0k
    // specific register(s).
538
11.0k
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
539
261
      int AddrOpIdx =
540
261
          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
541
261
      // All GDS operations must protect their address register (same as
542
261
      // export.)
543
261
      if (AddrOpIdx != -1) {
544
90
        setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
545
90
      }
546
261
547
261
      if (Inst.mayStore()) {
548
261
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
549
261
                                       AMDGPU::OpName::data0) != -1) {
550
180
          setExpScore(
551
180
              &Inst, TII, TRI, MRI,
552
180
              AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
553
180
              CurrScore);
554
180
        }
555
261
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
556
261
                                       AMDGPU::OpName::data1) != -1) {
557
4
          setExpScore(&Inst, TII, TRI, MRI,
558
4
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
559
4
                                                 AMDGPU::OpName::data1),
560
4
                      CurrScore);
561
4
        }
562
261
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
563
0
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
564
0
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
565
0
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
566
0
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
567
0
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
568
0
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
569
0
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
570
0
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
571
0
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
572
0
          const MachineOperand &Op = Inst.getOperand(I);
573
0
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
574
0
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
575
0
          }
576
0
        }
577
0
      }
578
10.7k
    } else if (TII->isFLAT(Inst)) {
579
0
      if (Inst.mayStore()) {
580
0
        setExpScore(
581
0
            &Inst, TII, TRI, MRI,
582
0
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
583
0
            CurrScore);
584
0
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
585
0
        setExpScore(
586
0
            &Inst, TII, TRI, MRI,
587
0
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
588
0
            CurrScore);
589
0
      }
590
10.7k
    } else if (TII->isMIMG(Inst)) {
591
55
      if (Inst.mayStore()) {
592
55
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
593
55
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
594
0
        setExpScore(
595
0
            &Inst, TII, TRI, MRI,
596
0
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
597
0
            CurrScore);
598
0
      }
599
10.7k
    } else if (TII->isMTBUF(Inst)) {
600
39
      if (Inst.mayStore()) {
601
39
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
602
39
      }
603
10.6k
    } else if (TII->isMUBUF(Inst)) {
604
10.1k
      if (Inst.mayStore()) {
605
10.1k
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
606
10.1k
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
607
0
        setExpScore(
608
0
            &Inst, TII, TRI, MRI,
609
0
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
610
0
            CurrScore);
611
0
      }
612
10.1k
    } else {
613
553
      if (TII->isEXP(Inst)) {
614
553
        // For export the destination registers are really temps that
615
553
        // can be used as the actual source after export patching, so
616
553
        // we need to treat them like sources and set the EXP_CNT
617
553
        // score.
618
5.53k
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
619
4.97k
          MachineOperand &DefMO = Inst.getOperand(I);
620
4.97k
          if (DefMO.isReg() && DefMO.isDef() &&
621
4.97k
              TRI->isVGPR(MRIA, DefMO.getReg())) {
622
0
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
623
0
                        CurrScore);
624
0
          }
625
4.97k
        }
626
553
      }
627
5.53k
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
628
4.97k
        MachineOperand &MO = Inst.getOperand(I);
629
4.97k
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
630
2.21k
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
631
2.21k
        }
632
4.97k
      }
633
553
    }
634
#if 0 // TODO: check if this is handled by MUBUF code above.
635
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
636
       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
637
       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
638
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
639
    unsigned OpNo;//TODO: find the OpNo for this operand;
640
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
641
    for (signed RegNo = Interval.first; RegNo < Interval.second;
642
    ++RegNo) {
643
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
644
    }
645
#endif
646
116k
  } else {
647
116k
    // Match the score to the destination registers.
648
985k
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
649
868k
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
650
868k
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
651
0
        continue;
652
1.02M
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
653
154k
        setRegScore(RegNo, T, CurrScore);
654
154k
      }
655
868k
    }
656
116k
    if (TII->isDS(Inst) && Inst.mayStore()) {
657
5.60k
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
658
5.60k
    }
659
116k
  }
660
127k
}
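Aside (standalone sketch, not the pass itself): the bookkeeping above amounts to bumping the counter's upper bound for each event and stamping the affected registers with the new score, roughly as in this toy model with made-up names and sizes.

#include <array>
#include <cstdint>

struct MiniScoreboard {
  uint32_t UB = 0;                     // score of the newest in-flight op
  std::array<uint32_t, 8> RegScore{};  // toy size: track only 8 registers

  void issueOpWritingReg(unsigned Reg) {
    ++UB;               // one more operation outstanding on this counter
    RegScore[Reg] = UB; // any later read of Reg must wait for this op
  }
};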
661
662
0
void WaitcntBrackets::print(raw_ostream &OS) {
663
0
  OS << '\n';
664
0
  for (auto T : inst_counter_types()) {
665
0
    uint32_t LB = getScoreLB(T);
666
0
    uint32_t UB = getScoreUB(T);
667
0
668
0
    switch (T) {
669
0
    case VM_CNT:
670
0
      OS << "    VM_CNT(" << UB - LB << "): ";
671
0
      break;
672
0
    case LGKM_CNT:
673
0
      OS << "    LGKM_CNT(" << UB - LB << "): ";
674
0
      break;
675
0
    case EXP_CNT:
676
0
      OS << "    EXP_CNT(" << UB - LB << "): ";
677
0
      break;
678
0
    case VS_CNT:
679
0
      OS << "    VS_CNT(" << UB - LB << "): ";
680
0
      break;
681
0
    default:
682
0
      OS << "    UNKNOWN(" << UB - LB << "): ";
683
0
      break;
684
0
    }
685
0
686
0
    if (LB < UB) {
687
0
      // Print vgpr scores.
688
0
      for (int J = 0; J <= getMaxVGPR(); J++) {
689
0
        uint32_t RegScore = getRegScore(J, T);
690
0
        if (RegScore <= LB)
691
0
          continue;
692
0
        uint32_t RelScore = RegScore - LB - 1;
693
0
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
694
0
          OS << RelScore << ":v" << J << " ";
695
0
        } else {
696
0
          OS << RelScore << ":ds ";
697
0
        }
698
0
      }
699
0
      // Also need to print sgpr scores for lgkm_cnt.
700
0
      if (T == LGKM_CNT) {
701
0
        for (int J = 0; J <= getMaxSGPR(); J++) {
702
0
          uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
703
0
          if (RegScore <= LB)
704
0
            continue;
705
0
          uint32_t RelScore = RegScore - LB - 1;
706
0
          OS << RelScore << ":s" << J << " ";
707
0
        }
708
0
      }
709
0
    }
710
0
    OS << '\n';
711
0
  }
712
0
  OS << '\n';
713
0
}
714
715
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
716
/// whether a waitcnt instruction is needed at all.
717
391k
bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
718
391k
  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
719
391k
         simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
720
391k
         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
721
391k
         simplifyWaitcnt(VS_CNT, Wait.VsCnt);
722
391k
}
723
724
bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
725
1.56M
                                      unsigned &Count) const {
726
1.56M
  const uint32_t LB = getScoreLB(T);
727
1.56M
  const uint32_t UB = getScoreUB(T);
728
1.56M
  if (Count < UB && UB - Count > LB)
729
47.9k
    return true;
730
1.51M
731
1.51M
  Count = ~0u;
732
1.51M
  return false;
733
1.51M
}
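Aside (worked example, values made up): with LB = 3 and UB = 7, at most UB - LB = 4 operations are still outstanding, so any requested count of 4 or more is redundant and can be dropped. The hypothetical helper below restates the rule from simplifyWaitcnt in standalone form.

#include <cstdint>

bool simplifyOneCount(uint32_t LB, uint32_t UB, uint32_t &Count) {
  if (Count < UB && UB - Count > LB)
    return true; // a real wait is still required
  Count = ~0u;   // redundant: everything that old has already completed
  return false;
}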
734
735
void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
736
3.46M
                                    AMDGPU::Waitcnt &Wait) const {
737
3.46M
  // If the score of src_operand falls within the bracket, we need an
738
3.46M
  // s_waitcnt instruction.
739
3.46M
  const uint32_t LB = getScoreLB(T);
740
3.46M
  const uint32_t UB = getScoreUB(T);
741
3.46M
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
742
114k
    if ((T == VM_CNT || T == LGKM_CNT) &&
743
114k
        hasPendingFlat() &&
744
114k
        !ST->hasFlatLgkmVMemCountInOrder()) {
745
1.67k
      // If there is a pending FLAT operation, and this is a VMem or LGKM
746
1.67k
      // waitcnt and the target can report early completion, then we need
747
1.67k
      // to force a waitcnt 0.
748
1.67k
      addWait(Wait, T, 0);
749
112k
    } else if (counterOutOfOrder(T)) {
750
35.2k
      // Counter can get decremented out-of-order when there
751
35.2k
      // are multiple types of events in the bracket. Also emit an s_waitcnt
752
35.2k
      // with a conservative value of 0 for the counter.
753
35.2k
      addWait(Wait, T, 0);
754
77.1k
    } else {
755
77.1k
      addWait(Wait, T, UB - ScoreToWait);
756
77.1k
    }
757
114k
  }
758
3.46M
}
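Aside (illustrative numbers only): if LB = 2, UB = 9 and the operand's score is 6, three newer operations may still be in flight, so cnt(9 - 6) = cnt(3) suffices on an in-order counter; an out-of-order counter falls back to cnt(0). This standalone sketch of the rule uses hypothetical names.

#include <cstdint>

uint32_t waitNeeded(uint32_t LB, uint32_t UB, uint32_t ScoreToWait,
                    bool OutOfOrder) {
  if (UB >= ScoreToWait && ScoreToWait > LB)
    return OutOfOrder ? 0 : UB - ScoreToWait;
  return ~0u; // score is outside the bracket: no wait needed
}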
759
760
48.7k
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
761
48.7k
  applyWaitcnt(VM_CNT, Wait.VmCnt);
762
48.7k
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
763
48.7k
  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
764
48.7k
  applyWaitcnt(VS_CNT, Wait.VsCnt);
765
48.7k
}
766
767
194k
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
768
194k
  const uint32_t UB = getScoreUB(T);
769
194k
  if (Count >= UB)
770
142k
    return;
771
52.1k
  if (Count != 0) {
772
3.38k
    if (counterOutOfOrder(T))
773
0
      return;
774
3.38k
    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
775
48.7k
  } else {
776
48.7k
    setScoreLB(T, UB);
777
48.7k
    MixedPendingEvents[T] = false;
778
48.7k
    PendingEvents &= ~WaitEventMaskForInst[T];
779
48.7k
  }
780
52.1k
}
781
782
// Where there are multiple types of events in the bracket of a counter,
783
// the decrement may go out of order.
784
118k
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
785
118k
  // Scalar memory read always can go out of order.
786
118k
  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
787
35.4k
    return true;
788
83.0k
  return MixedPendingEvents[T];
789
83.0k
}
790
791
101k
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
792
101k
                      false)
793
101k
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
794
                    false)
795
796
char SIInsertWaitcnts::ID = 0;
797
798
char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
799
800
2.44k
FunctionPass *llvm::createSIInsertWaitcntsPass() {
801
2.44k
  return new SIInsertWaitcnts();
802
2.44k
}
803
804
783k
static bool readsVCCZ(const MachineInstr &MI) {
805
783k
  unsigned Opc = MI.getOpcode();
806
783k
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
807
783k
         !MI.getOperand(1).isUndef();
808
783k
}
809
810
/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
811
742
static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
812
742
  // Currently all conventions wait, but this may not always be the case.
813
742
  //
814
742
  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
815
742
  // sense to omit the wait and do it in the caller.
816
742
  return true;
817
742
}
818
819
/// \returns true if the callee is expected to wait for any outstanding waits
820
/// before returning.
821
694
static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
822
694
  return true;
823
694
}
824
825
///  Generate s_waitcnt instruction to be placed before cur_Inst.
826
///  Instructions of a given type are returned in order,
827
///  but instructions of different types can complete out of order.
828
///  We rely on this in-order completion
829
///  and simply assign a score to the memory access instructions.
830
///  We keep track of the active "score bracket" to determine
831
///  if an access to the result of a memory read requires an s_waitcnt
832
///  and if so what the value of each counter is.
833
///  The "score bracket" is bound by the lower bound and upper bound
834
///  scores (*_score_LB and *_score_ub respectively).
835
bool SIInsertWaitcnts::generateWaitcntInstBefore(
836
    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
837
391k
    MachineInstr *OldWaitcntInstr) {
838
391k
  setForceEmitWaitcnt();
839
391k
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
840
391k
841
391k
  if (MI.isDebugInstr())
842
34
    return false;
843
391k
844
391k
  AMDGPU::Waitcnt Wait;
845
391k
846
391k
  // See if this instruction has a forced S_WAITCNT VM.
847
391k
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
848
391k
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
849
391k
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
850
391k
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
851
391k
      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
852
391k
      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
853
2.62k
    Wait.VmCnt = 0;
854
2.62k
  }
855
391k
856
391k
  // All waits must be resolved at call return.
857
391k
  // NOTE: this could be improved with knowledge of all call sites or
858
391k
  //   with knowledge of the called routines.
859
391k
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
860
391k
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
861
391k
      (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
862
4.22k
    Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
863
4.22k
  }
864
387k
  // Resolve vm waits before gs-done.
865
387k
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
866
387k
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
867
387k
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
868
45
            AMDGPU::SendMsg::ID_GS_DONE)) {
869
12
    Wait.VmCnt = 0;
870
12
  }
871
#if 0 // TODO: the following blocks of logic when we have fence.
872
  else if (MI.getOpcode() == SC_FENCE) {
873
    const unsigned int group_size =
874
      context->shader_info->GetMaxThreadGroupSize();
875
    // group_size == 0 means thread group size is unknown at compile time
876
    const bool group_is_multi_wave =
877
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
878
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
879
880
    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
881
      SCRegType src_type = Inst->GetSrcType(i);
882
      switch (src_type) {
883
        case SCMEM_LDS:
884
          if (group_is_multi_wave ||
885
            context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
886
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
887
                               ScoreBrackets->getScoreUB(LGKM_CNT));
888
            // LDS may have to wait for VM_CNT after buffer load to LDS
889
            if (target_info->HasBufferLoadToLDS()) {
890
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
891
                                 ScoreBrackets->getScoreUB(VM_CNT));
892
            }
893
          }
894
          break;
895
896
        case SCMEM_GDS:
897
          if (group_is_multi_wave || fence_is_global) {
898
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
899
              ScoreBrackets->getScoreUB(EXP_CNT));
900
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
901
              ScoreBrackets->getScoreUB(LGKM_CNT));
902
          }
903
          break;
904
905
        case SCMEM_UAV:
906
        case SCMEM_TFBUF:
907
        case SCMEM_RING:
908
        case SCMEM_SCATTER:
909
          if (group_is_multi_wave || fence_is_global) {
910
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
911
              ScoreBrackets->getScoreUB(EXP_CNT));
912
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
913
              ScoreBrackets->getScoreUB(VM_CNT));
914
          }
915
          break;
916
917
        case SCMEM_SCRATCH:
918
        default:
919
          break;
920
      }
921
    }
922
  }
923
#endif
924
925
387k
  // Export & GDS instructions do not read the EXEC mask until after the export
926
387k
  // is granted (which can occur well after the instruction is issued).
927
387k
  // The shader program must flush all EXP operations on the export-count
928
387k
  // before overwriting the EXEC mask.
929
387k
  else {
930
387k
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
931
4.22k
      // Export and GDS are tracked individually, either may trigger a waitcnt
932
4.22k
      // for EXEC.
933
4.22k
      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
934
4.22k
          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
935
4.22k
          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
936
4.22k
          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
937
4
        Wait.ExpCnt = 0;
938
4
      }
939
4.22k
    }
940
387k
941
387k
    if (MI.isCall() && 
callWaitsOnFunctionEntry(MI)694
) {
942
694
      // Don't bother waiting on anything except the call address. The function
943
694
      // is going to insert a wait on everything in its prolog. This still needs
944
694
      // to be careful if the call target is a load (e.g. a GOT load).
945
694
      Wait = AMDGPU::Waitcnt();
946
694
947
694
      int CallAddrOpIdx =
948
694
          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
949
694
      RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI,
950
694
                                                          CallAddrOpIdx, false);
951
2.08k
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
952
1.38k
        ScoreBrackets.determineWait(
953
1.38k
            LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
954
1.38k
      }
955
386k
    } else {
956
386k
      // FIXME: Should not be relying on memoperands.
957
386k
      // Look at the source operands of every instruction to see if
958
386k
      // any of them results from a previous memory operation that affects
959
386k
      // its current usage. If so, an s_waitcnt instruction needs to be
960
386k
      // emitted.
961
386k
      // If the source operand was defined by a load, add the s_waitcnt
962
386k
      // instruction.
963
386k
      for (const MachineMemOperand *Memop : MI.memoperands()) {
964
106k
        unsigned AS = Memop->getAddrSpace();
965
106k
        if (AS != AMDGPUAS::LOCAL_ADDRESS)
966
94.1k
          continue;
967
12.3k
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
968
12.3k
        // VM_CNT is only relevant to vgpr or LDS.
969
12.3k
        ScoreBrackets.determineWait(
970
12.3k
            VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
971
12.3k
      }
972
386k
973
2.20M
      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
974
1.81M
        const MachineOperand &Op = MI.getOperand(I);
975
1.81M
        const MachineRegisterInfo &MRIA = *MRI;
976
1.81M
        RegInterval Interval =
977
1.81M
            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
978
3.70M
        for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
979
1.88M
          if (TRI->isVGPR(MRIA, Op.getReg())) {
980
557k
            // VM_CNT is only relevant to vgpr or LDS.
981
557k
            ScoreBrackets.determineWait(
982
557k
                VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
983
557k
          }
984
1.88M
          ScoreBrackets.determineWait(
985
1.88M
              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
986
1.88M
        }
987
1.81M
      }
988
386k
      // End of for loop that looks at all source operands to decide vm_wait_cnt
989
386k
      // and lgk_wait_cnt.
990
386k
991
386k
      // Two cases are handled for destination operands:
992
386k
      // 1) If the destination operand was defined by a load, add the s_waitcnt
993
386k
      // instruction to guarantee the right WAW order.
994
386k
      // 2) If a destination operand was used by a recent export/store instruction,
995
386k
      // add s_waitcnt on exp_cnt to guarantee the WAR order.
996
386k
      if (MI.mayStore()) {
997
47.5k
        // FIXME: Should not be relying on memoperands.
998
47.5k
        for (const MachineMemOperand *Memop : MI.memoperands()) {
999
46.0k
          unsigned AS = Memop->getAddrSpace();
1000
46.0k
          if (AS != AMDGPUAS::LOCAL_ADDRESS)
1001
38.5k
            continue;
1002
7.51k
          unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1003
7.51k
          ScoreBrackets.determineWait(
1004
7.51k
              VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1005
7.51k
          ScoreBrackets.determineWait(
1006
7.51k
              EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1007
7.51k
        }
1008
47.5k
      }
1009
2.20M
      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1010
1.81M
        MachineOperand &Def = MI.getOperand(I);
1011
1.81M
        const MachineRegisterInfo &MRIA = *MRI;
1012
1.81M
        RegInterval Interval =
1013
1.81M
            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
1014
2.31M
        for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1015
497k
          if (TRI->isVGPR(MRIA, Def.getReg())) {
1016
251k
            ScoreBrackets.determineWait(
1017
251k
                VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1018
251k
            ScoreBrackets.determineWait(
1019
251k
                EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1020
251k
          }
1021
497k
          ScoreBrackets.determineWait(
1022
497k
              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1023
497k
        }
1024
1.81M
      } // End of for loop that looks at all dest operands.
1025
386k
    }
1026
387k
  }
1027
391k
1028
391k
  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1029
391k
  // occurs before the instruction. Doing it here prevents any additional
1030
391k
  // S_WAITCNTs from being emitted if the instruction was marked as
1031
391k
  // requiring a WAITCNT beforehand.
1032
391k
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1033
391k
      !ST->hasAutoWaitcntBeforeBarrier()) {
1034
85
    Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
1035
85
  }
1036
391k
1037
391k
  // TODO: Remove this work-around, enable the assert for Bug 457939
1038
391k
  //       after fixing the scheduler. Also, the Shader Compiler code is
1039
391k
  //       independent of target.
1040
391k
  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1041
160
    if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1042
160
            ScoreBrackets.getScoreUB(LGKM_CNT) &&
1043
160
        ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1044
4
      Wait.LgkmCnt = 0;
1045
4
    }
1046
160
  }
1047
391k
1048
391k
  // Early-out if no wait is indicated.
1049
391k
  if (!ScoreBrackets.simplifyWaitcnt(Wait) && 
!IsForceEmitWaitcnt347k
) {
1050
347k
    bool Modified = false;
1051
347k
    if (OldWaitcntInstr) {
1052
3.55k
      for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
1053
7.45k
           &*II != &MI; II = NextI, ++NextI) {
1054
3.90k
        if (II->isDebugInstr())
1055
0
          continue;
1056
3.90k
1057
3.90k
        if (TrackedWaitcntSet.count(&*II)) {
1058
2
          TrackedWaitcntSet.erase(&*II);
1059
2
          II->eraseFromParent();
1060
2
          Modified = true;
1061
3.90k
        } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
1062
3.49k
          int64_t Imm = II->getOperand(0).getImm();
1063
3.49k
          ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1064
3.49k
        } else {
1065
405
          assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
1066
405
          assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1067
405
          ScoreBrackets.applyWaitcnt(
1068
405
              AMDGPU::Waitcnt(0, 0, 0, II->getOperand(1).getImm()));
1069
405
        }
1070
3.90k
      }
1071
3.55k
    }
1072
347k
    return Modified;
1073
347k
  }
1074
44.1k
1075
44.1k
  if (ForceEmitZeroWaitcnts)
1076
0
    Wait = AMDGPU::Waitcnt::allZero(IV);
1077
44.1k
1078
44.1k
  if (ForceEmitWaitcnt[VM_CNT])
1079
0
    Wait.VmCnt = 0;
1080
44.1k
  if (ForceEmitWaitcnt[EXP_CNT])
1081
0
    Wait.ExpCnt = 0;
1082
44.1k
  if (ForceEmitWaitcnt[LGKM_CNT])
1083
0
    Wait.LgkmCnt = 0;
1084
44.1k
  if (ForceEmitWaitcnt[VS_CNT])
1085
0
    Wait.VsCnt = 0;
1086
44.1k
1087
44.1k
  ScoreBrackets.applyWaitcnt(Wait);
1088
44.1k
1089
44.1k
  AMDGPU::Waitcnt OldWait;
1090
44.1k
  bool Modified = false;
1091
44.1k
1092
44.1k
  if (OldWaitcntInstr) {
1093
1.55k
    for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
1094
1.55k
         &*II != &MI; II = NextI, NextI++) {
1095
1.55k
      if (II->isDebugInstr())
1096
0
        continue;
1097
1.55k
1098
1.55k
      if (II->getOpcode() == AMDGPU::S_WAITCNT) {
1099
1.55k
        unsigned IEnc = II->getOperand(0).getImm();
1100
1.55k
        AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1101
1.55k
        OldWait = OldWait.combined(IWait);
1102
1.55k
        if (!TrackedWaitcntSet.count(&*II))
1103
1.47k
          Wait = Wait.combined(IWait);
1104
1.55k
        unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
1105
1.55k
        if (IEnc != NewEnc) {
1106
29
          II->getOperand(0).setImm(NewEnc);
1107
29
          Modified = true;
1108
29
        }
1109
1.55k
        Wait.VmCnt = ~0u;
1110
1.55k
        Wait.LgkmCnt = ~0u;
1111
1.55k
        Wait.ExpCnt = ~0u;
1112
1.55k
      } else {
1113
1
        assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
1114
1
        assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1115
1
1116
1
        unsigned ICnt = II->getOperand(1).getImm();
1117
1
        OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
1118
1
        if (!TrackedWaitcntSet.count(&*II))
1119
1
          Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
1120
1
        if (Wait.VsCnt != ICnt) {
1121
0
          II->getOperand(1).setImm(Wait.VsCnt);
1122
0
          Modified = true;
1123
0
        }
1124
1
        Wait.VsCnt = ~0u;
1125
1
      }
1126
1.55k
1127
1.55k
      LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
1128
1.55k
                        << "Old Instr: " << MI << '\n'
1129
1.55k
                        << "New Instr: " << *II << '\n');
1130
1.55k
1131
1.55k
      if (!Wait.hasWait())
1132
1.55k
        return Modified;
1133
1.55k
    }
1134
1.55k
  }
1135
44.1k
1136
44.1k
  
if (42.5k
Wait.VmCnt != ~0u42.5k
||
Wait.LgkmCnt != ~0u25.6k
||
Wait.ExpCnt != ~0u1.95k
) {
1137
42.5k
    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1138
42.5k
    auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1139
42.5k
                             MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1140
42.5k
                         .addImm(Enc);
1141
42.5k
    TrackedWaitcntSet.insert(SWaitInst);
1142
42.5k
    Modified = true;
1143
42.5k
1144
42.5k
    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
1145
42.5k
                      << "Old Instr: " << MI << '\n'
1146
42.5k
                      << "New Instr: " << *SWaitInst << '\n');
1147
42.5k
  }
1148
42.5k
1149
42.5k
  if (Wait.VsCnt != ~0u) {
1150
48
    assert(ST->hasVscnt());
1151
48
1152
48
    auto SWaitInst =
1153
48
        BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1154
48
                TII->get(AMDGPU::S_WAITCNT_VSCNT))
1155
48
            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1156
48
            .addImm(Wait.VsCnt);
1157
48
    TrackedWaitcntSet.insert(SWaitInst);
1158
48
    Modified = true;
1159
48
1160
48
    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
1161
48
                      << "Old Instr: " << MI << '\n'
1162
48
                      << "New Instr: " << *SWaitInst << '\n');
1163
48
  }
1164
42.5k
1165
42.5k
  return Modified;
1166
44.1k
}
1167
1168
// This is a flat memory operation. Check to see if it has memory
1169
// tokens for both LDS and Memory, and if so mark it as a flat access.
1170
13.1k
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1171
13.1k
  if (MI.memoperands_empty())
1172
67
    return true;
1173
13.0k
1174
13.0k
  for (const MachineMemOperand *Memop : MI.memoperands()) {
1175
13.0k
    unsigned AS = Memop->getAddrSpace();
1176
13.0k
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1177
3.32k
      return true;
1178
13.0k
  }
1179
13.0k
1180
13.0k
  return false;
1181
13.0k
}
1182
1183
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1184
391k
                                               WaitcntBrackets *ScoreBrackets) {
1185
391k
  // Now look at the instruction opcode. If it is a memory access
1186
391k
  // instruction, update the upper-bound of the appropriate counter's
1187
391k
  // bracket and the destination operand scores.
1188
391k
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1189
391k
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1190
9.44k
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1191
9.44k
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1192
261
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1193
261
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1194
9.18k
    } else {
1195
9.18k
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1196
9.18k
    }
1197
382k
  } else if (TII->isFLAT(Inst)) {
1198
19.3k
    assert(Inst.mayLoad() || Inst.mayStore());
1199
19.3k
1200
19.3k
    if (TII->usesVM_CNT(Inst)) {
1201
19.3k
      if (!ST->hasVscnt())
1202
17.2k
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1203
2.12k
      else if (Inst.mayLoad() &&
1204
2.12k
               AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
1205
797
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1206
1.32k
      else
1207
1.32k
        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1208
19.3k
    }
1209
19.3k
1210
19.3k
    if (TII->usesLGKM_CNT(Inst)) {
1211
13.1k
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1212
13.1k
1213
13.1k
      // This is a flat memory operation, so note it - it will require
1214
13.1k
      // that both the VM and LGKM be flushed to zero if it is pending when
1215
13.1k
      // a VM or LGKM dependency occurs.
1216
13.1k
      if (mayAccessLDSThroughFlat(Inst))
1217
3.39k
        ScoreBrackets->setPendingFlat();
1218
13.1k
    }
1219
362k
  } else if (SIInstrInfo::isVMEM(Inst) &&
1220
362k
             // TODO: get a better carve out.
1221
362k
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1222
362k
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1223
362k
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
1224
362k
             Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
1225
362k
             Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
1226
41.9k
    if (!ST->hasVscnt())
1227
40.3k
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1228
1.52k
    else if ((Inst.mayLoad() &&
1229
1.52k
              AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
1230
1.52k
             /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
1231
1.52k
             (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
1232
438
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1233
1.08k
    else if (Inst.mayStore())
1234
1.08k
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1235
41.9k
1236
41.9k
    if (ST->vmemWriteNeedsExpWaitcnt() &&
1237
41.9k
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
1238
10.2k
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1239
10.2k
    }
1240
320k
  } else if (TII->isSMRD(Inst)) {
1241
32.7k
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1242
288k
  } else if (Inst.isCall()) {
1243
694
    if (callWaitsOnFunctionReturn(Inst)) {
1244
694
      // Act as a wait on everything
1245
694
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
1246
694
    } else {
1247
0
      // May need to wait for anything.
1248
0
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1249
0
    }
1250
287k
  } else {
1251
287k
    switch (Inst.getOpcode()) {
1252
287k
    case AMDGPU::S_SENDMSG:
1253
45
    case AMDGPU::S_SENDMSGHALT:
1254
45
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1255
45
      break;
1256
553
    case AMDGPU::EXP:
1257
553
    case AMDGPU::EXP_DONE: {
1258
553
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1259
553
      if (Imm >= 32 && Imm <= 63)
1260
124
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1261
429
      else if (Imm >= 12 && Imm <= 15)
1262
33
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1263
396
      else
1264
396
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1265
553
      break;
1266
553
    }
1267
553
    case AMDGPU::S_MEMTIME:
1268
0
    case AMDGPU::S_MEMREALTIME:
1269
0
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1270
0
      break;
1271
286k
    default:
1272
286k
      break;
1273
287k
    }
1274
287k
  }
1275
391k
}
1276
1277
bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
1278
133k
                                 uint32_t OtherScore) {
1279
133k
  uint32_t MyShifted = Score <= M.OldLB ? 
0130k
:
Score + M.MyShift3.35k
;
1280
133k
  uint32_t OtherShifted =
1281
133k
      OtherScore <= M.OtherLB ? 
0129k
:
OtherScore + M.OtherShift4.18k
;
1282
133k
  Score = std::max(MyShifted, OtherShifted);
1283
133k
  return OtherShifted > MyShifted;
1284
133k
}
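Aside (standalone restatement with made-up values): mergeScore rebases the two predecessors' scores into a common frame before taking the maximum, e.g. with OldLB = 2 a score of 2 collapses to 0 (already complete) while a still-pending score of 5 is shifted by MyShift.

#include <algorithm>
#include <cstdint>

uint32_t mergeOne(uint32_t Score, uint32_t OldLB, uint32_t MyShift,
                  uint32_t OtherScore, uint32_t OtherLB, uint32_t OtherShift) {
  uint32_t MyShifted = Score <= OldLB ? 0 : Score + MyShift;
  uint32_t OtherShifted = OtherScore <= OtherLB ? 0 : OtherScore + OtherShift;
  return std::max(MyShifted, OtherShifted); // the later (larger) score wins
}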
1285
1286
/// Merge the pending events and associated score brackets of \p Other into
1287
/// this bracket's status.
1288
///
1289
/// Returns whether the merge resulted in a change that requires tighter waits
1290
/// (i.e. the merged brackets strictly dominate the original brackets).
1291
688
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1292
688
  bool StrictDom = false;
1293
688
1294
2.75k
  for (auto T : inst_counter_types()) {
1295
2.75k
    // Merge event flags for this counter
1296
2.75k
    const bool OldOutOfOrder = counterOutOfOrder(T);
1297
2.75k
    const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
1298
2.75k
    const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1299
2.75k
    if (OtherEvents & ~OldEvents)
1300
264
      StrictDom = true;
1301
2.75k
    if (Other.MixedPendingEvents[T] ||
1302
2.75k
        (OldEvents && OtherEvents && OldEvents != OtherEvents))
1303
48
      MixedPendingEvents[T] = true;
1304
2.75k
    PendingEvents |= OtherEvents;
1305
2.75k
1306
2.75k
    // Merge scores for this counter
1307
2.75k
    const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
1308
2.75k
    const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1309
2.75k
    MergeInfo M;
1310
2.75k
    M.OldLB = ScoreLBs[T];
1311
2.75k
    M.OtherLB = Other.ScoreLBs[T];
1312
2.75k
    M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
1313
2.75k
    M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;
1314
2.75k
1315
2.75k
    const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
1316
2.75k
    if (NewUB < ScoreUBs[T])
1317
0
      report_fatal_error("waitcnt score overflow");
1318
2.75k
    ScoreUBs[T] = NewUB;
1319
2.75k
    ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
1320
2.75k
1321
2.75k
    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1322
2.75k
1323
2.75k
    bool RegStrictDom = false;
1324
128k
    for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
1325
125k
         J++) {
1326
125k
      RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1327
125k
    }
1328
2.75k
1329
2.75k
    if (T == LGKM_CNT) {
1330
688
      for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
1331
6.03k
           J != E; J++) {
1332
5.34k
        RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1333
5.34k
      }
1334
688
    }
1335
2.75k
1336
2.75k
    if (RegStrictDom && !OldOutOfOrder)
1337
227
      StrictDom = true;
1338
2.75k
  }
1339
688
1340
688
  VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR());
1341
688
  SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR());
1342
688
1343
688
  return StrictDom;
1344
688
}
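
Editorial note: the event-mask half of merge() can be read in isolation — a counter becomes "mixed" once two different event kinds are pending on it, and the merge reports a change only when the other side contributes events this block had not yet seen. A compileable sketch with hypothetical names (CounterState, mergeEvents) mirroring just those lines:

#include <cassert>
#include <cstdint>

// Illustrative only; not the pass's own types.
struct CounterState {
  uint32_t Events = 0; // bitmask of pending event kinds on this counter
  bool Mixed = false;  // more than one event kind is (or was) pending
};

// Returns true when Src adds events Dst had not seen, i.e. the merged
// state may require tighter waits and successors must be revisited.
static bool mergeEvents(CounterState &Dst, const CounterState &Src) {
  bool Changed = (Src.Events & ~Dst.Events) != 0;
  if (Src.Mixed || (Dst.Events && Src.Events && Dst.Events != Src.Events))
    Dst.Mixed = true;
  Dst.Events |= Src.Events;
  return Changed;
}

int main() {
  CounterState A, B;
  A.Events = 1; // one event kind pending here
  B.Events = 2; // a different kind pending in the incoming state
  bool Changed = mergeEvents(A, B);
  assert(Changed && A.Mixed && A.Events == 3);
}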
1345
1346
// Generate s_waitcnt instructions where needed.
1347
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1348
                                            MachineBasicBlock &Block,
1349
29.0k
                                            WaitcntBrackets &ScoreBrackets) {
1350
29.0k
  bool Modified = false;
1351
29.0k
1352
29.0k
  LLVM_DEBUG({
1353
29.0k
    dbgs() << "*** Block" << Block.getNumber() << " ***";
1354
29.0k
    ScoreBrackets.dump();
1355
29.0k
  });
1356
29.0k
1357
29.0k
  // Walk over the instructions.
1358
29.0k
  MachineInstr *OldWaitcntInstr = nullptr;
1359
29.0k
1360
29.0k
  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
1361
29.0k
                                         E = Block.instr_end();
1362
426k
       Iter != E;) {
1363
397k
    MachineInstr &Inst = *Iter;
1364
397k
1365
397k
    // Track pre-existing waitcnts from earlier iterations.
1366
397k
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1367
397k
        (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1368
392k
         Inst.getOperand(0).isReg() &&
1369
392k
         Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
1370
5.46k
      if (!OldWaitcntInstr)
1371
5.11k
        OldWaitcntInstr = &Inst;
1372
5.46k
      ++Iter;
1373
5.46k
      continue;
1374
5.46k
    }
1375
391k
1376
391k
    bool VCCZBugWorkAround = false;
1377
391k
    if (readsVCCZ(Inst) &&
1378
391k
        (!VCCZBugHandledSet.count(&Inst))) {
1379
298
      if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1380
298
              ScoreBrackets.getScoreUB(LGKM_CNT) &&
1381
298
          ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1382
9
        if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
1383
4
          VCCZBugWorkAround = true;
1384
9
      }
1385
298
    }
1386
391k
1387
391k
    // Generate an s_waitcnt instruction to be placed before
1388
391k
    // cur_Inst, if needed.
1389
391k
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1390
391k
    OldWaitcntInstr = nullptr;
1391
391k
1392
391k
    updateEventWaitcntAfter(Inst, &ScoreBrackets);
1393
391k
1394
#if 0 // TODO: implement resource type check controlled by options with ub = LB.
1395
    // If this instruction generates a S_SETVSKIP because it is an
1396
    // indexed resource, and we are on Tahiti, then it will also force
1397
    // an S_WAITCNT vmcnt(0)
1398
    if (RequireCheckResourceType(Inst, context)) {
1399
      // Force the score as if an S_WAITCNT vmcnt(0) had been emitted.
1400
      ScoreBrackets->setScoreLB(VM_CNT,
1401
      ScoreBrackets->getScoreUB(VM_CNT));
1402
    }
1403
#endif
1404
1405
391k
    LLVM_DEBUG({
1406
391k
      Inst.print(dbgs());
1407
391k
      ScoreBrackets.dump();
1408
391k
    });
1409
391k
1410
391k
    // TODO: Remove this work-around after fixing the scheduler and enable the
1411
391k
    // assert above.
1412
391k
    if (VCCZBugWorkAround) {
1413
4
      // Restore the vccz bit.  Any time a value is written to vcc, the vcc
1414
4
      // bit is updated, so we can restore the bit by reading the value of
1415
4
      // vcc and then writing it back to the register.
1416
4
      BuildMI(Block, Inst, Inst.getDebugLoc(),
1417
4
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1418
4
              TRI->getVCC())
1419
4
          .addReg(TRI->getVCC());
1420
4
      VCCZBugHandledSet.insert(&Inst);
1421
4
      Modified = true;
1422
4
    }
1423
391k
1424
391k
    ++Iter;
1425
391k
  }
1426
29.0k
1427
29.0k
  return Modified;
1428
29.0k
}
1429
1430
25.4k
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1431
25.4k
  ST = &MF.getSubtarget<GCNSubtarget>();
1432
25.4k
  TII = ST->getInstrInfo();
1433
25.4k
  TRI = &TII->getRegisterInfo();
1434
25.4k
  MRI = &MF.getRegInfo();
1435
25.4k
  IV = AMDGPU::getIsaVersion(ST->getCPU());
1436
25.4k
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1437
25.4k
1438
25.4k
  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1439
25.4k
  for (auto T : inst_counter_types())
1440
101k
    ForceEmitWaitcnt[T] = false;
1441
25.4k
1442
25.4k
  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1443
25.4k
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1444
25.4k
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1445
25.4k
  HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
1446
25.4k
1447
25.4k
  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1448
25.4k
  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1449
25.4k
  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1450
25.4k
  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1451
25.4k
1452
25.4k
  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1453
25.4k
  RegisterEncoding.VGPRL =
1454
25.4k
      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1455
25.4k
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1456
25.4k
  RegisterEncoding.SGPRL =
1457
25.4k
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1458
25.4k
1459
25.4k
  TrackedWaitcntSet.clear();
1460
25.4k
  VCCZBugHandledSet.clear();
1461
25.4k
  RpotIdxMap.clear();
1462
25.4k
  BlockInfos.clear();
1463
25.4k
1464
25.4k
  // Keep iterating over the blocks in reverse post order, inserting and
1465
25.4k
  // updating s_waitcnt where needed, until a fix point is reached.
1466
25.4k
  for (MachineBasicBlock *MBB :
1467
28.8k
       ReversePostOrderTraversal<MachineFunction *>(&MF)) {
1468
28.8k
    RpotIdxMap[MBB] = BlockInfos.size();
1469
28.8k
    BlockInfos.emplace_back(MBB);
1470
28.8k
  }
1471
25.4k
1472
25.4k
  std::unique_ptr<WaitcntBrackets> Brackets;
1473
25.4k
  bool Modified = false;
1474
25.4k
  bool Repeat;
1475
25.5k
  do {
1476
25.5k
    Repeat = false;
1477
25.5k
1478
29.3k
    for (BlockInfo &BI : BlockInfos) {
1479
29.3k
      if (!BI.Dirty)
1480
317
        continue;
1481
29.0k
1482
29.0k
      unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
1483
29.0k
1484
29.0k
      if (BI.Incoming) {
1485
1.79k
        if (!Brackets)
1486
1.09k
          Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
1487
699
        else
1488
699
          *Brackets = *BI.Incoming;
1489
27.2k
      } else {
1490
27.2k
        if (!Brackets)
1491
25.4k
          Brackets = llvm::make_unique<WaitcntBrackets>(ST);
1492
1.80k
        else
1493
1.80k
          Brackets->clear();
1494
27.2k
      }
1495
29.0k
1496
29.0k
      Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1497
29.0k
      BI.Dirty = false;
1498
29.0k
1499
29.0k
      if (Brackets->hasPending()) {
1500
19.9k
        BlockInfo *MoveBracketsToSucc = nullptr;
1501
19.9k
        for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1502
2.40k
          unsigned SuccIdx = RpotIdxMap[Succ];
1503
2.40k
          BlockInfo &SuccBI = BlockInfos[SuccIdx];
1504
2.40k
          if (!SuccBI.Incoming) {
1505
1.71k
            SuccBI.Dirty = true;
1506
1.71k
            if (SuccIdx <= Idx)
1507
86
              Repeat = true;
1508
1.71k
            if (!MoveBracketsToSucc) {
1509
1.12k
              MoveBracketsToSucc = &SuccBI;
1510
1.12k
            } else {
1511
590
              SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
1512
590
            }
1513
1.71k
          } else if (SuccBI.Incoming->merge(*Brackets)) {
1514
281
            SuccBI.Dirty = true;
1515
281
            if (SuccIdx <= Idx)
1516
43
              Repeat = true;
1517
281
          }
1518
2.40k
        }
1519
19.9k
        if (MoveBracketsToSucc)
1520
1.12k
          MoveBracketsToSucc->Incoming = std::move(Brackets);
1521
19.9k
      }
1522
29.0k
    }
1523
25.5k
  } while (Repeat);
1524
25.4k
1525
25.4k
  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1526
25.4k
1527
25.4k
  bool HaveScalarStores = false;
1528
25.4k
1529
54.3k
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1530
28.8k
       ++BI) {
1531
28.8k
    MachineBasicBlock &MBB = *BI;
1532
28.8k
1533
460k
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1534
431k
         ++I) {
1535
431k
      if (!HaveScalarStores && TII->isScalarStore(*I))
1536
16
        HaveScalarStores = true;
1537
431k
1538
431k
      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1539
431k
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1540
23.1k
        EndPgmBlocks.push_back(&MBB);
1541
431k
    }
1542
28.8k
  }
1543
25.4k
1544
25.4k
  if (HaveScalarStores) {
1545
16
    // If scalar writes are used, the cache must be flushed or else the next
1546
16
    // wave to reuse the same scratch memory can be clobbered.
1547
16
    //
1548
16
    // Insert s_dcache_wb at wave termination points if there were any scalar
1549
16
    // stores, and only if the cache hasn't already been flushed. This could be
1550
16
    // improved by looking across blocks for flushes in postdominating blocks
1551
16
    // from the stores but an explicitly requested flush is probably very rare.
1552
18
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
1553
18
      bool SeenDCacheWB = false;
1554
18
1555
151
      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1556
133
           ++I) {
1557
133
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1558
2
          SeenDCacheWB = true;
1559
131
        else if (TII->isScalarStore(*I))
1560
8
          SeenDCacheWB = false;
1561
133
1562
133
        // FIXME: It would be better to insert this before a waitcnt if any.
1563
133
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1564
133
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1565
133
            !SeenDCacheWB) {
1566
17
          Modified = true;
1567
17
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1568
17
        }
1569
133
      }
1570
18
    }
1571
16
  }
1572
25.4k
1573
25.4k
  if (!MFI->isEntryFunction()) {
1574
2.27k
    // Wait for any outstanding memory operations that the input registers may
1575
2.27k
    // depend on. We can't track them and it's better to do the wait after the
1576
2.27k
    // costly call sequence.
1577
2.27k
1578
2.27k
    // TODO: Could insert earlier and schedule more liberally with operations
1579
2.27k
    // that only use caller preserved registers.
1580
2.27k
    MachineBasicBlock &EntryBB = MF.front();
1581
2.27k
    if (ST->hasVscnt())
1582
57
      BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
1583
57
              TII->get(AMDGPU::S_WAITCNT_VSCNT))
1584
57
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1585
57
      .addImm(0);
1586
2.27k
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1587
2.27k
      .addImm(0);
1588
2.27k
1589
2.27k
    Modified = true;
1590
2.27k
  }
1591
25.4k
1592
25.4k
  return Modified;
1593
25.4k
}
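
Editorial note: the driver in runOnMachineFunction is a standard forward dataflow fixpoint over the reverse post-order — a block is reprocessed only while dirty, its outgoing state is merged into each successor, and a change along a back edge (successor index <= current index) forces another outer pass. A generic, compileable sketch of that shape; State, Block and run are hypothetical stand-ins, and the per-block transfer function is elided.

#include <memory>
#include <vector>

// Illustrative stand-ins; not the pass's own types.
struct State {
  // Merge another state in; return true if anything new was learned.
  bool merge(const State &Other) { (void)Other; return false; }
};

struct Block {
  std::vector<std::size_t> Succs;  // successor indices in RPO numbering
  std::unique_ptr<State> Incoming; // state the block starts from
  bool Dirty = true;               // needs (re)processing
};

static void run(std::vector<Block> &RPO) {
  bool Repeat;
  do {
    Repeat = false;
    for (std::size_t Idx = 0; Idx != RPO.size(); ++Idx) {
      Block &B = RPO[Idx];
      if (!B.Dirty)
        continue;
      B.Dirty = false;

      State Out; // would be computed by the per-block transfer function

      for (std::size_t SuccIdx : B.Succs) {
        Block &Succ = RPO[SuccIdx];
        bool Changed;
        if (!Succ.Incoming) {
          Succ.Incoming = std::make_unique<State>(Out);
          Changed = true;
        } else {
          Changed = Succ.Incoming->merge(Out);
        }
        if (Changed) {
          Succ.Dirty = true;
          if (SuccIdx <= Idx) // back edge: another outer iteration needed
            Repeat = true;
        }
      }
    }
  } while (Repeat);
}

int main() {
  std::vector<Block> RPO(2);
  RPO[0].Succs = {1};
  run(RPO);
}

Unlike this sketch, which always copies the outgoing state, the pass above hands its brackets object to one successor (MoveBracketsToSucc) and clones only for any additional successors.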