Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

 Line| Count|Source
    1|      |//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
    2|      |//
    3|      |// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    4|      |// See https://llvm.org/LICENSE.txt for license information.
    5|      |// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    6|      |//
    7|      |//===----------------------------------------------------------------------===//
    8|      |//
    9|      |/// \file
   10|      |/// Memory legalizer - implements memory model. More information can be
   11|      |/// found here:
   12|      |///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
   13|      |//
   14|      |//===----------------------------------------------------------------------===//
   15|      |
   16|      |#include "AMDGPU.h"
   17|      |#include "AMDGPUMachineModuleInfo.h"
   18|      |#include "AMDGPUSubtarget.h"
   19|      |#include "SIDefines.h"
   20|      |#include "SIInstrInfo.h"
   21|      |#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
   22|      |#include "Utils/AMDGPUBaseInfo.h"
   23|      |#include "llvm/ADT/BitmaskEnum.h"
   24|      |#include "llvm/ADT/None.h"
   25|      |#include "llvm/ADT/Optional.h"
   26|      |#include "llvm/CodeGen/MachineBasicBlock.h"
   27|      |#include "llvm/CodeGen/MachineFunction.h"
   28|      |#include "llvm/CodeGen/MachineFunctionPass.h"
   29|      |#include "llvm/CodeGen/MachineInstrBuilder.h"
   30|      |#include "llvm/CodeGen/MachineMemOperand.h"
   31|      |#include "llvm/CodeGen/MachineModuleInfo.h"
   32|      |#include "llvm/CodeGen/MachineOperand.h"
   33|      |#include "llvm/IR/DebugLoc.h"
   34|      |#include "llvm/IR/DiagnosticInfo.h"
   35|      |#include "llvm/IR/Function.h"
   36|      |#include "llvm/IR/LLVMContext.h"
   37|      |#include "llvm/MC/MCInstrDesc.h"
   38|      |#include "llvm/Pass.h"
   39|      |#include "llvm/Support/AtomicOrdering.h"
   40|      |#include "llvm/Support/MathExtras.h"
   41|      |#include <cassert>
   42|      |#include <list>
   43|      |
   44|      |using namespace llvm;
   45|      |using namespace llvm::AMDGPU;
   46|      |
   47|      |#define DEBUG_TYPE "si-memory-legalizer"
   48| 27.9k|#define PASS_NAME "SI Memory Legalizer"
   49|      |
   50|      |namespace {
   51|      |
   52|      |LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
   53|      |
   54|      |/// Memory operation flags. Can be ORed together.
   55|      |enum class SIMemOp {
   56|      |  NONE = 0u,
   57|      |  LOAD = 1u << 0,
   58|      |  STORE = 1u << 1,
   59|      |  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
   60|      |};
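
Because of the LLVM_MARK_AS_BITMASK_ENUM marker, these scoped enumerators compose with the ordinary bitwise operators, which is how the pass builds and tests combined flags throughout the file. A minimal sketch of the idiom (the variable names here are illustrative, not from the pass):

    SIMemOp Op = SIMemOp::LOAD | SIMemOp::STORE; // request ordering of both kinds
    if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) {
      // A load is involved. Scoped enums do not convert to bool, so
      // membership is tested by comparing the masked value against NONE.
    }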
   61|      |
   62|      |/// Position to insert a new instruction relative to an existing
   63|      |/// instruction.
   64|      |enum class Position {
   65|      |  BEFORE,
   66|      |  AFTER
   67|      |};
   68|      |
   69|      |/// The atomic synchronization scopes supported by the AMDGPU target.
   70|      |enum class SIAtomicScope {
   71|      |  NONE,
   72|      |  SINGLETHREAD,
   73|      |  WAVEFRONT,
   74|      |  WORKGROUP,
   75|      |  AGENT,
   76|      |  SYSTEM
   77|      |};
   78|      |
   79|      |/// The distinct address spaces supported by the AMDGPU target for
   80|      |/// atomic memory operations. Can be ORed together.
   81|      |enum class SIAtomicAddrSpace {
   82|      |  NONE = 0u,
   83|      |  GLOBAL = 1u << 0,
   84|      |  LDS = 1u << 1,
   85|      |  SCRATCH = 1u << 2,
   86|      |  GDS = 1u << 3,
   87|      |  OTHER = 1u << 4,
   88|      |
   89|      |  /// The address spaces that can be accessed by a FLAT instruction.
   90|      |  FLAT = GLOBAL | LDS | SCRATCH,
   91|      |
   92|      |  /// The address spaces that support atomic instructions.
   93|      |  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
   94|      |
   95|      |  /// All address spaces.
   96|      |  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
   97|      |
   98|      |  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
   99|      |};
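
The aggregate members (FLAT, ATOMIC, ALL) support two idioms used repeatedly below: narrowing a scope's orderable spaces to the spaces an instruction actually touches, and detecting that a mask names exactly one address space. A short sketch under the same conventions (names are illustrative):

    SIAtomicAddrSpace Ordered = SIAtomicAddrSpace::ATOMIC & InstrScope; // drop OTHER
    bool SingleSpace = isPowerOf2_32(uint32_t(Ordered)); // one bit set => one space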
  100|      |
  101|      |/// Sets named bit \p BitName to "true" if present in instruction \p MI.
  102|      |/// \returns Returns true if \p MI is modified, false otherwise.
  103|      |template <uint16_t BitName>
  104|   307|bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  105|   307|  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  106|   307|  if (BitIdx == -1)
  107|    48|    return false;
  108|   259|
  109|   259|  MachineOperand &Bit = MI->getOperand(BitIdx);
  110|   259|  if (Bit.getImm() != 0)
  111|     0|    return false;
  112|   259|
  113|   259|  Bit.setImm(1);
  114|   259|  return true;
  115|   259|}

SIMemoryLegalizer.cpp:bool (anonymous namespace)::enableNamedBit<(unsigned short)7>(llvm::MachineInstrBundleIterator<llvm::MachineInstr, false> const&):
 Line| Count|Source
  104|   192|bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  105|   192|  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  106|   192|  if (BitIdx == -1)
  107|    24|    return false;
  108|   168|
  109|   168|  MachineOperand &Bit = MI->getOperand(BitIdx);
  110|   168|  if (Bit.getImm() != 0)
  111|     0|    return false;
  112|   168|
  113|   168|  Bit.setImm(1);
  114|   168|  return true;
  115|   168|}

SIMemoryLegalizer.cpp:bool (anonymous namespace)::enableNamedBit<(unsigned short)5>(llvm::MachineInstrBundleIterator<llvm::MachineInstr, false> const&):
 Line| Count|Source
  104|    91|bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  105|    91|  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  106|    91|  if (BitIdx == -1)
  107|    24|    return false;
  108|    67|
  109|    67|  MachineOperand &Bit = MI->getOperand(BitIdx);
  110|    67|  if (Bit.getImm() != 0)
  111|     0|    return false;
  112|    67|
  113|    67|  Bit.setImm(1);
  114|    67|  return true;
  115|    67|}

SIMemoryLegalizer.cpp:bool (anonymous namespace)::enableNamedBit<(unsigned short)9>(llvm::MachineInstrBundleIterator<llvm::MachineInstr, false> const&):
 Line| Count|Source
  104|    24|bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  105|    24|  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  106|    24|  if (BitIdx == -1)
  107|     0|    return false;
  108|    24|
  109|    24|  MachineOperand &Bit = MI->getOperand(BitIdx);
  110|    24|  if (Bit.getImm() != 0)
  111|     0|    return false;
  112|    24|
  113|    24|  Bit.setImm(1);
  114|    24|  return true;
  115|    24|}
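
The three sub-tables above are the per-instantiation counts the report breaks out for this template. Matching their entry counts (192, 91, 24) against the wrappers below suggests they correspond to the glc, slc, and dlc bits respectively; the numeric template arguments are the tablegen-assigned AMDGPU::OpName indices, so the exact numbers are an artifact of this particular build. The cache-control classes instantiate the template like so:

    enableNamedBit<AMDGPU::OpName::glc>(MI); // 192 calls in this run
    enableNamedBit<AMDGPU::OpName::slc>(MI); //  91 calls
    enableNamedBit<AMDGPU::OpName::dlc>(MI); //  24 calls (GFX10 only)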
  116|      |
  117|      |class SIMemOpInfo final {
  118|      |private:
  119|      |
  120|      |  friend class SIMemOpAccess;
  121|      |
  122|      |  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  123|      |  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  124|      |  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  125|      |  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  126|      |  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  127|      |  bool IsCrossAddressSpaceOrdering = false;
  128|      |  bool IsNonTemporal = false;
  129|      |
  130|      |  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
  131|      |              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
  132|      |              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
  133|      |              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
  134|      |              bool IsCrossAddressSpaceOrdering = true,
  135|      |              AtomicOrdering FailureOrdering =
  136|      |                AtomicOrdering::SequentiallyConsistent,
  137|      |              bool IsNonTemporal = false)
  138|      |    : Ordering(Ordering), FailureOrdering(FailureOrdering),
  139|      |      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
  140|      |      InstrAddrSpace(InstrAddrSpace),
  141|      |      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
  142| 67.6k|      IsNonTemporal(IsNonTemporal) {
  143| 67.6k|    // There is also no cross address space ordering if the ordering
  144| 67.6k|    // address space is the same as the instruction address space and
  145| 67.6k|    // only contains a single address space.
  146| 67.6k|    if ((OrderingAddrSpace == InstrAddrSpace) &&
  147| 67.6k|        isPowerOf2_32(uint32_t(InstrAddrSpace)))
  148|    69|      this->IsCrossAddressSpaceOrdering = false;
  149| 67.6k|  }
  150|      |
  151|      |public:
  152|      |  /// \returns Atomic synchronization scope of the machine instruction used to
  153|      |  /// create this SIMemOpInfo.
  154| 9.31k|  SIAtomicScope getScope() const {
  155| 9.31k|    return Scope;
  156| 9.31k|  }
  157|      |
  158|      |  /// \returns Ordering constraint of the machine instruction used to
  159|      |  /// create this SIMemOpInfo.
  160| 19.8k|  AtomicOrdering getOrdering() const {
  161| 19.8k|    return Ordering;
  162| 19.8k|  }
  163|      |
  164|      |  /// \returns Failure ordering constraint of the machine instruction used to
  165|      |  /// create this SIMemOpInfo.
  166|   678|  AtomicOrdering getFailureOrdering() const {
  167|   678|    return FailureOrdering;
  168|   678|  }
  169|      |
  170|      |  /// \returns The address spaces accessed by the machine
  171|      |  /// instruction used to create this SIMemOpInfo.
  172|   185|  SIAtomicAddrSpace getInstrAddrSpace() const {
  173|   185|    return InstrAddrSpace;
  174|   185|  }
  175|      |
  176|      |  /// \returns The address spaces that must be ordered by the machine
  177|      |  /// instruction used to create this SIMemOpInfo.
  178| 9.13k|  SIAtomicAddrSpace getOrderingAddrSpace() const {
  179| 9.13k|    return OrderingAddrSpace;
  180| 9.13k|  }
  181|      |
  182|      |  /// \returns Return true iff memory ordering of operations on
  183|      |  /// different address spaces is required.
  184| 5.93k|  bool getIsCrossAddressSpaceOrdering() const {
  185| 5.93k|    return IsCrossAddressSpaceOrdering;
  186| 5.93k|  }
  187|      |
  188|      |  /// \returns True if memory access of the machine instruction used to
  189|      |  /// create this SIMemOpInfo is non-temporal, false otherwise.
  190| 63.0k|  bool isNonTemporal() const {
  191| 63.0k|    return IsNonTemporal;
  192| 63.0k|  }
  193|      |
  194|      |  /// \returns True if ordering constraint of the machine instruction used to
  195|      |  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  196| 67.6k|  bool isAtomic() const {
  197| 67.6k|    return Ordering != AtomicOrdering::NotAtomic;
  198| 67.6k|  }
  199|      |
  200|      |};
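
The constructor's normalization at lines 146-148 is easiest to see with a concrete case. An operation whose ordering and instruction address spaces are both exactly GLOBAL involves only a single address space, so cross-address-space ordering is meaningless and the flag is cleared; the 69 count on line 148 shows how often that happened in this run. A sketch of the arguments and outcome (SIMemOpInfo is only constructible through its friend SIMemOpAccess, so this is illustrative rather than compilable client code):

    // Illustrative arguments and the resulting normalization:
    //   Ordering                    = AtomicOrdering::Acquire
    //   OrderingAddrSpace           = SIAtomicAddrSpace::GLOBAL
    //   InstrAddrSpace              = SIAtomicAddrSpace::GLOBAL  (single bit)
    //   IsCrossAddressSpaceOrdering = true (as requested)
    // => getIsCrossAddressSpaceOrdering() returns false after construction.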
  201|      |
  202|      |class SIMemOpAccess final {
  203|      |private:
  204|      |  AMDGPUMachineModuleInfo *MMI = nullptr;
  205|      |
  206|      |  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  207|      |  void reportUnsupported(const MachineBasicBlock::iterator &MI,
  208|      |                         const char *Msg) const;
  209|      |
  210|      |  /// Inspects the target synchronization scope \p SSID and determines
  211|      |  /// the SI atomic scope it corresponds to, the address spaces it
  212|      |  /// covers, and whether the memory ordering applies between address
  213|      |  /// spaces.
  214|      |  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  215|      |  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;
  216|      |
  217|      |  /// \return Return a bit set of the address spaces accessed by \p AS.
  218|      |  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
  219|      |
  220|      |  /// \returns Info constructed from \p MI, which has at least one machine
  221|      |  /// memory operand.
  222|      |  Optional<SIMemOpInfo> constructFromMIWithMMO(
  223|      |      const MachineBasicBlock::iterator &MI) const;
  224|      |
  225|      |public:
  226|      |  /// Construct class to support accessing the machine memory operands
  227|      |  /// of instructions in the machine function \p MF.
  228|      |  SIMemOpAccess(MachineFunction &MF);
  229|      |
  230|      |  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  231|      |  Optional<SIMemOpInfo> getLoadInfo(
  232|      |      const MachineBasicBlock::iterator &MI) const;
  233|      |
  234|      |  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  235|      |  Optional<SIMemOpInfo> getStoreInfo(
  236|      |      const MachineBasicBlock::iterator &MI) const;
  237|      |
  238|      |  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  239|      |  /// "None" otherwise.
  240|      |  Optional<SIMemOpInfo> getAtomicFenceInfo(
  241|      |      const MachineBasicBlock::iterator &MI) const;
  242|      |
  243|      |  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  244|      |  /// rmw operation, "None" otherwise.
  245|      |  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
  246|      |      const MachineBasicBlock::iterator &MI) const;
  247|      |};
  248|      |
  249|      |class SICacheControl {
  250|      |protected:
  251|      |
  252|      |  /// Instruction info.
  253|      |  const SIInstrInfo *TII = nullptr;
  254|      |
  255|      |  IsaVersion IV;
  256|      |
  257|      |  SICacheControl(const GCNSubtarget &ST);
  258|      |
  259|      |public:
  260|      |
  261|      |  /// Create a cache control for the subtarget \p ST.
  262|      |  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
  263|      |
  264|      |  /// Update \p MI memory load instruction to bypass any caches up to
  265|      |  /// the \p Scope memory scope for address spaces \p
  266|      |  /// AddrSpace. Return true iff the instruction was modified.
  267|      |  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
  268|      |                                     SIAtomicScope Scope,
  269|      |                                     SIAtomicAddrSpace AddrSpace) const = 0;
  270|      |
  271|      |  /// Update \p MI memory instruction to indicate it is
  272|      |  /// nontemporal. Return true iff the instruction was modified.
  273|      |  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
  274|      |    const = 0;
  275|      |
  276|      |  /// Inserts any necessary instructions at position \p Pos relative
  277|      |  /// to instruction \p MI to ensure any caches associated with
  278|      |  /// address spaces \p AddrSpace for memory scopes up to memory scope
  279|      |  /// \p Scope are invalidated. Returns true iff any instructions
  280|      |  /// inserted.
  281|      |  virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
  282|      |                                     SIAtomicScope Scope,
  283|      |                                     SIAtomicAddrSpace AddrSpace,
  284|      |                                     Position Pos) const = 0;
  285|      |
  286|      |  /// Inserts any necessary instructions at position \p Pos relative
  287|      |  /// to instruction \p MI to ensure memory instructions of kind \p Op
  288|      |  /// associated with address spaces \p AddrSpace have completed as
  289|      |  /// observed by other memory instructions executing in memory scope
  290|      |  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
  291|      |  /// ordering is between address spaces. Returns true iff any
  292|      |  /// instructions inserted.
  293|      |  virtual bool insertWait(MachineBasicBlock::iterator &MI,
  294|      |                          SIAtomicScope Scope,
  295|      |                          SIAtomicAddrSpace AddrSpace,
  296|      |                          SIMemOp Op,
  297|      |                          bool IsCrossAddrSpaceOrdering,
  298|      |                          Position Pos) const = 0;
  299|      |
  300|      |  /// Virtual destructor to allow derivations to be deleted.
  301| 25.5k|  virtual ~SICacheControl() = default;
  302|      |
  303|      |};
  304|      |
  305|      |class SIGfx6CacheControl : public SICacheControl {
  306|      |protected:
  307|      |
  308|      |  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  309|      |  /// is modified, false otherwise.
  310|   192|  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
  311|   192|    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  312|   192|  }
  313|      |
  314|      |  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  315|      |  /// is modified, false otherwise.
  316|    91|  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
  317|    91|    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  318|    91|  }
  319|      |
  320|      |public:
  321|      |
  322| 25.5k|  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
  323|      |
  324|      |  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
  325|      |                             SIAtomicScope Scope,
  326|      |                             SIAtomicAddrSpace AddrSpace) const override;
  327|      |
  328|      |  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
  329|      |
  330|      |  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
  331|      |                             SIAtomicScope Scope,
  332|      |                             SIAtomicAddrSpace AddrSpace,
  333|      |                             Position Pos) const override;
  334|      |
  335|      |  bool insertWait(MachineBasicBlock::iterator &MI,
  336|      |                  SIAtomicScope Scope,
  337|      |                  SIAtomicAddrSpace AddrSpace,
  338|      |                  SIMemOp Op,
  339|      |                  bool IsCrossAddrSpaceOrdering,
  340|      |                  Position Pos) const override;
  341|      |};
  342|      |
  343|      |class SIGfx7CacheControl : public SIGfx6CacheControl {
  344|      |public:
  345|      |
  346| 19.1k|  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
  347|      |
  348|      |  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
  349|      |                             SIAtomicScope Scope,
  350|      |                             SIAtomicAddrSpace AddrSpace,
  351|      |                             Position Pos) const override;
  352|      |
  353|      |};
  354|      |
  355|      |class SIGfx10CacheControl : public SIGfx7CacheControl {
  356|      |protected:
  357|      |  bool CuMode = false;
  358|      |
  359|      |  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  360|      |  /// is modified, false otherwise.
  361|    24|  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
  362|    24|    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
  363|    24|  }
  364|      |
  365|      |public:
  366|      |
  367|      |  SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) :
  368| 2.10k|    SIGfx7CacheControl(ST), CuMode(CuMode) {};
  369|      |
  370|      |  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
  371|      |                             SIAtomicScope Scope,
  372|      |                             SIAtomicAddrSpace AddrSpace) const override;
  373|      |
  374|      |  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
  375|      |
  376|      |  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
  377|      |                             SIAtomicScope Scope,
  378|      |                             SIAtomicAddrSpace AddrSpace,
  379|      |                             Position Pos) const override;
  380|      |
  381|      |  bool insertWait(MachineBasicBlock::iterator &MI,
  382|      |                  SIAtomicScope Scope,
  383|      |                  SIAtomicAddrSpace AddrSpace,
  384|      |                  SIMemOp Op,
  385|      |                  bool IsCrossAddrSpaceOrdering,
  386|      |                  Position Pos) const override;
  387|      |};
  388|      |
  389|      |class SIMemoryLegalizer final : public MachineFunctionPass {
  390|      |private:
  391|      |
  392|      |  /// Cache Control.
  393|      |  std::unique_ptr<SICacheControl> CC = nullptr;
  394|      |
  395|      |  /// List of atomic pseudo instructions.
  396|      |  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
  397|      |
  398|      |  /// Return true iff instruction \p MI is an atomic instruction that
  399|      |  /// returns a result.
  400| 2.36k|  bool isAtomicRet(const MachineInstr &MI) const {
  401| 2.36k|    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  402| 2.36k|  }
  403|      |
  404|      |  /// Removes all processed atomic pseudo instructions from the current
  405|      |  /// function. Returns true if current function is modified, false otherwise.
  406|      |  bool removeAtomicPseudoMIs();
  407|      |
  408|      |  /// Expands load operation \p MI. Returns true if instructions are
  409|      |  /// added/deleted or \p MI is modified, false otherwise.
  410|      |  bool expandLoad(const SIMemOpInfo &MOI,
  411|      |                  MachineBasicBlock::iterator &MI);
  412|      |  /// Expands store operation \p MI. Returns true if instructions are
  413|      |  /// added/deleted or \p MI is modified, false otherwise.
  414|      |  bool expandStore(const SIMemOpInfo &MOI,
  415|      |                   MachineBasicBlock::iterator &MI);
  416|      |  /// Expands atomic fence operation \p MI. Returns true if
  417|      |  /// instructions are added/deleted or \p MI is modified, false otherwise.
  418|      |  bool expandAtomicFence(const SIMemOpInfo &MOI,
  419|      |                         MachineBasicBlock::iterator &MI);
  420|      |  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  421|      |  /// instructions are added/deleted or \p MI is modified, false otherwise.
  422|      |  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
  423|      |                                MachineBasicBlock::iterator &MI);
  424|      |
  425|      |public:
  426|      |  static char ID;
  427|      |
  428| 2.45k|  SIMemoryLegalizer() : MachineFunctionPass(ID) {}
  429|      |
  430| 2.42k|  void getAnalysisUsage(AnalysisUsage &AU) const override {
  431| 2.42k|    AU.setPreservesCFG();
  432| 2.42k|    MachineFunctionPass::getAnalysisUsage(AU);
  433| 2.42k|  }
  434|      |
  435| 27.9k|  StringRef getPassName() const override {
  436| 27.9k|    return PASS_NAME;
  437| 27.9k|  }
  438|      |
  439|      |  bool runOnMachineFunction(MachineFunction &MF) override;
  440|      |};
  441|      |
  442|      |} // end anonymous namespace
  443|      |
  444|      |void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
  445|    19|                                      const char *Msg) const {
  446|    19|  const Function &Func = MI->getParent()->getParent()->getFunction();
  447|    19|  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  448|    19|  Func.getContext().diagnose(Diag);
  449|    19|}
  450|      |
  451|      |Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  452|      |SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
  453| 4.00k|                               SIAtomicAddrSpace InstrScope) const {
  454| 4.00k|  if (SSID == SyncScope::System)
  455| 1.99k|    return std::make_tuple(SIAtomicScope::SYSTEM,
  456| 1.99k|                           SIAtomicAddrSpace::ATOMIC,
  457| 1.99k|                           true);
  458| 2.00k|  if (SSID == MMI->getAgentSSID())
  459|   225|    return std::make_tuple(SIAtomicScope::AGENT,
  460|   225|                           SIAtomicAddrSpace::ATOMIC,
  461|   225|                           true);
  462| 1.78k|  if (SSID == MMI->getWorkgroupSSID())
  463|   243|    return std::make_tuple(SIAtomicScope::WORKGROUP,
  464|   243|                           SIAtomicAddrSpace::ATOMIC,
  465|   243|                           true);
  466| 1.53k|  if (SSID == MMI->getWavefrontSSID())
  467|   181|    return std::make_tuple(SIAtomicScope::WAVEFRONT,
  468|   181|                           SIAtomicAddrSpace::ATOMIC,
  469|   181|                           true);
  470| 1.35k|  if (SSID == SyncScope::SingleThread)
  471|   352|    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
  472|   352|                           SIAtomicAddrSpace::ATOMIC,
  473|   352|                           true);
  474| 1.00k|  if (SSID == MMI->getSystemOneAddressSpaceSSID())
  475|   246|    return std::make_tuple(SIAtomicScope::SYSTEM,
  476|   246|                           SIAtomicAddrSpace::ATOMIC & InstrScope,
  477|   246|                           false);
  478|   759|  if (SSID == MMI->getAgentOneAddressSpaceSSID())
  479|   242|    return std::make_tuple(SIAtomicScope::AGENT,
  480|   242|                           SIAtomicAddrSpace::ATOMIC & InstrScope,
  481|   242|                           false);
  482|   517|  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
  483|   245|    return std::make_tuple(SIAtomicScope::WORKGROUP,
  484|   245|                           SIAtomicAddrSpace::ATOMIC & InstrScope,
  485|   245|                           false);
  486|   272|  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
  487|   197|    return std::make_tuple(SIAtomicScope::WAVEFRONT,
  488|   197|                           SIAtomicAddrSpace::ATOMIC & InstrScope,
  489|   197|                           false);
  490|    75|  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
  491|    72|    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
  492|    72|                           SIAtomicAddrSpace::ATOMIC & InstrScope,
  493|    72|                           false);
  494|     3|  return None;
  495|     3|}
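
In short, each recognized synchronization scope maps to an (SI scope, orderable address spaces, cross-address-space) triple, with the *OneAddressSpace variants narrowing the spaces to those the instruction touches and turning off cross-address-space ordering. A sketch of the mapping, assuming the sync-scope names registered by AMDGPUMachineModuleInfo ("agent", "workgroup-one-as", and so on):

    // ""  (system)       -> (SYSTEM,    ATOMIC,              true)
    // "agent"            -> (AGENT,     ATOMIC,              true)
    // "workgroup-one-as" -> (WORKGROUP, ATOMIC & InstrScope, false)
    // unknown scope      -> None (the 3 count above; callers diagnose it)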
  496|      |
  497| 70.1k|SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  498| 70.1k|  if (AS == AMDGPUAS::FLAT_ADDRESS)
  499| 3.48k|    return SIAtomicAddrSpace::FLAT;
  500| 66.6k|  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
  501| 37.2k|    return SIAtomicAddrSpace::GLOBAL;
  502| 29.4k|  if (AS == AMDGPUAS::LOCAL_ADDRESS)
  503| 12.3k|    return SIAtomicAddrSpace::LDS;
  504| 17.0k|  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
  505| 15.3k|    return SIAtomicAddrSpace::SCRATCH;
  506| 1.69k|  if (AS == AMDGPUAS::REGION_ADDRESS)
  507|   160|    return SIAtomicAddrSpace::GDS;
  508| 1.53k|
  509| 1.53k|  return SIAtomicAddrSpace::OTHER;
  510| 1.53k|}
  511|      |
  512| 25.5k|SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  513| 25.5k|  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
  514| 25.5k|}
  515|      |
  516|      |Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
  517| 66.8k|    const MachineBasicBlock::iterator &MI) const {
  518| 66.8k|  assert(MI->getNumMemOperands() > 0);
  519| 66.8k|
  520| 66.8k|  SyncScope::ID SSID = SyncScope::SingleThread;
  521| 66.8k|  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  522| 66.8k|  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  523| 66.8k|  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  524| 66.8k|  bool IsNonTemporal = true;
  525| 66.8k|
  526| 66.8k|  // Validator should check whether or not MMOs cover the entire set of
  527| 66.8k|  // locations accessed by the memory instruction.
  528| 70.1k|  for (const auto &MMO : MI->memoperands()) {
  529| 70.1k|    IsNonTemporal &= MMO->isNonTemporal();
  530| 70.1k|    InstrAddrSpace |=
  531| 70.1k|      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
  532| 70.1k|    AtomicOrdering OpOrdering = MMO->getOrdering();
  533| 70.1k|    if (OpOrdering != AtomicOrdering::NotAtomic) {
  534| 3.21k|      const auto &IsSyncScopeInclusion =
  535| 3.21k|          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
  536| 3.21k|      if (!IsSyncScopeInclusion) {
  537|    12|        reportUnsupported(MI,
  538|    12|          "Unsupported non-inclusive atomic synchronization scope");
  539|    12|        return None;
  540|    12|      }
  541| 3.20k|
  542| 3.20k|      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
  543| 3.20k|      Ordering =
  544| 3.20k|          isStrongerThan(Ordering, OpOrdering) ?
  545| 3.20k|              Ordering : MMO->getOrdering();
  546| 3.20k|      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
  547| 3.20k|             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
  548| 3.20k|      FailureOrdering =
  549| 3.20k|          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
  550| 3.20k|              FailureOrdering : MMO->getFailureOrdering();
  551| 3.20k|    }
  552| 70.1k|  }
  553| 66.8k|
  554| 66.8k|  SIAtomicScope Scope = SIAtomicScope::NONE;
  555| 66.8k|  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  556| 66.8k|  bool IsCrossAddressSpaceOrdering = false;
  557| 66.8k|  if (Ordering != AtomicOrdering::NotAtomic) {
  558| 3.20k|    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
  559| 3.20k|    if (!ScopeOrNone) {
  560|     0|      reportUnsupported(MI, "Unsupported atomic synchronization scope");
  561|     0|      return None;
  562|     0|    }
  563| 3.20k|    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
  564| 3.20k|      ScopeOrNone.getValue();
  565| 3.20k|    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
  566| 3.20k|        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
  567|     4|      reportUnsupported(MI, "Unsupported atomic address space");
  568|     4|      return None;
  569|     4|    }
  570| 66.8k|  }
  571| 66.8k|  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
  572| 66.8k|                     IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
  573| 66.8k|}
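
constructFromMIWithMMO folds all memory operands of an instruction into one summary: address spaces are unioned, the strongest success and failure orderings win, each sync scope must be inclusion-ordered with what has been seen so far, and the access only stays nontemporal if every operand is. A worked example under those rules (hypothetical operands, not taken from this run):

    // MMO 1: global, atomic acquire, nontemporal
    // MMO 2: LDS, non-atomic, not nontemporal
    // => InstrAddrSpace = GLOBAL | LDS
    //    Ordering       = Acquire   (strongest across operands)
    //    IsNonTemporal  = false     (ANDed across operands)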
  574|      |
  575|      |Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
  576| 67.6k|    const MachineBasicBlock::iterator &MI) const {
  577| 67.6k|  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
  578| 67.6k|
  579| 67.6k|  if (!(MI->mayLoad() && !MI->mayStore()))
  580| 43.1k|    return None;
  581| 24.5k|
  582| 24.5k|  // Be conservative if there are no memory operands.
  583| 24.5k|  if (MI->getNumMemOperands() == 0)
  584|     0|    return SIMemOpInfo();
  585| 24.5k|
  586| 24.5k|  return constructFromMIWithMMO(MI);
  587| 24.5k|}
  588|      |
  589|      |Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
  590| 43.1k|    const MachineBasicBlock::iterator &MI) const {
  591| 43.1k|  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
  592| 43.1k|
  593| 43.1k|  if (!(!MI->mayLoad() && MI->mayStore()))
  594| 3.95k|    return None;
  595| 39.1k|
  596| 39.1k|  // Be conservative if there are no memory operands.
  597| 39.1k|  if (MI->getNumMemOperands() == 0)
  598|    34|    return SIMemOpInfo();
  599| 39.1k|
  600| 39.1k|  return constructFromMIWithMMO(MI);
  601| 39.1k|}
  602|      |
  603|      |Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
  604| 3.96k|    const MachineBasicBlock::iterator &MI) const {
  605| 3.96k|  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
  606| 3.96k|
  607| 3.96k|  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
  608| 3.16k|    return None;
  609|   796|
  610|   796|  AtomicOrdering Ordering =
  611|   796|    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
  612|   796|
  613|   796|  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  614|   796|  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  615|   796|  if (!ScopeOrNone) {
  616|     3|    reportUnsupported(MI, "Unsupported atomic synchronization scope");
  617|     3|    return None;
  618|     3|  }
  619|   793|
  620|   793|  SIAtomicScope Scope = SIAtomicScope::NONE;
  621|   793|  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  622|   793|  bool IsCrossAddressSpaceOrdering = false;
  623|   793|  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
  624|   793|    ScopeOrNone.getValue();
  625|   793|
  626|   793|  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
  627|   793|      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
  628|     0|    reportUnsupported(MI, "Unsupported atomic address space");
  629|     0|    return None;
  630|     0|  }
  631|   793|
  632|   793|  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
  633|   793|                     IsCrossAddressSpaceOrdering);
  634|   793|}
  635|      |
  636|      |Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
  637| 3.16k|    const MachineBasicBlock::iterator &MI) const {
  638| 3.16k|  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
  639| 3.16k|
  640| 3.16k|  if (!(MI->mayLoad() && MI->mayStore()))
  641|    22|    return None;
  642| 3.14k|
  643| 3.14k|  // Be conservative if there are no memory operands.
  644| 3.14k|  if (MI->getNumMemOperands() == 0)
  645|     0|    return SIMemOpInfo();
  646| 3.14k|
  647| 3.14k|  return constructFromMIWithMMO(MI);
  648| 3.14k|}
  649|      |
  650| 25.5k|SICacheControl::SICacheControl(const GCNSubtarget &ST) {
  651| 25.5k|  TII = ST.getInstrInfo();
  652| 25.5k|  IV = getIsaVersion(ST.getCPU());
  653| 25.5k|}
  654|      |
  655|      |/* static */
  656| 25.5k|std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  657| 25.5k|  GCNSubtarget::Generation Generation = ST.getGeneration();
  658| 25.5k|  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
  659| 6.38k|    return make_unique<SIGfx6CacheControl>(ST);
  660| 19.1k|  if (Generation < AMDGPUSubtarget::GFX10)
  661| 17.0k|    return make_unique<SIGfx7CacheControl>(ST);
  662| 2.10k|  return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
  663| 2.10k|}
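
One cache control is created per machine function, keyed to the subtarget generation; the counts above show the split for this run (6.38k GFX6-class, 17.0k GFX7-GFX9, 2.10k GFX10). Usage mirrors runOnMachineFunction below:

    std::unique_ptr<SICacheControl> CC =
        SICacheControl::create(MF.getSubtarget<GCNSubtarget>());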
  664|      |
  665|      |bool SIGfx6CacheControl::enableLoadCacheBypass(
  666|      |    const MachineBasicBlock::iterator &MI,
  667|      |    SIAtomicScope Scope,
  668|   203|    SIAtomicAddrSpace AddrSpace) const {
  669|   203|  assert(MI->mayLoad() && !MI->mayStore());
  670|   203|  bool Changed = false;
  671|   203|
  672|   203|  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
  673|   179|    /// TODO: Do not set glc for rmw atomic operations as they
  674|   179|    /// implicitly bypass the L1 cache.
  675|   179|
  676|   179|    switch (Scope) {
  677|   179|    case SIAtomicScope::SYSTEM:
  678|   101|    case SIAtomicScope::AGENT:
  679|   101|      Changed |= enableGLCBit(MI);
  680|   101|      break;
  681|   101|    case SIAtomicScope::WORKGROUP:
  682|    78|    case SIAtomicScope::WAVEFRONT:
  683|    78|    case SIAtomicScope::SINGLETHREAD:
  684|    78|      // No cache to bypass.
  685|    78|      break;
  686|    78|    default:
  687|     0|      llvm_unreachable("Unsupported synchronization scope");
  688|   203|    }
  689|   203|  }
  690|   203|
  691|   203|  /// The scratch address space does not need the global memory caches
  692|   203|  /// to be bypassed as all memory operations by the same thread are
  693|   203|  /// sequentially consistent, and no other thread can access scratch
  694|   203|  /// memory.
  695|   203|
  696|   203|  /// Other address spaces do not have a cache.
  697|   203|
  698|   203|  return Changed;
  699|   203|}
  700|      |
  701|      |bool SIGfx6CacheControl::enableNonTemporal(
  702|    61|    const MachineBasicBlock::iterator &MI) const {
  703|    61|  assert(MI->mayLoad() ^ MI->mayStore());
  704|    61|  bool Changed = false;
  705|    61|
  706|    61|  /// TODO: Do not enableGLCBit if rmw atomic.
  707|    61|  Changed |= enableGLCBit(MI);
  708|    61|  Changed |= enableSLCBit(MI);
  709|    61|
  710|    61|  return Changed;
  711|    61|}
  712|      |
  713|      |bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
  714|      |                                               SIAtomicScope Scope,
  715|      |                                               SIAtomicAddrSpace AddrSpace,
  716|   430|                                               Position Pos) const {
  717|   430|  bool Changed = false;
  718|   430|
  719|   430|  MachineBasicBlock &MBB = *MI->getParent();
  720|   430|  DebugLoc DL = MI->getDebugLoc();
  721|   430|
  722|   430|  if (Pos == Position::AFTER)
  723|   280|    ++MI;
  724|   430|
  725|   430|  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
  726|   430|    switch (Scope) {
  727|   430|    case SIAtomicScope::SYSTEM:
  728|   340|    case SIAtomicScope::AGENT:
  729|   340|      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
  730|   340|      Changed = true;
  731|   340|      break;
  732|   340|    case SIAtomicScope::WORKGROUP:
  733|    90|    case SIAtomicScope::WAVEFRONT:
  734|    90|    case SIAtomicScope::SINGLETHREAD:
  735|    90|      // No cache to invalidate.
  736|    90|      break;
  737|    90|    default:
  738|     0|      llvm_unreachable("Unsupported synchronization scope");
  739|   430|    }
  740|   430|  }
  741|   430|
  742|   430|  /// The scratch address space does not need the global memory cache
  743|   430|  /// to be flushed as all memory operations by the same thread are
  744|   430|  /// sequentially consistent, and no other thread can access scratch
  745|   430|  /// memory.
  746|   430|
  747|   430|  /// Other address spaces do not have a cache.
  748|   430|
  749|   430|  if (Pos == Position::AFTER)
  750|   280|    --MI;
  751|   430|
  752|   430|  return Changed;
  753|   430|}
  754|      |
  755|      |bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
  756|      |                                    SIAtomicScope Scope,
  757|      |                                    SIAtomicAddrSpace AddrSpace,
  758|      |                                    SIMemOp Op,
  759|      |                                    bool IsCrossAddrSpaceOrdering,
  760| 5.00k|                                    Position Pos) const {
  761| 5.00k|  bool Changed = false;
  762| 5.00k|
  763| 5.00k|  MachineBasicBlock &MBB = *MI->getParent();
  764| 5.00k|  DebugLoc DL = MI->getDebugLoc();
  765| 5.00k|
  766| 5.00k|  if (Pos == Position::AFTER)
  767| 2.16k|    ++MI;
  768| 5.00k|
  769| 5.00k|  bool VMCnt = false;
  770| 5.00k|  bool LGKMCnt = false;
  771| 5.00k|
  772| 5.00k|  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
  773| 4.95k|    switch (Scope) {
  774| 4.95k|    case SIAtomicScope::SYSTEM:
  775| 4.10k|    case SIAtomicScope::AGENT:
  776| 4.10k|      VMCnt |= true;
  777| 4.10k|      break;
  778| 4.10k|    case SIAtomicScope::WORKGROUP:
  779|   846|    case SIAtomicScope::WAVEFRONT:
  780|   846|    case SIAtomicScope::SINGLETHREAD:
  781|   846|      // The L1 cache keeps all memory operations in order for
  782|   846|      // wavefronts in the same work-group.
  783|   846|      break;
  784|   846|    default:
  785|     0|      llvm_unreachable("Unsupported synchronization scope");
  786| 5.00k|    }
  787| 5.00k|  }
  788| 5.00k|
  789| 5.00k|  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
  790| 4.95k|    switch (Scope) {
  791| 4.95k|    case SIAtomicScope::SYSTEM:
  792| 4.43k|    case SIAtomicScope::AGENT:
  793| 4.43k|    case SIAtomicScope::WORKGROUP:
  794| 4.43k|      // If no cross address space ordering then an LDS waitcnt is not
  795| 4.43k|      // needed as LDS operations for all waves are executed in a
  796| 4.43k|      // total global ordering as observed by all waves. Required if
  797| 4.43k|      // also synchronizing with global/GDS memory as LDS operations
  798| 4.43k|      // could be reordered with respect to later global/GDS memory
  799| 4.43k|      // operations of the same wave.
  800| 4.43k|      LGKMCnt |= IsCrossAddrSpaceOrdering;
  801| 4.43k|      break;
  802| 4.43k|    case SIAtomicScope::WAVEFRONT:
  803|   522|    case SIAtomicScope::SINGLETHREAD:
  804|   522|      // The LDS keeps all memory operations in order for
  805|   522|      // the same wavefront.
  806|   522|      break;
  807|   522|    default:
  808|     0|      llvm_unreachable("Unsupported synchronization scope");
  809| 5.00k|    }
  810| 5.00k|  }
  811| 5.00k|
  812| 5.00k|  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
  813| 4.51k|    switch (Scope) {
  814| 4.51k|    case SIAtomicScope::SYSTEM:
  815| 3.85k|    case SIAtomicScope::AGENT:
  816| 3.85k|      // If no cross address space ordering then a GDS waitcnt is not
  817| 3.85k|      // needed as GDS operations for all waves are executed in a
  818| 3.85k|      // total global ordering as observed by all waves. Required if
  819| 3.85k|      // also synchronizing with global/LDS memory as GDS operations
  820| 3.85k|      // could be reordered with respect to later global/LDS memory
  821| 3.85k|      // operations of the same wave.
  822| 3.85k|      LGKMCnt |= IsCrossAddrSpaceOrdering;
  823| 3.85k|      break;
  824| 3.85k|    case SIAtomicScope::WORKGROUP:
  825|   664|    case SIAtomicScope::WAVEFRONT:
  826|   664|    case SIAtomicScope::SINGLETHREAD:
  827|   664|      // The GDS keeps all memory operations in order for
  828|   664|      // the same work-group.
  829|   664|      break;
  830|   664|    default:
  831|     0|      llvm_unreachable("Unsupported synchronization scope");
  832| 5.00k|    }
  833| 5.00k|  }
  834| 5.00k|
  835| 5.00k|  if (VMCnt || LGKMCnt) {
  836| 4.27k|    unsigned WaitCntImmediate =
  837| 4.27k|      AMDGPU::encodeWaitcnt(IV,
  838| 4.27k|                            VMCnt ? 0 : getVmcntBitMask(IV),
  839| 4.27k|                            getExpcntBitMask(IV),
  840| 4.27k|                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
  841| 4.27k|    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
  842| 4.27k|    Changed = true;
  843| 4.27k|  }
  844| 5.00k|
  845| 5.00k|  if (Pos == Position::AFTER)
  846| 2.16k|    --MI;
  847| 5.00k|
  848| 5.00k|  return Changed;
  849| 5.00k|}
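
The waitcnt encoding above forces a counter to zero only when it must be drained and otherwise leaves it at its all-ones "don't wait" bitmask. For an agent-scope operation ordering both global and LDS/GDS accesses with cross-address-space ordering, VMCnt and LGKMCnt are both set, so the emitted instruction is, roughly:

    // encodeWaitcnt(IV, /*Vmcnt=*/0, getExpcntBitMask(IV), /*Lgkmcnt=*/0)
    // prints as:
    //   s_waitcnt vmcnt(0) lgkmcnt(0)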
  850|      |
  851|      |bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
  852|      |                                               SIAtomicScope Scope,
  853|      |                                               SIAtomicAddrSpace AddrSpace,
  854| 2.17k|                                               Position Pos) const {
  855| 2.17k|  bool Changed = false;
  856| 2.17k|
  857| 2.17k|  MachineBasicBlock &MBB = *MI->getParent();
  858| 2.17k|  DebugLoc DL = MI->getDebugLoc();
  859| 2.17k|
  860| 2.17k|  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
  861| 2.17k|
  862| 2.17k|  const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
  863| 2.17k|                             ? AMDGPU::BUFFER_WBINVL1
  864| 2.17k|                             : AMDGPU::BUFFER_WBINVL1_VOL;
  865| 2.17k|
  866| 2.17k|  if (Pos == Position::AFTER)
  867| 1.88k|    ++MI;
  868| 2.17k|
  869| 2.17k|  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
  870| 2.15k|    switch (Scope) {
  871| 2.15k|    case SIAtomicScope::SYSTEM:
  872| 1.73k|    case SIAtomicScope::AGENT:
  873| 1.73k|      BuildMI(MBB, MI, DL, TII->get(Flush));
  874| 1.73k|      Changed = true;
  875| 1.73k|      break;
  876| 1.73k|    case SIAtomicScope::WORKGROUP:
  877|   417|    case SIAtomicScope::WAVEFRONT:
  878|   417|    case SIAtomicScope::SINGLETHREAD:
  879|   417|      // No cache to invalidate.
  880|   417|      break;
  881|   417|    default:
  882|     0|      llvm_unreachable("Unsupported synchronization scope");
  883| 2.17k|    }
  884| 2.17k|  }
  885| 2.17k|
  886| 2.17k|  /// The scratch address space does not need the global memory cache
  887| 2.17k|  /// to be flushed as all memory operations by the same thread are
  888| 2.17k|  /// sequentially consistent, and no other thread can access scratch
  889| 2.17k|  /// memory.
  890| 2.17k|
  891| 2.17k|  /// Other address spaces do not have a cache.
  892| 2.17k|
  893| 2.17k|  if (Pos == Position::AFTER)
  894| 1.88k|    --MI;
  895| 2.17k|
  896| 2.17k|  return Changed;
  897| 2.17k|}
  898|      |
  899|      |bool SIGfx10CacheControl::enableLoadCacheBypass(
  900|      |    const MachineBasicBlock::iterator &MI,
  901|      |    SIAtomicScope Scope,
  902|    60|    SIAtomicAddrSpace AddrSpace) const {
  903|    60|  assert(MI->mayLoad() && !MI->mayStore());
  904|    60|  bool Changed = false;
  905|    60|
  906|    60|  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
  907|    60|    /// TODO: Do not set glc for rmw atomic operations as they
  908|    60|    /// implicitly bypass the L0/L1 caches.
  909|    60|
  910|    60|    switch (Scope) {
  911|    60|    case SIAtomicScope::SYSTEM:
  912|    24|    case SIAtomicScope::AGENT:
  913|    24|      Changed |= enableGLCBit(MI);
  914|    24|      Changed |= enableDLCBit(MI);
  915|    24|      break;
  916|    24|    case SIAtomicScope::WORKGROUP:
  917|    12|      // In WGP mode the waves of a work-group can be executing on either CU of
  918|    12|      // the WGP. Therefore the L0, which is per CU, must be bypassed. Otherwise,
  919|    12|      // in CU mode, all waves of a work-group are on the same CU, and so the
  920|    12|      // L0 does not need to be bypassed.
  921|    12|      if (!CuMode) Changed |= enableGLCBit(MI);
  922|    12|      break;
  923|    24|    case SIAtomicScope::WAVEFRONT:
  924|    24|    case SIAtomicScope::SINGLETHREAD:
  925|    24|      // No cache to bypass.
  926|    24|      break;
  927|    24|    default:
  928|     0|      llvm_unreachable("Unsupported synchronization scope");
  929|    60|    }
  930|    60|  }
  931|    60|
  932|    60|  /// The scratch address space does not need the global memory caches
  933|    60|  /// to be bypassed as all memory operations by the same thread are
  934|    60|  /// sequentially consistent, and no other thread can access scratch
  935|    60|  /// memory.
  936|    60|
  937|    60|  /// Other address spaces do not have a cache.
  938|    60|
  939|    60|  return Changed;
  940|    60|}
  941|      |
  942|      |bool SIGfx10CacheControl::enableNonTemporal(
  943|    30|    const MachineBasicBlock::iterator &MI) const {
  944|    30|  assert(MI->mayLoad() ^ MI->mayStore());
  945|    30|  bool Changed = false;
  946|    30|
  947|    30|  Changed |= enableSLCBit(MI);
  948|    30|  /// TODO: for store (non-rmw atomic) instructions also enableGLCBit(MI)
  949|    30|
  950|    30|  return Changed;
  951|    30|}
  952|      |
  953|      |bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
  954|      |                                                SIAtomicScope Scope,
  955|      |                                                SIAtomicAddrSpace AddrSpace,
  956|   520|                                                Position Pos) const {
  957|   520|  bool Changed = false;
  958|   520|
  959|   520|  MachineBasicBlock &MBB = *MI->getParent();
  960|   520|  DebugLoc DL = MI->getDebugLoc();
  961|   520|
  962|   520|  if (Pos == Position::AFTER)
  963|   392|    ++MI;
  964|   520|
  965|   520|  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
  966|   520|    switch (Scope) {
  967|   520|    case SIAtomicScope::SYSTEM:
  968|   240|    case SIAtomicScope::AGENT:
  969|   240|      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
  970|   240|      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
  971|   240|      Changed = true;
  972|   240|      break;
  973|   240|    case SIAtomicScope::WORKGROUP:
  974|   122|      // In WGP mode the waves of a work-group can be executing on either CU of
  975|   122|      // the WGP. Therefore the L0, which is per CU, must be invalidated. Otherwise,
  976|   122|      // in CU mode, all waves of a work-group are on the same CU, and so the
  977|   122|      // L0 does not need to be invalidated.
  978|   122|      if (!CuMode) {
  979|    62|        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
  980|    62|        Changed = true;
  981|    62|      }
  982|   122|      break;
  983|   240|    case SIAtomicScope::WAVEFRONT:
  984|   158|    case SIAtomicScope::SINGLETHREAD:
  985|   158|      // No cache to invalidate.
  986|   158|      break;
  987|   158|    default:
  988|     0|      llvm_unreachable("Unsupported synchronization scope");
  989|   520|    }
  990|   520|  }
  991|   520|
  992|   520|  /// The scratch address space does not need the global memory cache
  993|   520|  /// to be flushed as all memory operations by the same thread are
  994|   520|  /// sequentially consistent, and no other thread can access scratch
  995|   520|  /// memory.
  996|   520|
  997|   520|  /// Other address spaces do not have a cache.
  998|   520|
  999|   520|  if (Pos == Position::AFTER)
 1000|   392|    --MI;
 1001|   520|
 1002|   520|  return Changed;
 1003|   520|}
 1004|      |
 1005|      |bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
 1006|      |                                     SIAtomicScope Scope,
 1007|      |                                     SIAtomicAddrSpace AddrSpace,
 1008|      |                                     SIMemOp Op,
 1009|      |                                     bool IsCrossAddrSpaceOrdering,
 1010|   928|                                     Position Pos) const {
 1011|   928|  bool Changed = false;
 1012|   928|
 1013|   928|  MachineBasicBlock &MBB = *MI->getParent();
 1014|   928|  DebugLoc DL = MI->getDebugLoc();
 1015|   928|
 1016|   928|  if (Pos == Position::AFTER)
 1017|   392|    ++MI;
 1018|   928|
 1019|   928|  bool VMCnt = false;
 1020|   928|  bool VSCnt = false;
 1021|   928|  bool LGKMCnt = false;
 1022|   928|
 1023|   928|  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
 1024|   928|    switch (Scope) {
 1025|   928|    case SIAtomicScope::SYSTEM:
 1026|   428|    case SIAtomicScope::AGENT:
 1027|   428|      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
 1028|   340|        VMCnt |= true;
 1029|   428|      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
 1030|   324|        VSCnt |= true;
 1031|   428|      break;
 1032|   428|    case SIAtomicScope::WORKGROUP:
 1033|   216|      // In WGP mode the waves of a work-group can be executing on either CU of
 1034|   216|      // the WGP. Therefore we need to wait for operations to complete to ensure
 1035|   216|      // they are visible to waves in the other CU, as the L0 is per CU.
 1036|   216|      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
 1037|   216|      // which shares the same L0.
 1038|   216|      if (!CuMode) {
 1039|   110|        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
 1040|    88|          VMCnt |= true;
 1041|   110|        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
 1042|    84|          VSCnt |= true;
 1043|   110|      }
 1044|   216|      break;
 1045|   428|    case SIAtomicScope::WAVEFRONT:
 1046|   284|    case SIAtomicScope::SINGLETHREAD:
 1047|   284|      // The L0 cache keeps all memory operations in order for
 1048|   284|      // work-items in the same wavefront.
 1049|   284|      break;
 1050|   284|    default:
 1051|     0|      llvm_unreachable("Unsupported synchronization scope");
 1052|   928|    }
 1053|   928|  }
 1054|   928|
 1055|   928|  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
 1056|   928|    switch (Scope) {
 1057|   928|    case SIAtomicScope::SYSTEM:
 1058|   644|    case SIAtomicScope::AGENT:
 1059|   644|    case SIAtomicScope::WORKGROUP:
 1060|   644|      // If no cross address space ordering then an LDS waitcnt is not
 1061|   644|      // needed as LDS operations for all waves are executed in a
 1062|   644|      // total global ordering as observed by all waves. Required if
 1063|   644|      // also synchronizing with global/GDS memory as LDS operations
 1064|   644|      // could be reordered with respect to later global/GDS memory
 1065|   644|      // operations of the same wave.
 1066|   644|      LGKMCnt |= IsCrossAddrSpaceOrdering;
 1067|   644|      break;
 1068|   644|    case SIAtomicScope::WAVEFRONT:
 1069|   284|    case SIAtomicScope::SINGLETHREAD:
 1070|   284|      // The LDS keeps all memory operations in order for
 1071|   284|      // the same wavefront.
 1072|   284|      break;
 1073|   284|    default:
 1074|     0|      llvm_unreachable("Unsupported synchronization scope");
 1075|   928|    }
 1076|   928|  }
 1077|   928|
 1078|   928|  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
 1079|   582|    switch (Scope) {
 1080|   582|    case SIAtomicScope::SYSTEM:
 1081|   240|    case SIAtomicScope::AGENT:
 1082|   240|      // If no cross address space ordering then a GDS waitcnt is not
 1083|   240|      // needed as GDS operations for all waves are executed in a
 1084|   240|      // total global ordering as observed by all waves. Required if
 1085|   240|      // also synchronizing with global/LDS memory as GDS operations
 1086|   240|      // could be reordered with respect to later global/LDS memory
 1087|   240|      // operations of the same wave.
 1088|   240|      LGKMCnt |= IsCrossAddrSpaceOrdering;
 1089|   240|      break;
 1090|   342|    case SIAtomicScope::WORKGROUP:
 1091|   342|    case SIAtomicScope::WAVEFRONT:
 1092|   342|    case SIAtomicScope::SINGLETHREAD:
 1093|   342|      // The GDS keeps all memory operations in order for
 1094|   342|      // the same work-group.
 1095|   342|      break;
 1096|   342|    default:
 1097|     0|      llvm_unreachable("Unsupported synchronization scope");
 1098|   928|    }
 1099|   928|  }
 1100|   928|
 1101|   928|  if (VMCnt || LGKMCnt) {
 1102|   536|    unsigned WaitCntImmediate =
 1103|   536|      AMDGPU::encodeWaitcnt(IV,
 1104|   536|                            VMCnt ? 0 : getVmcntBitMask(IV),
 1105|   536|                            getExpcntBitMask(IV),
 1106|   536|                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
 1107|   536|    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
 1108|   536|    Changed = true;
 1109|   536|  }
 1110|   928|
 1111|   928|  if (VSCnt) {
 1112|   408|    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
 1113|   408|      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
 1114|   408|      .addImm(0);
 1115|   408|    Changed = true;
 1116|   408|  }
 1117|   928|
 1118|   928|  if (Pos == Position::AFTER)
 1119|   392|    --MI;
 1120|   928|
 1121|   928|  return Changed;
 1122|   928|}
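
The GFX10 variant differs from GFX6 in two ways visible above: loads and stores retire through separate counters (vmcnt vs. vscnt), so SIMemOp::LOAD and SIMemOp::STORE are tracked independently, and a separate S_WAITCNT_VSCNT with a null SGPR and immediate 0 is emitted whenever stores must drain. The emitted sequence is, roughly:

    //   s_waitcnt vmcnt(0) lgkmcnt(0)   ; when VMCnt/LGKMCnt are needed
    //   s_waitcnt_vscnt null, 0x0       ; when VSCnt is needed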
 1123|      |
 1124| 25.5k|bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
 1125| 25.5k|  if (AtomicPseudoMIs.empty())
 1126| 24.7k|    return false;
 1127|   764|
 1128|   764|  for (auto &MI : AtomicPseudoMIs)
 1129|   793|    MI->eraseFromParent();
 1130|   764|
 1131|   764|  AtomicPseudoMIs.clear();
 1132|   764|  return true;
 1133|   764|}
 1134|      |
 1135|      |bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
 1136| 24.5k|                                   MachineBasicBlock::iterator &MI) {
 1137| 24.5k|  assert(MI->mayLoad() && !MI->mayStore());
 1138| 24.5k|
 1139| 24.5k|  bool Changed = false;
 1140| 24.5k|
 1141| 24.5k|  if (MOI.isAtomic()) {
 1142|   333|    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
 1143|   333|        MOI.getOrdering() == AtomicOrdering::Acquire ||
 1144|   333|        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
 1145|   263|      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
 1146|   263|                                           MOI.getOrderingAddrSpace());
 1147|   263|    }
 1148|   333|
 1149|   333|    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
 1150|   115|      Changed |= CC->insertWait(MI, MOI.getScope(),
 1151|   115|                                MOI.getOrderingAddrSpace(),
 1152|   115|                                SIMemOp::LOAD | SIMemOp::STORE,
 1153|   115|                                MOI.getIsCrossAddressSpaceOrdering(),
 1154|   115|                                Position::BEFORE);
 1155|   333|
 1156|   333|    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
 1157|   333|        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
 1158|   185|      Changed |= CC->insertWait(MI, MOI.getScope(),
 1159|   185|                                MOI.getInstrAddrSpace(),
 1160|   185|                                SIMemOp::LOAD,
 1161|   185|                                MOI.getIsCrossAddressSpaceOrdering(),
 1162|   185|                                Position::AFTER);
 1163|   185|      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
 1164|   185|                                           MOI.getOrderingAddrSpace(),
 1165|   185|                                           Position::AFTER);
 1166|   185|    }
 1167|   333|
 1168|   333|    return Changed;
 1169|   333|  }
 1170| 24.2k|
 1171| 24.2k|  // Atomic instructions do not have the nontemporal attribute.
 1172| 24.2k|  if (MOI.isNonTemporal()) {
 1173|    43|    Changed |= CC->enableNonTemporal(MI);
 1174|    43|    return Changed;
 1175|    43|  }
 1176| 24.1k|
 1177| 24.1k|  return Changed;
 1178| 24.1k|}
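
Putting the pieces together, an acquire load at agent scope on a GFX7-class target (the common path in the counts above) is rewritten so the load bypasses the L1 cache, completion is waited on, and stale cache lines are invalidated after the access. Schematically, and only as an illustration of the expected output:

    //   buffer_load_dword v0, ... glc   ; enableLoadCacheBypass
    //   s_waitcnt vmcnt(0)              ; insertWait, Position::AFTER
    //   buffer_wbinvl1_vol              ; insertCacheInvalidate, Position::AFTER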
 1179|      |
 1180|      |bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
 1181| 39.1k|                                    MachineBasicBlock::iterator &MI) {
 1182| 39.1k|  assert(!MI->mayLoad() && MI->mayStore());
 1183| 39.1k|
 1184| 39.1k|  bool Changed = false;
 1185| 39.1k|
 1186| 39.1k|  if (MOI.isAtomic()) {
 1187|   369|    if (MOI.getOrdering() == AtomicOrdering::Release ||
 1188|   369|        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
 1189|   221|      Changed |= CC->insertWait(MI, MOI.getScope(),
 1190|   221|                                MOI.getOrderingAddrSpace(),
 1191|   221|                                SIMemOp::LOAD | SIMemOp::STORE,
 1192|   221|                                MOI.getIsCrossAddressSpaceOrdering(),
 1193|   221|                                Position::BEFORE);
 1194|   369|
 1195|   369|    return Changed;
 1196|   369|  }
 1197| 38.7k|
 1198| 38.7k|  // Atomic instructions do not have the nontemporal attribute.
 1199| 38.7k|  if (MOI.isNonTemporal()) {
 1200|    48|    Changed |= CC->enableNonTemporal(MI);
 1201|    48|    return Changed;
 1202|    48|  }
 1203| 38.7k|
 1204| 38.7k|  return Changed;
 1205| 38.7k|}
 1206|      |
 1207|      |bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
 1208|   793|                                          MachineBasicBlock::iterator &MI) {
 1209|   793|  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
 1210|   793|
 1211|   793|  AtomicPseudoMIs.push_back(MI);
 1212|   793|  bool Changed = false;
 1213|   793|
 1214|   793|  if (MOI.isAtomic()) {
 1215|   793|    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
 1216|   793|        MOI.getOrdering() == AtomicOrdering::Release ||
 1217|   793|        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
 1218|   793|        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
 1219|   793|      /// TODO: This relies on a barrier always generating a waitcnt
 1220|   793|      /// for LDS to ensure it is not reordered with the completion of
 1221|   793|      /// the preceding LDS operations. If barrier had a memory
 1222|   793|      /// ordering and memory scope, then library does not need to
 1223|   793|      /// generate a fence. Could add support in this file for
 1224|   793|      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
 1225|   793|      /// adding waitcnt before a S_BARRIER.
 1226|   793|      Changed |= CC->insertWait(MI, MOI.getScope(),
 1227|   793|                                MOI.getOrderingAddrSpace(),
 1228|   793|                                SIMemOp::LOAD | SIMemOp::STORE,
 1229|   793|                                MOI.getIsCrossAddressSpaceOrdering(),
 1230|   793|                                Position::BEFORE);
 1231|   793|
 1232|   793|    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
 1233|   793|        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
 1234|   793|        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
 1235|   569|      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
 1236|   569|                                           MOI.getOrderingAddrSpace(),
 1237|   569|                                           Position::BEFORE);
 1238|   793|
 1239|   793|    return Changed;
 1240|   793|  }
 1241|     0|
 1242|     0|  return Changed;
 1243|     0|}
 1244|      |
 1245|      |bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
 1246| 3.13k|  MachineBasicBlock::iterator &MI) {
 1247| 3.13k|  assert(MI->mayLoad() && MI->mayStore());
 1248| 3.13k|
 1249| 3.13k|  bool Changed = false;
 1250| 3.13k|
 1251| 3.13k|  if (MOI.isAtomic()) {
 1252| 2.53k|    if (MOI.getOrdering() == AtomicOrdering::Release ||
 1253| 2.53k|        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
 1254| 2.53k|        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
 1255| 2.53k|        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
 1256| 2.25k|      Changed |= CC->insertWait(MI, MOI.getScope(),
 1257| 2.25k|                                MOI.getOrderingAddrSpace(),
 1258| 2.25k|                                SIMemOp::LOAD | SIMemOp::STORE,
 1259| 2.25k|                                MOI.getIsCrossAddressSpaceOrdering(),
 1260| 2.25k|                                Position::BEFORE);
 1261| 2.53k|
 1262| 2.53k|    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
 1263| 2.53k|        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
 1264| 2.53k|        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
 1265| 2.53k|        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
 1266| 2.53k|        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
 1267| 2.36k|      Changed |= CC->insertWait(MI, MOI.getScope(),
 1268| 2.36k|                                MOI.getOrderingAddrSpace(),
 1269| 2.36k|                                isAtomicRet(*MI) ? SIMemOp::LOAD :
 1270| 2.36k|                                                   SIMemOp::STORE,
 1271| 2.36k|                                MOI.getIsCrossAddressSpaceOrdering(),
 1272| 2.36k|                                Position::AFTER);
 1273| 2.36k|      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
 1274| 2.36k|                                           MOI.getOrderingAddrSpace(),
 1275| 2.36k|                                           Position::AFTER);
 1276| 2.36k|    }
 1277| 2.53k|
 1278| 2.53k|    return Changed;
 1279| 2.53k|  }
 1280|   604|
 1281|   604|  return Changed;
 1282|   604|}
 1283|      |
 1284| 25.5k|bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
 1285| 25.5k|  bool Changed = false;
 1286| 25.5k|
 1287| 25.5k|  SIMemOpAccess MOA(MF);
 1288| 25.5k|  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
 1289| 25.5k|
 1290| 28.8k|  for (auto &MBB : MF) {
 1291|  411k|    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
 1292|  382k|      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
 1293|  314k|        continue;
 1294| 67.6k|
 1295| 67.6k|      if (const auto &MOI = MOA.getLoadInfo(MI))
 1296| 24.5k|        Changed |= expandLoad(MOI.getValue(), MI);
 1297| 43.1k|      else if (const auto &MOI = MOA.getStoreInfo(MI))
 1298| 39.1k|        Changed |= expandStore(MOI.getValue(), MI);
 1299| 3.96k|      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
 1300|   793|        Changed |= expandAtomicFence(MOI.getValue(), MI);
 1301| 3.16k|      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
 1302| 3.13k|        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
 1303| 67.6k|    }
 1304| 28.8k|  }
 1305| 25.5k|
 1306| 25.5k|  Changed |= removeAtomicPseudoMIs();
 1307| 25.5k|  return Changed;
 1308| 25.5k|}
 1309|      |
 1310|      |INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
 1311|      |
 1312|      |char SIMemoryLegalizer::ID = 0;
 1313|      |char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
 1314|      |
 1315| 2.44k|FunctionPass *llvm::createSIMemoryLegalizerPass() {
 1316| 2.44k|  return new SIMemoryLegalizer();
 1317| 2.44k|}
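
createSIMemoryLegalizerPass is the factory the AMDGPU target uses when assembling its codegen pipeline; presumably something like the following in AMDGPUTargetMachine (an assumption of this note, not shown by this report):

    // Hypothetical call site; the pass runs late, after instruction selection:
    // addPass(createSIMemoryLegalizerPass());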