Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//
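
// Rough sketch of the transformation performed below (illustrative only,
// wave64 assumed): if every active lane of a wavefront executes
//   %old = atomicrmw add i32* %ptr, i32 %val
// with a uniform %val, the pass arranges for a single lane to perform
//   %partial = atomicrmw add i32* %ptr, i32 (%val * <number of active lanes>)
// and for every lane to then reconstruct its own result as
//   readfirstlane(%partial) + %val * <number of active lanes below this lane>
// Divergent 32-bit values are handled with a DPP-based exclusive scan that
// supplies the per-lane offsets instead.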
16
#include "AMDGPU.h"
17
#include "AMDGPUSubtarget.h"
18
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
19
#include "llvm/CodeGen/TargetPassConfig.h"
20
#include "llvm/IR/IRBuilder.h"
21
#include "llvm/IR/InstVisitor.h"
22
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
23
24
#define DEBUG_TYPE "amdgpu-atomic-optimizer"
25
26
using namespace llvm;
27
28
namespace {
29
30
enum DPP_CTRL {
31
  DPP_ROW_SR1 = 0x111,
32
  DPP_ROW_SR2 = 0x112,
33
  DPP_ROW_SR3 = 0x113,
34
  DPP_ROW_SR4 = 0x114,
35
  DPP_ROW_SR8 = 0x118,
36
  DPP_WF_SR1 = 0x138,
37
  DPP_ROW_BCAST15 = 0x142,
38
  DPP_ROW_BCAST31 = 0x143
39
};
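
// The values above appear to be the raw dpp_ctrl operand encodings for the
// DPP row shift-right (row_shr:1/2/3/4/8), wavefront shift-right (wave_shr:1)
// and row broadcast (row_bcast:15/31) modes used by the scan in
// optimizeAtomic below; see the AMD GCN ISA documentation for the exact
// field layout.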

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  bool HasDPP;
  bool IsPixelShader;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  HasDPP = ST.hasDPP();
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergent(I.getOperand(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}
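
// For illustration: an `atomicrmw add` on a global pointer that is uniform
// across the wavefront qualifies under the checks above (e.g. every lane
// incrementing the same counter), whereas an atomic whose pointer differs per
// lane is rejected.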

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergent(I.getOperand(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}
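
// For the comparison-based operations this produces a compare-and-select,
// e.g. the signed max of LHS and RHS becomes `select (icmp sgt LHS, RHS),
// LHS, RHS`.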

static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}
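
// Concretely, for 32-bit types the identity is 0 for add/sub/or/xor/umax,
// 0xffffffff for and/umin, INT32_MIN for signed max and INT32_MAX for signed
// min: combining any value with its operation's identity leaves the value
// unchanged, which is what allows inactive lanes to be folded in safely.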

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  CallInst *const Ballot = B.CreateIntrinsic(
      Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()},
      {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
  Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
  Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
  CallInst *const PartialMbcnt = B.CreateIntrinsic(
      Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
  Value *const Mbcnt =
      B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
                                        {ExtractHi, PartialMbcnt}),
                      Ty, false);
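
  // Illustrative example (wave64 assumed): if the ballot of active lanes is
  // 0x0000000000000013 (lanes 0, 1 and 4 active), then lane 0 gets Mbcnt = 0,
  // lane 1 gets Mbcnt = 1 and lane 4 gets Mbcnt = 2, i.e. each lane learns how
  // many active lanes sit strictly below it.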

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    CallInst *const SetInactive =
        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

    ExclScan =
        B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
                          {Identity, SetInactive, B.getInt32(DPP_WF_SR1),
                           B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    const unsigned Iters = 6;
    const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1,     DPP_ROW_SR2,
                                     DPP_ROW_SR4,     DPP_ROW_SR8,
                                     DPP_ROW_BCAST15, DPP_ROW_BCAST31};
    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
    const unsigned BankMask[Iters] = {0xf, 0xf, 0xe, 0xc, 0xf, 0xf};

    // This loop performs an exclusive scan across the wavefront, with all lanes
    // active (by using the WWM intrinsic).
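    //
    // The constants above appear to follow the usual DPP scan recipe: the
    // shifts by 1, 2, 4 and 8 lanes combine partial results within each
    // 16-lane row, then row_bcast:15 with row mask 0xa folds the totals of
    // rows 0 and 2 into rows 1 and 3, and row_bcast:31 with row mask 0xc
    // folds the total of the lower 32 lanes into the upper 32 lanes.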
    for (unsigned Idx = 0; Idx < Iters; Idx++) {
      CallInst *const DPP = B.CreateIntrinsic(
          Intrinsic::amdgcn_update_dpp, Ty,
          {Identity, ExclScan, B.getInt32(DPPCtrl[Idx]),
           B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});

      ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP);
    }

    NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan);

    // Read the value from the last lane, which has accumulated the values of
    // each active lane in the wavefront. This will be our new value which we
    // will provide to the atomic operation.
    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
      CallInst *const ReadLaneLo = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
      CallInst *const ReadLaneHi = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
      NewV = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, B.getInt32(63)});
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = B.CreateMul(V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }
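
  // At this point NewV holds the value the whole wavefront will contribute in
  // one go: for example, a uniform add of V with 10 active lanes contributes
  // 10 * V, while in the divergent case it is the inclusive-scan total read
  // from the last lane.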

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  const bool NeedResult = !I.use_empty();
  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = B.CreateMul(V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
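
    // For the running add example, each lane's Result is
    // readfirstlane(old) + V * Mbcnt, which matches what that lane would have
    // observed had the lanes performed their atomics one at a time in lane
    // order.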

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}