Coverage Report

Created: 2017-10-03 07:32

/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/CodeGen/AtomicExpandPass.cpp
Line | Count | Source
1
//===- AtomicExpandPass.cpp - Expand atomic instructions ------------------===//
2
//
3
//                     The LLVM Compiler Infrastructure
4
//
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
7
//
8
//===----------------------------------------------------------------------===//
9
//
10
// This file contains a pass (at IR level) to replace atomic instructions with
11
// __atomic_* library calls, or target specific instruction which implement the
12
// same semantics in a way which better fits the target backend.  This can
13
// include the use of (intrinsic-based) load-linked/store-conditional loops,
14
// AtomicCmpXchg, or type coercions.
15
//
16
//===----------------------------------------------------------------------===//
17
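For context, a backend opts into this pass by adding it to its IR pass pipeline ahead of instruction selection. A minimal sketch, assuming a hypothetical MyTargetPassConfig subclass (the class name and hook placement are illustrative, not taken from this file):

    // Sketch: where AtomicExpand is typically scheduled (names hypothetical).
    void MyTargetPassConfig::addIRPasses() {
      // Expand atomic operations the target cannot lower natively, before ISel runs.
      addPass(createAtomicExpandPass());
      TargetPassConfig::addIRPasses();
    }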
18
#include "llvm/ADT/ArrayRef.h"
19
#include "llvm/ADT/STLExtras.h"
20
#include "llvm/ADT/SmallVector.h"
21
#include "llvm/CodeGen/AtomicExpandUtils.h"
22
#include "llvm/CodeGen/RuntimeLibcalls.h"
23
#include "llvm/CodeGen/TargetPassConfig.h"
24
#include "llvm/CodeGen/ValueTypes.h"
25
#include "llvm/IR/Attributes.h"
26
#include "llvm/IR/BasicBlock.h"
27
#include "llvm/IR/Constant.h"
28
#include "llvm/IR/Constants.h"
29
#include "llvm/IR/DataLayout.h"
30
#include "llvm/IR/DerivedTypes.h"
31
#include "llvm/IR/Function.h"
32
#include "llvm/IR/IRBuilder.h"
33
#include "llvm/IR/InstIterator.h"
34
#include "llvm/IR/Instruction.h"
35
#include "llvm/IR/Instructions.h"
36
#include "llvm/IR/Module.h"
37
#include "llvm/IR/Type.h"
38
#include "llvm/IR/User.h"
39
#include "llvm/IR/Value.h"
40
#include "llvm/Pass.h"
41
#include "llvm/Support/AtomicOrdering.h"
42
#include "llvm/Support/Casting.h"
43
#include "llvm/Support/Debug.h"
44
#include "llvm/Support/ErrorHandling.h"
45
#include "llvm/Support/raw_ostream.h"
46
#include "llvm/Target/TargetLowering.h"
47
#include "llvm/Target/TargetMachine.h"
48
#include "llvm/Target/TargetSubtargetInfo.h"
49
#include <cassert>
50
#include <cstdint>
51
#include <iterator>
52
53
using namespace llvm;
54
55
#define DEBUG_TYPE "atomic-expand"
56
57
namespace {
58
59
  class AtomicExpand: public FunctionPass {
60
    const TargetLowering *TLI = nullptr;
61
62
  public:
63
    static char ID; // Pass identification, replacement for typeid
64
65
30.6k
    AtomicExpand() : FunctionPass(ID) {
66
30.6k
      initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
67
30.6k
    }
68
69
    bool runOnFunction(Function &F) override;
70
71
  private:
72
    bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
73
    IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
74
    LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
75
    bool tryExpandAtomicLoad(LoadInst *LI);
76
    bool expandAtomicLoadToLL(LoadInst *LI);
77
    bool expandAtomicLoadToCmpXchg(LoadInst *LI);
78
    StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
79
    bool expandAtomicStore(StoreInst *SI);
80
    bool tryExpandAtomicRMW(AtomicRMWInst *AI);
81
    Value *
82
    insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
83
                      AtomicOrdering MemOpOrder,
84
                      function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
85
    void expandAtomicOpToLLSC(
86
        Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder,
87
        function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
88
    void expandPartwordAtomicRMW(
89
        AtomicRMWInst *I,
90
        TargetLoweringBase::AtomicExpansionKind ExpansionKind);
91
    void expandPartwordCmpXchg(AtomicCmpXchgInst *I);
92
93
    AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
94
    static Value *insertRMWCmpXchgLoop(
95
        IRBuilder<> &Builder, Type *ResultType, Value *Addr,
96
        AtomicOrdering MemOpOrder,
97
        function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
98
        CreateCmpXchgInstFun CreateCmpXchg);
99
100
    bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
101
    bool isIdempotentRMW(AtomicRMWInst *AI);
102
    bool simplifyIdempotentRMW(AtomicRMWInst *AI);
103
104
    bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align,
105
                                 Value *PointerOperand, Value *ValueOperand,
106
                                 Value *CASExpected, AtomicOrdering Ordering,
107
                                 AtomicOrdering Ordering2,
108
                                 ArrayRef<RTLIB::Libcall> Libcalls);
109
    void expandAtomicLoadToLibcall(LoadInst *LI);
110
    void expandAtomicStoreToLibcall(StoreInst *LI);
111
    void expandAtomicRMWToLibcall(AtomicRMWInst *I);
112
    void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);
113
114
    friend bool
115
    llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
116
                                   CreateCmpXchgInstFun CreateCmpXchg);
117
  };
118
119
} // end anonymous namespace
120
121
char AtomicExpand::ID = 0;
122
123
char &llvm::AtomicExpandID = AtomicExpand::ID;
124
125
INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions",
126
                false, false)
127
128
30.6k
FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); }
129
130
// Helper functions to retrieve the size of atomic instructions.
131
1.00k
static unsigned getAtomicOpSize(LoadInst *LI) {
132
1.00k
  const DataLayout &DL = LI->getModule()->getDataLayout();
133
1.00k
  return DL.getTypeStoreSize(LI->getType());
134
1.00k
}
135
136
12.1k
static unsigned getAtomicOpSize(StoreInst *SI) {
137
12.1k
  const DataLayout &DL = SI->getModule()->getDataLayout();
138
12.1k
  return DL.getTypeStoreSize(SI->getValueOperand()->getType());
139
12.1k
}
140
141
81.4k
static unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
142
81.4k
  const DataLayout &DL = RMWI->getModule()->getDataLayout();
143
81.4k
  return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
144
81.4k
}
145
146
20.0k
static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
147
20.0k
  const DataLayout &DL = CASI->getModule()->getDataLayout();
148
20.0k
  return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
149
20.0k
}
150
151
// Helper functions to retrieve the alignment of atomic instructions.
152
1.00k
static unsigned getAtomicOpAlign(LoadInst *LI) {
153
1.00k
  unsigned Align = LI->getAlignment();
154
1.00k
  // In the future, if this IR restriction is relaxed, we should
155
1.00k
  // return DataLayout::getABITypeAlignment when there's no align
156
1.00k
  // value.
157
1.00k
  assert(Align != 0 && "An atomic LoadInst always has an explicit alignment");
158
1.00k
  return Align;
159
1.00k
}
160
161
12.1k
static unsigned getAtomicOpAlign(StoreInst *SI) {
162
12.1k
  unsigned Align = SI->getAlignment();
163
12.1k
  // In the future, if this IR restriction is relaxed, we should
164
12.1k
  // return DataLayout::getABITypeAlignment when there's no align
165
12.1k
  // value.
166
12.1k
  assert(Align != 0 && "An atomic StoreInst always has an explicit alignment");
167
12.1k
  return Align;
168
12.1k
}
169
170
42.2k
static unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) {
171
42.2k
  // TODO(PR27168): This instruction has no alignment attribute, but unlike the
172
42.2k
  // default alignment for load/store, the default here is to assume
173
42.2k
  // it has NATURAL alignment, not DataLayout-specified alignment.
174
42.2k
  const DataLayout &DL = RMWI->getModule()->getDataLayout();
175
42.2k
  return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
176
42.2k
}
177
178
10.0k
static unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) {
179
10.0k
  // TODO(PR27168): same comment as above.
180
10.0k
  const DataLayout &DL = CASI->getModule()->getDataLayout();
181
10.0k
  return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
182
10.0k
}
183
184
// Determine if a particular atomic operation has a supported size,
185
// and is of appropriate alignment, to be passed through for target
186
// lowering. (Versus turning into a __atomic libcall)
187
template <typename Inst>
188
65.4k
static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
189
65.4k
  unsigned Size = getAtomicOpSize(I);
190
65.4k
  unsigned Align = getAtomicOpAlign(I);
191
65.4k
  return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
192
65.4k
}
AtomicExpandPass.cpp:bool atomicSizeSupported<llvm::LoadInst>(llvm::TargetLowering const*, llvm::LoadInst*)
Line | Count | Source
188
999
static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
189
999
  unsigned Size = getAtomicOpSize(I);
190
999
  unsigned Align = getAtomicOpAlign(I);
191
998
  return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
192
999
}
AtomicExpandPass.cpp:bool atomicSizeSupported<llvm::StoreInst>(llvm::TargetLowering const*, llvm::StoreInst*)
Line | Count | Source
188
12.1k
static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
189
12.1k
  unsigned Size = getAtomicOpSize(I);
190
12.1k
  unsigned Align = getAtomicOpAlign(I);
191
12.1k
  return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
192
12.1k
}
AtomicExpandPass.cpp:bool atomicSizeSupported<llvm::AtomicRMWInst>(llvm::TargetLowering const*, llvm::AtomicRMWInst*)
Line | Count | Source
188
42.2k
static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
189
42.2k
  unsigned Size = getAtomicOpSize(I);
190
42.2k
  unsigned Align = getAtomicOpAlign(I);
191
42.2k
  return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
192
42.2k
}
AtomicExpandPass.cpp:bool atomicSizeSupported<llvm::AtomicCmpXchgInst>(llvm::TargetLowering const*, llvm::AtomicCmpXchgInst*)
Line | Count | Source
188
10.0k
static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
189
10.0k
  unsigned Size = getAtomicOpSize(I);
190
10.0k
  unsigned Align = getAtomicOpAlign(I);
191
10.0k
  return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
192
10.0k
}
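The predicate above is a plain size/alignment comparison; operations that fail it are routed to the __atomic_* libcall expansions. A standalone restatement of the same check (a sketch for illustration, not code from this file):

    #include <cstdio>

    // Inline lowering requires natural (or better) alignment and a size no
    // larger than what the target reports it can handle atomically.
    static bool sizeSupported(unsigned SizeBytes, unsigned AlignBytes,
                              unsigned MaxAtomicSizeInBits) {
      return AlignBytes >= SizeBytes && SizeBytes <= MaxAtomicSizeInBits / 8;
    }

    int main() {
      // Assuming a target that supports up to 64-bit inline atomics:
      std::printf("%d\n", sizeSupported(8, 8, 64));   // 1 -> lowered inline
      std::printf("%d\n", sizeSupported(16, 16, 64)); // 0 -> __atomic_* libcall
      std::printf("%d\n", sizeSupported(8, 4, 64));   // 0 -> under-aligned, libcall
      return 0;
    }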
193
194
570k
bool AtomicExpand::runOnFunction(Function &F) {
195
570k
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
196
570k
  if (!TPC)
197
0
    return false;
198
570k
199
570k
  auto &TM = TPC->getTM<TargetMachine>();
200
570k
  if (!TM.getSubtargetImpl(F)->enableAtomicExpand())
201
4.07k
    return false;
202
566k
  TLI = TM.getSubtargetImpl(F)->getTargetLowering();
203
566k
204
566k
  SmallVector<Instruction *, 1> AtomicInsts;
205
566k
206
566k
  // Changing control-flow while iterating through it is a bad idea, so gather a
207
566k
  // list of all atomic instructions before we start.
208
20.0M
  for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
209
19.5M
    Instruction *I = &*II;
210
19.5M
    if (I->isAtomic() && !isa<FenceInst>(I))
211
65.4k
      AtomicInsts.push_back(I);
212
19.5M
  }
213
566k
214
566k
  bool MadeChange = false;
215
65.4k
  for (auto I : AtomicInsts) {
216
65.4k
    auto LI = dyn_cast<LoadInst>(I);
217
65.4k
    auto SI = dyn_cast<StoreInst>(I);
218
65.4k
    auto RMWI = dyn_cast<AtomicRMWInst>(I);
219
65.4k
    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
220
65.4k
    assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");
221
65.4k
222
65.4k
    // If the Size/Alignment is not supported, replace with a libcall.
223
65.4k
    if (LI) {
224
999
      if (!atomicSizeSupported(TLI, LI)) {
225
4
        expandAtomicLoadToLibcall(LI);
226
4
        MadeChange = true;
227
4
        continue;
228
4
      }
229
64.4k
    } else if (SI) {
230
12.1k
      if (!atomicSizeSupported(TLI, SI)) {
231
5
        expandAtomicStoreToLibcall(SI);
232
5
        MadeChange = true;
233
5
        continue;
234
5
      }
235
52.2k
    } else if (RMWI) {
236
42.2k
      if (!atomicSizeSupported(TLI, RMWI)) {
237
4
        expandAtomicRMWToLibcall(RMWI);
238
4
        MadeChange = true;
239
4
        continue;
240
4
      }
241
10.0k
    } else if (CASI) {
242
10.0k
      if (!atomicSizeSupported(TLI, CASI)) {
243
3
        expandAtomicCASToLibcall(CASI);
244
3
        MadeChange = true;
245
3
        continue;
246
3
      }
247
65.3k
    }
248
65.3k
249
65.3k
    if (TLI->shouldInsertFencesForAtomic(I)) {
250
3.07k
      auto FenceOrdering = AtomicOrdering::Monotonic;
251
3.07k
      if (LI && isAcquireOrStronger(LI->getOrdering())) {
252
191
        FenceOrdering = LI->getOrdering();
253
191
        LI->setOrdering(AtomicOrdering::Monotonic);
254
3.07k
      } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
255
233
        FenceOrdering = SI->getOrdering();
256
233
        SI->setOrdering(AtomicOrdering::Monotonic);
257
2.88k
      } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
258
2.65k
                          isAcquireOrStronger(RMWI->getOrdering()))) {
259
1.37k
        FenceOrdering = RMWI->getOrdering();
260
1.37k
        RMWI->setOrdering(AtomicOrdering::Monotonic);
261
2.65k
      } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
262
188
                 (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
263
1.28k
                  isAcquireOrStronger(CASI->getSuccessOrdering()))) {
264
127
        // If a compare and swap is lowered to LL/SC, we can do smarter fence
265
127
        // insertion, with a stronger one on the success path than on the
266
127
        // failure path. As a result, fence insertion is directly done by
267
127
        // expandAtomicCmpXchg in that case.
268
127
        FenceOrdering = CASI->getSuccessOrdering();
269
127
        CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
270
127
        CASI->setFailureOrdering(AtomicOrdering::Monotonic);
271
127
      }
272
3.07k
273
3.07k
      if (FenceOrdering != AtomicOrdering::Monotonic) {
274
1.92k
        MadeChange |= bracketInstWithFences(I, FenceOrdering);
275
1.92k
      }
276
3.07k
    }
277
65.3k
278
65.3k
    if (LI) {
279
995
      if (LI->getType()->isFloatingPointTy()) {
280
14
        // TODO: add a TLI hook to control this so that each target can
281
14
        // convert to lowering the original type one at a time.
282
14
        LI = convertAtomicLoadToIntegerType(LI);
283
14
        assert(LI->getType()->isIntegerTy() && "invariant broken");
284
14
        MadeChange = true;
285
14
      }
286
995
287
995
      MadeChange |= tryExpandAtomicLoad(LI);
288
65.3k
    } else if (SI) {
289
12.1k
      if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
290
14
        // TODO: add a TLI hook to control this so that each target can
291
14
        // convert to lowering the original type one at a time.
292
14
        SI = convertAtomicStoreToIntegerType(SI);
293
14
        assert(SI->getValueOperand()->getType()->isIntegerTy() &&
294
14
               "invariant broken");
295
14
        MadeChange = true;
296
14
      }
297
12.1k
298
12.1k
      if (TLI->shouldExpandAtomicStoreInIR(SI))
299
101
        MadeChange |= expandAtomicStore(SI);
300
64.3k
    } else if (RMWI) {
301
42.2k
      // There are two different ways of expanding RMW instructions:
302
42.2k
      // - into a load if it is idempotent
303
42.2k
      // - into a Cmpxchg/LL-SC loop otherwise
304
42.2k
      // we try them in that order.
305
42.2k
306
42.2k
      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
307
11
        MadeChange = true;
308
42.2k
      } else {
309
42.2k
        MadeChange |= tryExpandAtomicRMW(RMWI);
310
42.2k
      }
311
52.2k
    } else if (CASI) {
312
10.0k
      // TODO: when we're ready to make the change at the IR level, we can
313
10.0k
      // extend convertCmpXchgToInteger for floating point too.
314
10.0k
      assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
315
10.0k
             "unimplemented - floating point not legal at IR level");
316
10.0k
      if (CASI->getCompareOperand()->getType()->isPointerTy()) {
317
6
        // TODO: add a TLI hook to control this so that each target can
318
6
        // convert to lowering the original type one at a time.
319
6
        CASI = convertCmpXchgToIntegerType(CASI);
320
6
        assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
321
6
               "invariant broken");
322
6
        MadeChange = true;
323
6
      }
324
10.0k
325
10.0k
      unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
326
10.0k
      unsigned ValueSize = getAtomicOpSize(CASI);
327
10.0k
      if (ValueSize < MinCASSize) {
328
4
        assert(!TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
329
4
               "MinCmpXchgSizeInBits not yet supported for LL/SC expansions.");
330
4
        expandPartwordCmpXchg(CASI);
331
10.0k
      } else {
332
10.0k
        if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
333
8.96k
          MadeChange |= expandAtomicCmpXchg(CASI);
334
10.0k
      }
335
64.3k
    }
336
65.4k
  }
337
570k
  return MadeChange;
338
570k
}
339
340
1.92k
bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) {
341
1.92k
  IRBuilder<> Builder(I);
342
1.92k
343
1.92k
  auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
344
1.92k
345
1.92k
  auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
346
1.92k
  // We have a guard here because not every atomic operation generates a
347
1.92k
  // trailing fence.
348
1.92k
  if (TrailingFence)
349
1.51k
    TrailingFence->moveAfter(I);
350
1.92k
351
471
  return (LeadingFence || TrailingFence);
352
1.92k
}
353
354
/// Get the iX type with the same bitwidth as T.
355
IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
356
34
                                                       const DataLayout &DL) {
357
34
  EVT VT = TLI->getValueType(DL, T);
358
34
  unsigned BitWidth = VT.getStoreSizeInBits();
359
34
  assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
360
34
  return IntegerType::get(T->getContext(), BitWidth);
361
34
}
362
363
/// Convert an atomic load of a non-integral type to an integer load of the
364
/// equivalent bitwidth.  See the function comment on
365
/// convertAtomicStoreToIntegerType for background.  
366
14
LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
367
14
  auto *M = LI->getModule();
368
14
  Type *NewTy = getCorrespondingIntegerType(LI->getType(),
369
14
                                            M->getDataLayout());
370
14
371
14
  IRBuilder<> Builder(LI);
372
14
  
373
14
  Value *Addr = LI->getPointerOperand();
374
14
  Type *PT = PointerType::get(NewTy,
375
14
                              Addr->getType()->getPointerAddressSpace());
376
14
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);
377
14
  
378
14
  auto *NewLI = Builder.CreateLoad(NewAddr);
379
14
  NewLI->setAlignment(LI->getAlignment());
380
14
  NewLI->setVolatile(LI->isVolatile());
381
14
  NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
382
14
  DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
383
14
  
384
14
  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
385
14
  LI->replaceAllUsesWith(NewVal);
386
14
  LI->eraseFromParent();
387
14
  return NewLI;
388
14
}
389
390
1.00k
bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
391
1.00k
  switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
392
911
  case TargetLoweringBase::AtomicExpansionKind::None:
393
911
    return false;
394
2
  case TargetLoweringBase::AtomicExpansionKind::LLSC:
395
2
    expandAtomicOpToLLSC(
396
2
        LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(),
397
2
        [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
398
2
    return true;
399
50
  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
400
50
    return expandAtomicLoadToLL(LI);
401
43
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
402
43
    return expandAtomicLoadToCmpXchg(LI);
403
0
  }
404
0
  llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
405
0
}
406
407
50
bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
408
50
  IRBuilder<> Builder(LI);
409
50
410
50
  // On some architectures, load-linked instructions are atomic for larger
411
50
  // sizes than normal loads. For example, the only 64-bit load guaranteed
412
50
  // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
413
50
  Value *Val =
414
50
      TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
415
50
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
416
50
417
50
  LI->replaceAllUsesWith(Val);
418
50
  LI->eraseFromParent();
419
50
420
50
  return true;
421
50
}
422
423
43
bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
424
43
  IRBuilder<> Builder(LI);
425
43
  AtomicOrdering Order = LI->getOrdering();
426
43
  Value *Addr = LI->getPointerOperand();
427
43
  Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
428
43
  Constant *DummyVal = Constant::getNullValue(Ty);
429
43
430
43
  Value *Pair = Builder.CreateAtomicCmpXchg(
431
43
      Addr, DummyVal, DummyVal, Order,
432
43
      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
433
43
  Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");
434
43
435
43
  LI->replaceAllUsesWith(Loaded);
436
43
  LI->eraseFromParent();
437
43
438
43
  return true;
439
43
}
440
441
/// Convert an atomic store of a non-integral type to an integer store of the
442
/// equivalent bitwidth.  We used to not support floating point or vector
443
/// atomics in the IR at all.  The backends learned to deal with the bitcast
444
/// idiom because that was the only way of expressing the notion of an atomic
445
/// float or vector store.  The long term plan is to teach each backend to
446
/// instruction select from the original atomic store, but as a migration
447
/// mechanism, we convert back to the old format which the backends understand.
448
/// Each backend will need individual work to recognize the new format.
449
14
StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
450
14
  IRBuilder<> Builder(SI);
451
14
  auto *M = SI->getModule();
452
14
  Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
453
14
                                            M->getDataLayout());
454
14
  Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);
455
14
  
456
14
  Value *Addr = SI->getPointerOperand();
457
14
  Type *PT = PointerType::get(NewTy,
458
14
                              Addr->getType()->getPointerAddressSpace());
459
14
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);
460
14
461
14
  StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
462
14
  NewSI->setAlignment(SI->getAlignment());
463
14
  NewSI->setVolatile(SI->isVolatile());
464
14
  NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
465
14
  DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
466
14
  SI->eraseFromParent();
467
14
  return NewSI;
468
14
}
469
470
101
bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
471
101
  // This function is only called on atomic stores that are too large to be
472
101
  // atomic if implemented as a native store. So we replace them by an
473
101
  // atomic swap, that can be implemented for example as a ldrex/strex on ARM
474
101
  // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
475
101
  // It is the responsibility of the target to only signal expansion via
476
101
  // shouldExpandAtomicRMW in cases where this is required and possible.
477
101
  IRBuilder<> Builder(SI);
478
101
  AtomicRMWInst *AI =
479
101
      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
480
101
                              SI->getValueOperand(), SI->getOrdering());
481
101
  SI->eraseFromParent();
482
101
483
101
  // Now we have an appropriate swap instruction, lower it as usual.
484
101
  return tryExpandAtomicRMW(AI);
485
101
}
486
487
static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
488
                                 Value *Loaded, Value *NewVal,
489
                                 AtomicOrdering MemOpOrder,
490
1.10k
                                 Value *&Success, Value *&NewLoaded) {
491
1.10k
  Value* Pair = Builder.CreateAtomicCmpXchg(
492
1.10k
      Addr, Loaded, NewVal, MemOpOrder,
493
1.10k
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
494
1.10k
  Success = Builder.CreateExtractValue(Pair, 1, "success");
495
1.10k
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
496
1.10k
}
497
498
/// Emit IR to implement the given atomicrmw operation on values in registers,
499
/// returning the new value.
500
static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
501
39.1k
                              Value *Loaded, Value *Inc) {
502
39.1k
  Value *NewVal;
503
39.1k
  switch (Op) {
504
11.9k
  case AtomicRMWInst::Xchg:
505
11.9k
    return Inc;
506
19.7k
  case AtomicRMWInst::Add:
507
19.7k
    return Builder.CreateAdd(Loaded, Inc, "new");
508
5.81k
  case AtomicRMWInst::Sub:
509
5.81k
    return Builder.CreateSub(Loaded, Inc, "new");
510
455
  case AtomicRMWInst::And:
511
455
    return Builder.CreateAnd(Loaded, Inc, "new");
512
83
  case AtomicRMWInst::Nand:
513
83
    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
514
457
  case AtomicRMWInst::Or:
515
457
    return Builder.CreateOr(Loaded, Inc, "new");
516
454
  case AtomicRMWInst::Xor:
517
454
    return Builder.CreateXor(Loaded, Inc, "new");
518
51
  case AtomicRMWInst::Max:
519
51
    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
520
51
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
521
59
  case AtomicRMWInst::Min:
522
59
    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
523
59
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
524
58
  case AtomicRMWInst::UMax:
525
58
    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
526
58
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
527
59
  case AtomicRMWInst::UMin:
528
59
    NewVal = Builder.CreateICmpULE(Loaded, Inc);
529
59
    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
530
0
  default:
531
0
    llvm_unreachable("Unknown atomic op");
532
0
  }
533
0
}
534
535
42.3k
bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
536
42.3k
  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
537
3.16k
  case TargetLoweringBase::AtomicExpansionKind::None:
538
3.16k
    return false;
539
38.0k
  case TargetLoweringBase::AtomicExpansionKind::LLSC: {
540
38.0k
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
541
38.0k
    unsigned ValueSize = getAtomicOpSize(AI);
542
38.0k
    if (ValueSize < MinCASSize) {
543
0
      llvm_unreachable(
544
0
          "MinCmpXchgSizeInBits not yet supported for LL/SC architectures.");
545
0
    } else {
546
38.0k
      auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
547
38.0k
        return performAtomicOp(AI->getOperation(), Builder, Loaded,
548
38.0k
                               AI->getValOperand());
549
38.0k
      };
550
38.0k
      expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(),
551
38.0k
                           AI->getOrdering(), PerformOp);
552
38.0k
    }
553
38.0k
    return true;
554
38.0k
  }
555
1.10k
  case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
556
1.10k
    unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
557
1.10k
    unsigned ValueSize = getAtomicOpSize(AI);
558
1.10k
    if (ValueSize < MinCASSize) {
559
7
      expandPartwordAtomicRMW(AI,
560
7
                              TargetLoweringBase::AtomicExpansionKind::CmpXChg);
561
1.10k
    } else {
562
1.09k
      expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
563
1.09k
    }
564
1.10k
    return true;
565
38.0k
  }
566
0
  default:
567
0
    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
568
0
  }
569
0
}
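Which branch of the switch above is taken is decided entirely by the target's shouldExpandAtomicRMWInIR hook. A hedged sketch of how an LL/SC-style target might answer it (MyTargetLowering and the size cutoff are hypothetical; real targets apply their own checks):

    // Sketch only; not taken from any in-tree target.
    TargetLowering::AtomicExpansionKind
    MyTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
      // Expand everything up to the native width into an LL/SC loop; a
      // CAS-based target would return AtomicExpansionKind::CmpXChg instead.
      unsigned Size = AI->getType()->getPrimitiveSizeInBits();
      return Size <= 64 ? AtomicExpansionKind::LLSC
                        : AtomicExpansionKind::None;
    }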
570
571
namespace {
572
573
/// Result values from createMaskInstrs helper.
574
struct PartwordMaskValues {
575
  Type *WordType;
576
  Type *ValueType;
577
  Value *AlignedAddr;
578
  Value *ShiftAmt;
579
  Value *Mask;
580
  Value *Inv_Mask;
581
};
582
583
} // end anonymous namespace
584
585
/// This is a helper function which builds instructions to provide
586
/// values necessary for partword atomic operations. It takes an
587
/// incoming address, Addr, and ValueType, and constructs the address,
588
/// shift-amounts and masks needed to work with a larger value of size
589
/// WordSize.
590
///
591
/// AlignedAddr: Addr rounded down to a multiple of WordSize
592
///
593
/// ShiftAmt: Number of bits to right-shift a WordSize value loaded
594
///           from AlignAddr for it to have the same value as if
595
///           ValueType was loaded from Addr.
596
///
597
/// Mask: Value to mask with the value loaded from AlignAddr to
598
///       include only the part that would've been loaded from Addr.
599
///
600
/// Inv_Mask: The inverse of Mask.
601
static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
602
                                           Type *ValueType, Value *Addr,
603
11
                                           unsigned WordSize) {
604
11
  PartwordMaskValues Ret;
605
11
606
11
  BasicBlock *BB = I->getParent();
607
11
  Function *F = BB->getParent();
608
11
  Module *M = I->getModule();
609
11
610
11
  LLVMContext &Ctx = F->getContext();
611
11
  const DataLayout &DL = M->getDataLayout();
612
11
613
11
  unsigned ValueSize = DL.getTypeStoreSize(ValueType);
614
11
615
11
  assert(ValueSize < WordSize);
616
11
617
11
  Ret.ValueType = ValueType;
618
11
  Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8);
619
11
620
11
  Type *WordPtrType =
621
11
      Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());
622
11
623
11
  Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx));
624
11
  Ret.AlignedAddr = Builder.CreateIntToPtr(
625
11
      Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType,
626
11
      "AlignedAddr");
627
11
628
11
  Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB");
629
11
  if (DL.isLittleEndian()) {
630
0
    // turn bytes into bits
631
0
    Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
632
11
  } else {
633
11
    // turn bytes into bits, and count from the other side.
634
11
    Ret.ShiftAmt =
635
11
        Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3);
636
11
  }
637
11
638
11
  Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt");
639
11
  Ret.Mask = Builder.CreateShl(
640
11
      ConstantInt::get(Ret.WordType, (1 << ValueSize * 8) - 1), Ret.ShiftAmt,
641
11
      "Mask");
642
11
  Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask");
643
11
644
11
  return Ret;
645
11
}
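The address, shift and mask arithmetic above can be checked by hand. A small standalone sketch (illustration only, not code from this file) reproducing it for a 1-byte access that falls on the last byte of a 4-byte word:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const unsigned WordSize = 4, ValueSize = 1;       // i8 inside an i32 word
      const uint64_t Addr = 0x1003;                     // last byte of the word

      uint64_t AlignedAddr = Addr & ~(uint64_t)(WordSize - 1);      // 0x1000
      unsigned PtrLSB = unsigned(Addr & (WordSize - 1));            // 3

      unsigned ShiftLE = PtrLSB * 8;                                // 24 bits
      unsigned ShiftBE = (PtrLSB ^ (WordSize - ValueSize)) * 8;     // 0 bits

      uint32_t MaskLE = ((1u << (ValueSize * 8)) - 1) << ShiftLE;   // 0xff000000
      uint32_t MaskBE = ((1u << (ValueSize * 8)) - 1) << ShiftBE;   // 0x000000ff

      std::printf("aligned=%#llx LE(shift=%u mask=%#x) BE(shift=%u mask=%#x)\n",
                  (unsigned long long)AlignedAddr, ShiftLE, MaskLE, ShiftBE, MaskBE);
      return 0;
    }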
646
647
/// Emit IR to implement a masked version of a given atomicrmw
648
/// operation. (That is, only the bits under the Mask should be
649
/// affected by the operation)
650
static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
651
                                    IRBuilder<> &Builder, Value *Loaded,
652
                                    Value *Shifted_Inc, Value *Inc,
653
7
                                    const PartwordMaskValues &PMV) {
654
7
  switch (Op) {
655
2
  case AtomicRMWInst::Xchg: {
656
2
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
657
2
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc);
658
2
    return FinalVal;
659
7
  }
660
1
  case AtomicRMWInst::Or:
661
1
  case AtomicRMWInst::Xor:
662
1
    // Or/Xor won't affect any other bits, so can just be done
663
1
    // directly.
664
1
    return performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
665
3
  case AtomicRMWInst::Add:
666
3
  case AtomicRMWInst::Sub:
667
3
  case AtomicRMWInst::And:
668
3
  case AtomicRMWInst::Nand: {
669
3
    // The other arithmetic ops need to be masked into place.
670
3
    Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
671
3
    Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask);
672
3
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
673
3
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked);
674
3
    return FinalVal;
675
3
  }
676
1
  case AtomicRMWInst::Max:
677
1
  case AtomicRMWInst::Min:
678
1
  case AtomicRMWInst::UMax:
679
1
  case AtomicRMWInst::UMin: {
680
1
    // Finally, comparison ops will operate on the full value, so
681
1
    // truncate down to the original size, and expand out again after
682
1
    // doing the operation.
683
1
    Value *Loaded_Shiftdown = Builder.CreateTrunc(
684
1
        Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType);
685
1
    Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc);
686
1
    Value *NewVal_Shiftup = Builder.CreateShl(
687
1
        Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
688
1
    Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
689
1
    Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup);
690
1
    return FinalVal;
691
1
  }
692
0
  default:
693
0
    llvm_unreachable("Unknown atomic op");
694
0
  }
695
0
}
696
697
/// Expand a sub-word atomicrmw operation into an appropriate
698
/// word-sized operation.
699
///
700
/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
701
/// way as a typical atomicrmw expansion. The only difference here is
702
/// that the operation inside of the loop must operate only upon a
703
/// part of the value.
704
void AtomicExpand::expandPartwordAtomicRMW(
705
7
    AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
706
7
  assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg);
707
7
708
7
  AtomicOrdering MemOpOrder = AI->getOrdering();
709
7
710
7
  IRBuilder<> Builder(AI);
711
7
712
7
  PartwordMaskValues PMV =
713
7
      createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
714
7
                       TLI->getMinCmpXchgSizeInBits() / 8);
715
7
716
7
  Value *ValOperand_Shifted =
717
7
      Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
718
7
                        PMV.ShiftAmt, "ValOperand_Shifted");
719
7
720
7
  auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) {
721
7
    return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded,
722
7
                                 ValOperand_Shifted, AI->getValOperand(), PMV);
723
7
  };
724
7
725
7
  // TODO: When we're ready to support LLSC conversions too, use
726
7
  // insertRMWLLSCLoop here for ExpansionKind==LLSC.
727
7
  Value *OldResult =
728
7
      insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
729
7
                           PerformPartwordOp, createCmpXchgInstFun);
730
7
  Value *FinalOldResult = Builder.CreateTrunc(
731
7
      Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType);
732
7
  AI->replaceAllUsesWith(FinalOldResult);
733
7
  AI->eraseFromParent();
734
7
}
735
736
4
void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
737
4
  // The basic idea here is that we're expanding a cmpxchg of a
738
4
  // smaller memory size up to a word-sized cmpxchg. To do this, we
739
4
  // need to add a retry-loop for strong cmpxchg, so that
740
4
  // modifications to other parts of the word don't cause a spurious
741
4
  // failure.
742
4
743
4
  // This generates code like the following:
744
4
  //     [[Setup mask values PMV.*]]
745
4
  //     %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
746
4
  //     %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
747
4
  //     %InitLoaded = load i32* %addr
748
4
  //     %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
749
4
  //     br partword.cmpxchg.loop
750
4
  // partword.cmpxchg.loop:
751
4
  //     %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
752
4
  //        [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
753
4
  //     %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
754
4
  //     %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
755
4
  //     %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
756
4
  //        i32 %FullWord_NewVal success_ordering failure_ordering
757
4
  //     %OldVal = extractvalue { i32, i1 } %NewCI, 0
758
4
  //     %Success = extractvalue { i32, i1 } %NewCI, 1
759
4
  //     br i1 %Success, label %partword.cmpxchg.end,
760
4
  //        label %partword.cmpxchg.failure
761
4
  // partword.cmpxchg.failure:
762
4
  //     %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
763
4
  //     %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
764
4
  //     br i1 %ShouldContinue, label %partword.cmpxchg.loop,
765
4
  //         label %partword.cmpxchg.end
766
4
  // partword.cmpxchg.end:
767
4
  //    %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
768
4
  //    %FinalOldVal = trunc i32 %tmp1 to i8
769
4
  //    %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
770
4
  //    %Res = insertvalue { i8, i1 } %25, i1 %Success, 1
771
4
772
4
  Value *Addr = CI->getPointerOperand();
773
4
  Value *Cmp = CI->getCompareOperand();
774
4
  Value *NewVal = CI->getNewValOperand();
775
4
776
4
  BasicBlock *BB = CI->getParent();
777
4
  Function *F = BB->getParent();
778
4
  IRBuilder<> Builder(CI);
779
4
  LLVMContext &Ctx = Builder.getContext();
780
4
781
4
  const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8;
782
4
783
4
  BasicBlock *EndBB =
784
4
      BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end");
785
4
  auto FailureBB =
786
4
      BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB);
787
4
  auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB);
788
4
789
4
  // The split call above "helpfully" added a branch at the end of BB
790
4
  // (to the wrong place).
791
4
  std::prev(BB->end())->eraseFromParent();
792
4
  Builder.SetInsertPoint(BB);
793
4
794
4
  PartwordMaskValues PMV = createMaskInstrs(
795
4
      Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize);
796
4
797
4
  // Shift the incoming values over, into the right location in the word.
798
4
  Value *NewVal_Shifted =
799
4
      Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
800
4
  Value *Cmp_Shifted =
801
4
      Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt);
802
4
803
4
  // Load the entire current word, and mask into place the expected and new
804
4
  // values
805
4
  LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr);
806
4
  InitLoaded->setVolatile(CI->isVolatile());
807
4
  Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask);
808
4
  Builder.CreateBr(LoopBB);
809
4
810
4
  // partword.cmpxchg.loop:
811
4
  Builder.SetInsertPoint(LoopBB);
812
4
  PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2);
813
4
  Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB);
814
4
815
4
  // Mask/Or the expected and new values into place in the loaded word.
816
4
  Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted);
817
4
  Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);
818
4
  AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
819
4
      PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(),
820
4
      CI->getFailureOrdering(), CI->getSyncScopeID());
821
4
  NewCI->setVolatile(CI->isVolatile());
822
4
  // When we're building a strong cmpxchg, we need a loop, so you
823
4
  // might think we could use a weak cmpxchg inside. But, using strong
824
4
  // allows the below comparison for ShouldContinue, and we're
825
4
  // expecting the underlying cmpxchg to be a machine instruction,
826
4
  // which is strong anyways.
827
4
  NewCI->setWeak(CI->isWeak());
828
4
829
4
  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
830
4
  Value *Success = Builder.CreateExtractValue(NewCI, 1);
831
4
832
4
  if (CI->isWeak())
833
0
    Builder.CreateBr(EndBB);
834
4
  else
835
4
    Builder.CreateCondBr(Success, EndBB, FailureBB);
836
4
837
4
  // partword.cmpxchg.failure:
838
4
  Builder.SetInsertPoint(FailureBB);
839
4
  // Upon failure, verify that the masked-out part of the loaded value
840
4
  // has been modified.  If it didn't, abort the cmpxchg, since the
841
4
  // masked-in part must've.
842
4
  Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask);
843
4
  Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut);
844
4
  Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB);
845
4
846
4
  // Add the second value to the phi from above
847
4
  Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB);
848
4
849
4
  // partword.cmpxchg.end:
850
4
  Builder.SetInsertPoint(CI);
851
4
852
4
  Value *FinalOldVal = Builder.CreateTrunc(
853
4
      Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
854
4
  Value *Res = UndefValue::get(CI->getType());
855
4
  Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
856
4
  Res = Builder.CreateInsertValue(Res, Success, 1);
857
4
858
4
  CI->replaceAllUsesWith(Res);
859
4
  CI->eraseFromParent();
860
4
}
861
862
void AtomicExpand::expandAtomicOpToLLSC(
863
    Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder,
864
38.0k
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
865
38.0k
  IRBuilder<> Builder(I);
866
38.0k
  Value *Loaded =
867
38.0k
      insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp);
868
38.0k
869
38.0k
  I->replaceAllUsesWith(Loaded);
870
38.0k
  I->eraseFromParent();
871
38.0k
}
872
873
Value *AtomicExpand::insertRMWLLSCLoop(
874
    IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
875
    AtomicOrdering MemOpOrder,
876
38.0k
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
877
38.0k
  LLVMContext &Ctx = Builder.getContext();
878
38.0k
  BasicBlock *BB = Builder.GetInsertBlock();
879
38.0k
  Function *F = BB->getParent();
880
38.0k
881
38.0k
  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
882
38.0k
  //
883
38.0k
  // The standard expansion we produce is:
884
38.0k
  //     [...]
885
38.0k
  // atomicrmw.start:
886
38.0k
  //     %loaded = @load.linked(%addr)
887
38.0k
  //     %new = some_op iN %loaded, %incr
888
38.0k
  //     %stored = @store_conditional(%new, %addr)
889
38.0k
  //     %try_again = icmp i32 ne %stored, 0
890
38.0k
  //     br i1 %try_again, label %loop, label %atomicrmw.end
891
38.0k
  // atomicrmw.end:
892
38.0k
  //     [...]
893
38.0k
  BasicBlock *ExitBB =
894
38.0k
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
895
38.0k
  BasicBlock *LoopBB =  BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
896
38.0k
897
38.0k
  // The split call above "helpfully" added a branch at the end of BB (to the
898
38.0k
  // wrong place).
899
38.0k
  std::prev(BB->end())->eraseFromParent();
900
38.0k
  Builder.SetInsertPoint(BB);
901
38.0k
  Builder.CreateBr(LoopBB);
902
38.0k
903
38.0k
  // Start the main loop block now that we've taken care of the preliminaries.
904
38.0k
  Builder.SetInsertPoint(LoopBB);
905
38.0k
  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
906
38.0k
907
38.0k
  Value *NewVal = PerformOp(Builder, Loaded);
908
38.0k
909
38.0k
  Value *StoreSuccess =
910
38.0k
      TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
911
38.0k
  Value *TryAgain = Builder.CreateICmpNE(
912
38.0k
      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
913
38.0k
  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
914
38.0k
915
38.0k
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
916
38.0k
  return Loaded;
917
38.0k
}
918
919
/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
920
/// the equivalent bitwidth.  We used to not support pointer cmpxchg in the
921
/// IR.  As a migration step, we convert back to what used to be the standard
922
/// way to represent a pointer cmpxchg so that we can update backends one by
923
/// one. 
924
6
AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
925
6
  auto *M = CI->getModule();
926
6
  Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
927
6
                                            M->getDataLayout());
928
6
929
6
  IRBuilder<> Builder(CI);
930
6
  
931
6
  Value *Addr = CI->getPointerOperand();
932
6
  Type *PT = PointerType::get(NewTy,
933
6
                              Addr->getType()->getPointerAddressSpace());
934
6
  Value *NewAddr = Builder.CreateBitCast(Addr, PT);
935
6
936
6
  Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
937
6
  Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);
938
6
  
939
6
  
940
6
  auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
941
6
                                            CI->getSuccessOrdering(),
942
6
                                            CI->getFailureOrdering(),
943
6
                                            CI->getSyncScopeID());
944
6
  NewCI->setVolatile(CI->isVolatile());
945
6
  NewCI->setWeak(CI->isWeak());
946
6
  DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");
947
6
948
6
  Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
949
6
  Value *Succ = Builder.CreateExtractValue(NewCI, 1);
950
6
951
6
  OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());
952
6
953
6
  Value *Res = UndefValue::get(CI->getType());
954
6
  Res = Builder.CreateInsertValue(Res, OldVal, 0);
955
6
  Res = Builder.CreateInsertValue(Res, Succ, 1);
956
6
957
6
  CI->replaceAllUsesWith(Res);
958
6
  CI->eraseFromParent();
959
6
  return NewCI;
960
6
}
961
962
8.96k
bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
963
8.96k
  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
964
8.96k
  AtomicOrdering FailureOrder = CI->getFailureOrdering();
965
8.96k
  Value *Addr = CI->getPointerOperand();
966
8.96k
  BasicBlock *BB = CI->getParent();
967
8.96k
  Function *F = BB->getParent();
968
8.96k
  LLVMContext &Ctx = F->getContext();
969
8.96k
  // If shouldInsertFencesForAtomic() returns true, then the target does not
970
8.96k
  // want to deal with memory orders, and emitLeading/TrailingFence should take
971
8.96k
  // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
972
8.96k
  // should preserve the ordering.
973
8.96k
  bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
974
8.96k
  AtomicOrdering MemOpOrder =
975
8.96k
      ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;
976
8.96k
977
8.96k
  // In implementations which use a barrier to achieve release semantics, we can
978
8.96k
  // delay emitting this barrier until we know a store is actually going to be
979
8.96k
  // attempted. The cost of this delay is that we need 2 copies of the block
980
8.96k
  // emitting the load-linked, affecting code size.
981
8.96k
  //
982
8.96k
  // Ideally, this logic would be unconditional except for the minsize check
983
8.96k
  // since in other cases the extra blocks naturally collapse down to the
984
8.96k
  // minimal loop. Unfortunately, this puts too much stress on later
985
8.96k
  // optimisations so we avoid emitting the extra logic in those cases too.
986
8.95k
  bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
987
438
                           SuccessOrder != AtomicOrdering::Monotonic &&
988
390
                           SuccessOrder != AtomicOrdering::Acquire &&
989
303
                           !F->optForMinSize();
990
8.96k
991
8.96k
  // There's no overhead for sinking the release barrier in a weak cmpxchg, so
992
8.96k
  // do it even on minsize.
993
2
  bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();
994
8.96k
995
8.96k
  // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
996
8.96k
  //
997
8.96k
  // The full expansion we produce is:
998
8.96k
  //     [...]
999
8.96k
  // cmpxchg.start:
1000
8.96k
  //     %unreleasedload = @load.linked(%addr)
1001
8.96k
  //     %should_store = icmp eq %unreleasedload, %desired
1002
8.96k
  //     br i1 %should_store, label %cmpxchg.fencedstore,
1003
8.96k
  //                          label %cmpxchg.nostore
1004
8.96k
  // cmpxchg.releasingstore:
1005
8.96k
  //     fence?
1006
8.96k
  //     br label cmpxchg.trystore
1007
8.96k
  // cmpxchg.trystore:
1008
8.96k
  //     %loaded.trystore = phi [%unreleasedload, %releasingstore],
1009
8.96k
  //                            [%releasedload, %cmpxchg.releasedload]
1010
8.96k
  //     %stored = @store_conditional(%new, %addr)
1011
8.96k
  //     %success = icmp eq i32 %stored, 0
1012
8.96k
  //     br i1 %success, label %cmpxchg.success,
1013
8.96k
  //                     label %cmpxchg.releasedload/%cmpxchg.failure
1014
8.96k
  // cmpxchg.releasedload:
1015
8.96k
  //     %releasedload = @load.linked(%addr)
1016
8.96k
  //     %should_store = icmp eq %releasedload, %desired
1017
8.96k
  //     br i1 %should_store, label %cmpxchg.trystore,
1018
8.96k
  //                          label %cmpxchg.failure
1019
8.96k
  // cmpxchg.success:
1020
8.96k
  //     fence?
1021
8.96k
  //     br label %cmpxchg.end
1022
8.96k
  // cmpxchg.nostore:
1023
8.96k
  //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
1024
8.96k
  //                           [%releasedload,
1025
8.96k
  //                               %cmpxchg.releasedload/%cmpxchg.trystore]
1026
8.96k
  //     @load_linked_fail_balance()?
1027
8.96k
  //     br label %cmpxchg.failure
1028
8.96k
  // cmpxchg.failure:
1029
8.96k
  //     fence?
1030
8.96k
  //     br label %cmpxchg.end
1031
8.96k
  // cmpxchg.end:
1032
8.96k
  //     %loaded = phi [%loaded.nostore, %cmpxchg.failure],
1033
8.96k
  //                   [%loaded.trystore, %cmpxchg.trystore]
1034
8.96k
  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
1035
8.96k
  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
1036
8.96k
  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
1037
8.96k
  //     [...]
1038
8.96k
  BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
1039
8.96k
  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
1040
8.96k
  auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
1041
8.96k
  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
1042
8.96k
  auto ReleasedLoadBB =
1043
8.96k
      BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
1044
8.96k
  auto TryStoreBB =
1045
8.96k
      BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
1046
8.96k
  auto ReleasingStoreBB =
1047
8.96k
      BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
1048
8.96k
  auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);
1049
8.96k
1050
8.96k
  // This grabs the DebugLoc from CI
1051
8.96k
  IRBuilder<> Builder(CI);
1052
8.96k
1053
8.96k
  // The split call above "helpfully" added a branch at the end of BB (to the
1054
8.96k
  // wrong place), but we might want a fence too. It's easiest to just remove
1055
8.96k
  // the branch entirely.
1056
8.96k
  std::prev(BB->end())->eraseFromParent();
1057
8.96k
  Builder.SetInsertPoint(BB);
1058
8.96k
  if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
1059
1
    TLI->emitLeadingFence(Builder, CI, SuccessOrder);
1060
8.96k
  Builder.CreateBr(StartBB);
1061
8.96k
1062
8.96k
  // Start the main loop block now that we've taken care of the preliminaries.
1063
8.96k
  Builder.SetInsertPoint(StartBB);
1064
8.96k
  Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
1065
8.96k
  Value *ShouldStore = Builder.CreateICmpEQ(
1066
8.96k
      UnreleasedLoad, CI->getCompareOperand(), "should_store");
1067
8.96k
1068
8.96k
  // If the cmpxchg doesn't actually need any ordering when it fails, we can
1069
8.96k
  // jump straight past that fence instruction (if it exists).
1070
8.96k
  Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);
1071
8.96k
1072
8.96k
  Builder.SetInsertPoint(ReleasingStoreBB);
1073
8.96k
  if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
1074
443
    TLI->emitLeadingFence(Builder, CI, SuccessOrder);
1075
8.96k
  Builder.CreateBr(TryStoreBB);
1076
8.96k
1077
8.96k
  Builder.SetInsertPoint(TryStoreBB);
1078
8.96k
  Value *StoreSuccess = TLI->emitStoreConditional(
1079
8.96k
      Builder, CI->getNewValOperand(), Addr, MemOpOrder);
1080
8.96k
  StoreSuccess = Builder.CreateICmpEQ(
1081
8.96k
      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
1082
8.96k
  BasicBlock *RetryBB = HasReleasedLoadBB ? 
ReleasedLoadBB302
:
StartBB8.66k
;
1083
8.96k
  Builder.CreateCondBr(StoreSuccess, SuccessBB,
1084
8.96k
                       CI->isWeak() ? FailureBB : RetryBB);
1085
8.96k
1086
8.96k
  Builder.SetInsertPoint(ReleasedLoadBB);
1087
8.96k
  Value *SecondLoad;
1088
8.96k
  if (HasReleasedLoadBB) {
1089
302
    SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
1090
302
    ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
1091
302
                                       "should_store");
1092
302
1093
302
    // If the cmpxchg doesn't actually need any ordering when it fails, we can
1094
302
    // jump straight past that fence instruction (if it exists).
1095
302
    Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
1096
302
  } else
1097
8.66k
    Builder.CreateUnreachable();
1098
8.96k
1099
8.96k
  // Make sure later instructions don't get reordered with a fence if
1100
8.96k
  // necessary.
1101
8.96k
  Builder.SetInsertPoint(SuccessBB);
1102
8.96k
  if (ShouldInsertFencesForAtomic)
1103
444
    TLI->emitTrailingFence(Builder, CI, SuccessOrder);
1104
8.96k
  Builder.CreateBr(ExitBB);
1105
8.96k
1106
8.96k
  Builder.SetInsertPoint(NoStoreBB);
1107
8.96k
  // In the failing case, where we don't execute the store-conditional, the
1108
8.96k
  // target might want to balance out the load-linked with a dedicated
1109
8.96k
  // instruction (e.g., on ARM, clearing the exclusive monitor).
1110
8.96k
  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
1111
8.96k
  Builder.CreateBr(FailureBB);
1112
8.96k
1113
8.96k
  Builder.SetInsertPoint(FailureBB);
1114
8.96k
  if (ShouldInsertFencesForAtomic)
1115
444
    TLI->emitTrailingFence(Builder, CI, FailureOrder);
1116
8.96k
  Builder.CreateBr(ExitBB);
1117
8.96k
1118
8.96k
  // Finally, we have control-flow based knowledge of whether the cmpxchg
1119
8.96k
  // succeeded or not. We expose this to later passes by converting any
1120
8.96k
  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
1121
8.96k
  // PHI.
1122
8.96k
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
1123
8.96k
  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
1124
8.96k
  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
1125
8.96k
  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);
1126
8.96k
1127
8.96k
  // Setup the builder so we can create any PHIs we need.
1128
8.96k
  Value *Loaded;
1129
8.96k
  if (!HasReleasedLoadBB)
1130
8.66k
    Loaded = UnreleasedLoad;
1131
302
  else {
1132
302
    Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
1133
302
    PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
1134
302
    TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
1135
302
    TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
1136
302
1137
302
    Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
1138
302
    PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
1139
302
    NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
1140
302
    NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
1141
302
1142
302
    Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
1143
302
    PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
1144
302
    ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
1145
302
    ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);
1146
302
1147
302
    Loaded = ExitLoaded;
1148
302
  }
1149
8.96k
1150
8.96k
  // Look for any users of the cmpxchg that are just comparing the loaded value
1151
8.96k
  // against the desired one, and replace them with the CFG-derived version.
1152
8.96k
  SmallVector<ExtractValueInst *, 2> PrunedInsts;
1153
9.47k
  for (auto User : CI->users()) {
1154
9.47k
    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
1155
9.47k
    if (!EV)
1156
8
      continue;
1157
9.46k
1158
9.47k
    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
1159
9.46k
           "weird extraction from { iN, i1 }");
1160
9.46k
1161
9.46k
    if (EV->getIndices()[0] == 0)
1162
8.90k
      EV->replaceAllUsesWith(Loaded);
1163
9.46k
    else
1164
567
      EV->replaceAllUsesWith(Success);
1165
9.47k
1166
9.47k
    PrunedInsts.push_back(EV);
1167
9.47k
  }
1168
8.96k
1169
8.96k
  // We can remove the instructions now we're no longer iterating through them.
1170
8.96k
  for (auto EV : PrunedInsts)
1171
9.46k
    EV->eraseFromParent();
1172
8.96k
1173
8.96k
  if (!CI->use_empty()) {
1174
8
    // Some use of the full struct return that we don't understand has happened,
1175
8
    // so we've got to reconstruct it properly.
1176
8
    Value *Res;
1177
8
    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
1178
8
    Res = Builder.CreateInsertValue(Res, Success, 1);
1179
8
1180
8
    CI->replaceAllUsesWith(Res);
1181
8
  }
1182
8.96k
1183
8.96k
  CI->eraseFromParent();
1184
8.96k
  return true;
1185
8.96k
}
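The code above replaces extractvalue users of the original cmpxchg with the PHI-derived Loaded and Success values, so the {value, flag} pair survives the expansion as ordinary SSA values. For reference, a minimal standalone C++ analogy (using std::atomic, not LLVM's API; CmpXchgResult and cmpxchg are illustrative names) of the pair that the two PHI nodes reconstruct:

// Standalone analogy, not the pass's output: the {old value, success flag}
// pair that the expanded CFG reproduces via the two PHI nodes above.
#include <atomic>
#include <cstdio>

struct CmpXchgResult {
  int Loaded;   // corresponds to extractvalue index 0
  bool Success; // corresponds to extractvalue index 1
};

static CmpXchgResult cmpxchg(std::atomic<int> &A, int Expected, int Desired) {
  int Old = Expected;
  bool Ok = A.compare_exchange_strong(Old, Desired,
                                      std::memory_order_seq_cst,
                                      std::memory_order_seq_cst);
  return {Old, Ok}; // Old holds the loaded value whether or not the CAS stored
}

int main() {
  std::atomic<int> A{41};
  CmpXchgResult R = cmpxchg(A, 41, 42);
  std::printf("loaded=%d success=%d\n", R.Loaded, R.Success);
}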
1186
1187
42.2k
bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) {
1188
42.2k
  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
1189
42.2k
  if(!C)
1190
21.4k
    return false;
1191
20.8k
1192
20.8k
  AtomicRMWInst::BinOp Op = RMWI->getOperation();
1193
20.8k
  switch(Op) {
1194
8.80k
    case AtomicRMWInst::Add:
1195
8.80k
    case AtomicRMWInst::Sub:
1196
8.80k
    case AtomicRMWInst::Or:
1197
8.80k
    case AtomicRMWInst::Xor:
1198
8.80k
      return C->isZero();
1199
71
    case AtomicRMWInst::And:
1200
71
      return C->isMinusOne();
1201
8.80k
    // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
1202
11.9k
    default:
1203
11.9k
      return false;
1204
0
  }
1205
0
}
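isIdempotentRMW only recognizes constants that are identities for their operation: zero for add/sub/or/xor, all-ones for and. A standalone sketch of the same check, with a hypothetical RMWOp enum standing in for AtomicRMWInst::BinOp (illustrative only; compile as C++14):

#include <cstdint>

enum class RMWOp { Add, Sub, Or, Xor, And, Xchg, Max, Min, UMax, UMin, Nand };

constexpr bool isIdempotentRMWConstant(RMWOp Op, std::uint64_t C,
                                       unsigned Bits) {
  const std::uint64_t AllOnes = Bits >= 64 ? ~0ULL : ((1ULL << Bits) - 1);
  switch (Op) {
  case RMWOp::Add: case RMWOp::Sub: case RMWOp::Or: case RMWOp::Xor:
    return C == 0;       // x + 0, x - 0, x | 0, x ^ 0 leave x unchanged
  case RMWOp::And:
    return C == AllOnes; // x & ~0 leaves x unchanged
  default:
    return false;        // min/max/xchg/nand are not treated as idempotent here
  }
}

static_assert(isIdempotentRMWConstant(RMWOp::Add, 0, 32), "add 0 is a no-op");
static_assert(isIdempotentRMWConstant(RMWOp::And, 0xFFFFFFFFULL, 32),
              "and ~0 is a no-op");
static_assert(!isIdempotentRMWConstant(RMWOp::Xchg, 0, 32), "xchg always writes");
int main() {}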
1206
1207
14
bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) {
1208
14
  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
1209
11
    tryExpandAtomicLoad(ResultingLoad);
1210
11
    return true;
1211
11
  }
1212
3
  return false;
1213
3
}
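simplifyIdempotentRMW hands such a no-op RMW to the target, which may turn it into a load plus whatever fences preserve the original ordering; the resulting load is then expanded like any other atomic load. A rough source-level analogy of the before/after shapes (assumptions noted in the comments; the two forms are not interchangeable in general, which is exactly why the decision is delegated to lowerIdempotentRMWIntoFencedLoad):

// Rough analogy only, not the target lowering itself.
#include <atomic>
#include <cstdio>

int main() {
  std::atomic<int> Counter{7};

  // Before: an RMW whose constant operand makes it a no-op on memory.
  int Before = Counter.fetch_add(0, std::memory_order_seq_cst);

  // After (conceptually): an ordered load; a target may additionally bracket
  // it with fences to preserve the RMW's ordering guarantees.
  int After = Counter.load(std::memory_order_seq_cst);

  std::printf("%d %d\n", Before, After);
}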
1214
1215
Value *AtomicExpand::insertRMWCmpXchgLoop(
1216
    IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
1217
    AtomicOrdering MemOpOrder,
1218
    function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
1219
1.10k
    CreateCmpXchgInstFun CreateCmpXchg) {
1220
1.10k
  LLVMContext &Ctx = Builder.getContext();
1221
1.10k
  BasicBlock *BB = Builder.GetInsertBlock();
1222
1.10k
  Function *F = BB->getParent();
1223
1.10k
1224
1.10k
  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
1225
1.10k
  //
1226
1.10k
  // The standard expansion we produce is:
1227
1.10k
  //     [...]
1228
1.10k
  //     %init_loaded = load atomic iN* %addr
1229
1.10k
  //     br label %loop
1230
1.10k
  // loop:
1231
1.10k
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
1232
1.10k
  //     %new = some_op iN %loaded, %incr
1233
1.10k
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
1234
1.10k
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
1235
1.10k
  //     %success = extractvalue { iN, i1 } %pair, 1
1236
1.10k
  //     br i1 %success, label %atomicrmw.end, label %loop
1237
1.10k
  // atomicrmw.end:
1238
1.10k
  //     [...]
1239
1.10k
  BasicBlock *ExitBB =
1240
1.10k
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
1241
1.10k
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
1242
1.10k
1243
1.10k
  // The split call above "helpfully" added a branch at the end of BB (to the
1244
1.10k
  // wrong place), but we want a load. It's easiest to just remove
1245
1.10k
  // the branch entirely.
1246
1.10k
  std::prev(BB->end())->eraseFromParent();
1247
1.10k
  Builder.SetInsertPoint(BB);
1248
1.10k
  LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
1249
1.10k
  // Atomics require at least natural alignment.
1250
1.10k
  InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8);
1251
1.10k
  Builder.CreateBr(LoopBB);
1252
1.10k
1253
1.10k
  // Start the main loop block now that we've taken care of the preliminaries.
1254
1.10k
  Builder.SetInsertPoint(LoopBB);
1255
1.10k
  PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
1256
1.10k
  Loaded->addIncoming(InitLoaded, BB);
1257
1.10k
1258
1.10k
  Value *NewVal = PerformOp(Builder, Loaded);
1259
1.10k
1260
1.10k
  Value *NewLoaded = nullptr;
1261
1.10k
  Value *Success = nullptr;
1262
1.10k
1263
1.10k
  CreateCmpXchg(Builder, Addr, Loaded, NewVal,
1264
1.10k
                MemOpOrder == AtomicOrdering::Unordered
1265
1
                    ? AtomicOrdering::Monotonic
1266
1.10k
                    : MemOpOrder,
1267
1.10k
                Success, NewLoaded);
1268
1.10k
  assert(Success && NewLoaded);
1269
1.10k
1270
1.10k
  Loaded->addIncoming(NewLoaded, LoopBB);
1271
1.10k
1272
1.10k
  Builder.CreateCondBr(Success, ExitBB, LoopBB);
1273
1.10k
1274
1.10k
  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
1275
1.10k
  return NewLoaded;
1276
1.10k
}
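The comment block above spells out the IR that the loop expansion produces. The same control flow rendered as standalone C++ with std::atomic (illustrative; atomic_rmw_via_cas is a hypothetical helper and the orderings are left at seq_cst for simplicity):

#include <atomic>
#include <cstdio>

template <typename T, typename Op>
T atomic_rmw_via_cas(std::atomic<T> &Addr, Op PerformOp) {
  T Loaded = Addr.load();                              // %init_loaded
  T NewVal = PerformOp(Loaded);                        // %new = some_op %loaded, %incr
  while (!Addr.compare_exchange_weak(Loaded, NewVal))  // %pair = cmpxchg ...
    NewVal = PerformOp(Loaded);                        // retry with the refreshed value
  return Loaded;                                       // the value the atomicrmw observed
}

int main() {
  std::atomic<unsigned> X{10};
  unsigned Old = atomic_rmw_via_cas(X, [](unsigned V) { return V * 3; });
  std::printf("old=%u new=%u\n", Old, X.load());
}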
1277
1278
// Note: This function is exposed externally by AtomicExpandUtils.h
1279
bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
1280
1.09k
                                    CreateCmpXchgInstFun CreateCmpXchg) {
1281
1.09k
  IRBuilder<> Builder(AI);
1282
1.09k
  Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
1283
1.09k
      Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(),
1284
1.09k
      [&](IRBuilder<> &Builder, Value *Loaded) {
1285
1.09k
        return performAtomicOp(AI->getOperation(), Builder, Loaded,
1286
1.09k
                               AI->getValOperand());
1287
1.09k
      },
1288
1.09k
      CreateCmpXchg);
1289
1.09k
1290
1.09k
  AI->replaceAllUsesWith(Loaded);
1291
1.09k
  AI->eraseFromParent();
1292
1.09k
  return true;
1293
1.09k
}
1294
1295
// In order to use one of the sized library calls such as
1296
// __atomic_fetch_add_4, the alignment must be sufficient, the size
1297
// must be one of the potentially-specialized sizes, and the value
1298
// type must actually exist in C on the target (otherwise, the
1299
// function wouldn't actually be defined.)
1300
static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
1301
17
                                  const DataLayout &DL) {
1302
17
  // TODO: "LargestSize" is an approximation for "largest type that
1303
17
  // you can express in C". It seems to be the case that int128 is
1304
17
  // supported on all 64-bit platforms, otherwise only up to 64-bit
1305
17
  // integers are supported. If we get this wrong, then we'll try to
1306
17
  // call a sized libcall that doesn't actually exist. There should
1307
17
  // really be some more reliable way in LLVM of determining integer
1308
17
  // sizes which are valid in the target's C ABI...
1309
17
  unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
1310
17
  return Align >= Size &&
1311
15
         (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
1312
15
         Size <= LargestSize;
1313
17
}
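The same eligibility test, restated as standalone C++ with LargestLegalBits standing in for DL.getLargestLegalIntTypeSizeInBits() (illustrative only; compile as C++14):

constexpr bool canUseSizedCall(unsigned Size, unsigned Align,
                               unsigned LargestLegalBits) {
  const unsigned LargestSize = LargestLegalBits >= 64 ? 16 : 8;
  return Align >= Size &&
         (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
         Size <= LargestSize;
}

static_assert(canUseSizedCall(4, 4, 64), "aligned i32 -> __atomic_*_4");
static_assert(!canUseSizedCall(16, 16, 32), "i128 needs 64-bit legal ints");
static_assert(!canUseSizedCall(4, 2, 64), "underaligned -> generic libcall");
int main() {}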
1314
1315
4
void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
1316
4
  static const RTLIB::Libcall Libcalls[6] = {
1317
4
      RTLIB::ATOMIC_LOAD,   RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
1318
4
      RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
1319
4
  unsigned Size = getAtomicOpSize(I);
1320
4
  unsigned Align = getAtomicOpAlign(I);
1321
4
1322
4
  bool expanded = expandAtomicOpToLibcall(
1323
4
      I, Size, Align, I->getPointerOperand(), nullptr, nullptr,
1324
4
      I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
1325
4
  (void)expanded;
1326
4
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Load");
1327
4
}
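Each of these Libcalls tables follows one convention: slot 0 is the generic, size_t-taking entry point and slots 1 through 5 are the _1/_2/_4/_8/_16 sized variants, which expandAtomicOpToLibcall indexes by the operation's size. A standalone sketch of that mapping (sizedLibcallIndex is a hypothetical name):

// Slot 0 = generic call, slots 1..5 = sized variants for 1/2/4/8/16 bytes.
constexpr int sizedLibcallIndex(unsigned Size) {
  return Size == 1 ? 1 : Size == 2 ? 2 : Size == 4 ? 3
       : Size == 8 ? 4 : Size == 16 ? 5 : -1; // -1: no sized variant exists
}

static_assert(sizedLibcallIndex(4) == 3, "a 4-byte op uses Libcalls[3]");
static_assert(sizedLibcallIndex(3) == -1, "odd sizes fall back to Libcalls[0]");
int main() {}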
1328
1329
5
void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
1330
5
  static const RTLIB::Libcall Libcalls[6] = {
1331
5
      RTLIB::ATOMIC_STORE,   RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
1332
5
      RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
1333
5
  unsigned Size = getAtomicOpSize(I);
1334
5
  unsigned Align = getAtomicOpAlign(I);
1335
5
1336
5
  bool expanded = expandAtomicOpToLibcall(
1337
5
      I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr,
1338
5
      I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
1339
5
  (void)expanded;
1340
5
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Store");
1341
5
}
1342
1343
4
void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
1344
4
  static const RTLIB::Libcall Libcalls[6] = {
1345
4
      RTLIB::ATOMIC_COMPARE_EXCHANGE,   RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
1346
4
      RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
1347
4
      RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
1348
4
  unsigned Size = getAtomicOpSize(I);
1349
4
  unsigned Align = getAtomicOpAlign(I);
1350
4
1351
4
  bool expanded = expandAtomicOpToLibcall(
1352
4
      I, Size, Align, I->getPointerOperand(), I->getNewValOperand(),
1353
4
      I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
1354
4
      Libcalls);
1355
4
  (void)expanded;
1356
4
  assert(expanded && "expandAtomicOpToLibcall shouldn't fail for CAS");
1357
4
}
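For a type with no usable sized variant, the emitted call is the generic __atomic_compare_exchange described in the signature comments inside expandAtomicOpToLibcall below. A source-level illustration using the GCC/Clang type-generic __atomic builtins, which lower to the same libatomic entry point for oversized types (assumes those builtins are available and that libatomic is linked, e.g. with -latomic):

// Illustrative only: the compiler turns this into a call to the generic
// __atomic_compare_exchange(size, ptr, expected, desired, succ, fail)
// libcall because no _N sized variant covers a 24-byte object.
struct Triple { long a, b, c; };

bool casTriple(Triple *P, Triple *Expected, Triple Desired) {
  return __atomic_compare_exchange(P, Expected, &Desired,
                                   /*weak=*/false,
                                   __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main() {
  Triple T{1, 2, 3}, E{1, 2, 3}, D{4, 5, 6};
  return casTriple(&T, &E, D) ? 0 : 1;
}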
1358
1359
4
static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
1360
4
  static const RTLIB::Libcall LibcallsXchg[6] = {
1361
4
      RTLIB::ATOMIC_EXCHANGE,   RTLIB::ATOMIC_EXCHANGE_1,
1362
4
      RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
1363
4
      RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
1364
4
  static const RTLIB::Libcall LibcallsAdd[6] = {
1365
4
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_ADD_1,
1366
4
      RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
1367
4
      RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
1368
4
  static const RTLIB::Libcall LibcallsSub[6] = {
1369
4
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_SUB_1,
1370
4
      RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
1371
4
      RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
1372
4
  static const RTLIB::Libcall LibcallsAnd[6] = {
1373
4
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_AND_1,
1374
4
      RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
1375
4
      RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
1376
4
  static const RTLIB::Libcall LibcallsOr[6] = {
1377
4
      RTLIB::UNKNOWN_LIBCALL,   RTLIB::ATOMIC_FETCH_OR_1,
1378
4
      RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
1379
4
      RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
1380
4
  static const RTLIB::Libcall LibcallsXor[6] = {
1381
4
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_XOR_1,
1382
4
      RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
1383
4
      RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
1384
4
  static const RTLIB::Libcall LibcallsNand[6] = {
1385
4
      RTLIB::UNKNOWN_LIBCALL,     RTLIB::ATOMIC_FETCH_NAND_1,
1386
4
      RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
1387
4
      RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};
1388
4
1389
4
  switch (Op) {
1390
0
  case AtomicRMWInst::BAD_BINOP:
1391
0
    llvm_unreachable("Should not have BAD_BINOP.");
1392
2
  case AtomicRMWInst::Xchg:
1393
2
    return makeArrayRef(LibcallsXchg);
1394
2
  case AtomicRMWInst::Add:
1395
2
    return makeArrayRef(LibcallsAdd);
1396
0
  case AtomicRMWInst::Sub:
1397
0
    return makeArrayRef(LibcallsSub);
1398
0
  case AtomicRMWInst::And:
1399
0
    return makeArrayRef(LibcallsAnd);
1400
0
  case AtomicRMWInst::Or:
1401
0
    return makeArrayRef(LibcallsOr);
1402
0
  case AtomicRMWInst::Xor:
1403
0
    return makeArrayRef(LibcallsXor);
1404
0
  case AtomicRMWInst::Nand:
1405
0
    return makeArrayRef(LibcallsNand);
1406
0
  case AtomicRMWInst::Max:
1407
0
  case AtomicRMWInst::Min:
1408
0
  case AtomicRMWInst::UMax:
1409
0
  case AtomicRMWInst::UMin:
1410
0
    // No atomic libcalls are available for max/min/umax/umin.
1411
0
    return {};
1412
0
  }
1413
0
  llvm_unreachable("Unexpected AtomicRMW operation.");
1414
0
}
1415
1416
4
void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
1417
4
  ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());
1418
4
1419
4
  unsigned Size = getAtomicOpSize(I);
1420
4
  unsigned Align = getAtomicOpAlign(I);
1421
4
1422
4
  bool Success = false;
1423
4
  if (!Libcalls.empty())
1424
4
    Success = expandAtomicOpToLibcall(
1425
4
        I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr,
1426
4
        I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
1427
4
1428
4
  // The expansion failed: either there were no libcalls at all for
1429
4
  // the operation (min/max), or there were only size-specialized
1430
4
  // libcalls (add/sub/etc) and we needed a generic. So, expand to a
1431
4
  // CAS libcall, via a CAS loop, instead.
1432
4
  if (!Success) {
1433
1
    expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr,
1434
1
                                       Value *Loaded, Value *NewVal,
1435
1
                                       AtomicOrdering MemOpOrder,
1436
1
                                       Value *&Success, Value *&NewLoaded) {
1437
1
      // Create the CAS instruction normally...
1438
1
      AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
1439
1
          Addr, Loaded, NewVal, MemOpOrder,
1440
1
          AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
1441
1
      Success = Builder.CreateExtractValue(Pair, 1, "success");
1442
1
      NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
1443
1
1444
1
      // ...and then expand the CAS into a libcall.
1445
1
      expandAtomicCASToLibcall(Pair);
1446
1
    });
1447
1
  }
1448
4
}
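Because libatomic has no fetch_min/fetch_max entry points, GetRMWLibcall returns an empty list for those operations and expandAtomicRMWToLibcall above falls back to a compare-exchange retry loop (whose CAS may itself become a libcall). A source-level picture of that fallback for max (illustrative std::atomic code, not the pass's output):

#include <atomic>
#include <algorithm>
#include <cstdio>

unsigned fetch_max(std::atomic<unsigned> &A, unsigned Val) {
  unsigned Loaded = A.load();
  while (!A.compare_exchange_weak(Loaded, std::max(Loaded, Val)))
    ;            // Loaded is refreshed on each failed attempt
  return Loaded; // the original value, as atomicrmw max would return
}

int main() {
  std::atomic<unsigned> A{10};
  std::printf("old=%u new=%u\n", fetch_max(A, 42), A.load());
}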
1449
1450
// A helper routine for the above expandAtomic*ToLibcall functions.
1451
//
1452
// 'Libcalls' contains an array of enum values for the particular
1453
// ATOMIC libcalls to be emitted. All of the other arguments besides
1454
// 'I' are extracted from the Instruction subclass by the
1455
// caller. Depending on the particular call, some will be null.
1456
bool AtomicExpand::expandAtomicOpToLibcall(
1457
    Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand,
1458
    Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
1459
17
    AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
1460
17
  assert(Libcalls.size() == 6);
1461
17
1462
17
  LLVMContext &Ctx = I->getContext();
1463
17
  Module *M = I->getModule();
1464
17
  const DataLayout &DL = M->getDataLayout();
1465
17
  IRBuilder<> Builder(I);
1466
17
  IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());
1467
17
1468
17
  bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
1469
17
  Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);
1470
17
1471
17
  unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);
1472
17
1473
17
  // TODO: the "order" argument type is "int", not int32. So
1474
17
  // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
1475
17
  ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
1476
17
  assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
1477
17
  Constant *OrderingVal =
1478
17
      ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
1479
17
  Constant *Ordering2Val = nullptr;
1480
17
  if (CASExpected) {
1481
4
    assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
1482
4
    Ordering2Val =
1483
4
        ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
1484
4
  }
1485
17
  bool HasResult = I->getType() != Type::getVoidTy(Ctx);
1486
17
1487
17
  RTLIB::Libcall RTLibType;
1488
17
  if (UseSizedLibcall) {
1489
8
    switch (Size) {
1490
0
    case 1: RTLibType = Libcalls[1]; break;
1491
5
    case 2: RTLibType = Libcalls[2]; break;
1492
1
    case 4: RTLibType = Libcalls[3]; break;
1493
2
    case 8: RTLibType = Libcalls[4]; break;
1494
0
    case 16: RTLibType = Libcalls[5]; break;
1495
17
    }
1496
9
  } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
1497
8
    RTLibType = Libcalls[0];
1498
9
  } else {
1499
1
    // Can't use sized function, and there's no generic for this
1500
1
    // operation, so give up.
1501
1
    return false;
1502
1
  }
1503
16
1504
16
  // Build up the function call. There are two kinds. First, the sized
1505
16
  // variants.  These calls are going to be one of the following (with
1506
16
  // N=1,2,4,8,16):
1507
16
  //  iN    __atomic_load_N(iN *ptr, int ordering)
1508
16
  //  void  __atomic_store_N(iN *ptr, iN val, int ordering)
1509
16
  //  iN    __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
1510
16
  //  bool  __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
1511
16
  //                                    int success_order, int failure_order)
1512
16
  //
1513
16
  // Note that these functions can be used for non-integer atomic
1514
16
  // operations, the values just need to be bitcast to integers on the
1515
16
  // way in and out.
1516
16
  //
1517
16
  // And, then, the generic variants. They look like the following:
1518
16
  //  void  __atomic_load(size_t size, void *ptr, void *ret, int ordering)
1519
16
  //  void  __atomic_store(size_t size, void *ptr, void *val, int ordering)
1520
16
  //  void  __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
1521
16
  //                          int ordering)
1522
16
  //  bool  __atomic_compare_exchange(size_t size, void *ptr, void *expected,
1523
16
  //                                  void *desired, int success_order,
1524
16
  //                                  int failure_order)
1525
16
  //
1526
16
  // The different signatures are built up depending on the
1527
16
  // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
1528
16
  // variables.
1529
16
1530
16
  AllocaInst *AllocaCASExpected = nullptr;
1531
16
  Value *AllocaCASExpected_i8 = nullptr;
1532
16
  AllocaInst *AllocaValue = nullptr;
1533
16
  Value *AllocaValue_i8 = nullptr;
1534
16
  AllocaInst *AllocaResult = nullptr;
1535
16
  Value *AllocaResult_i8 = nullptr;
1536
16
1537
16
  Type *ResultTy;
1538
16
  SmallVector<Value *, 6> Args;
1539
16
  AttributeList Attr;
1540
16
1541
16
  // 'size' argument.
1542
16
  if (!UseSizedLibcall) {
1543
8
    // Note, getIntPtrType is assumed equivalent to size_t.
1544
8
    Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
1545
8
  }
1546
16
1547
16
  // 'ptr' argument.
1548
16
  Value *PtrVal =
1549
16
      Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx));
1550
16
  Args.push_back(PtrVal);
1551
16
1552
16
  // 'expected' argument, if present.
1553
16
  if (CASExpected) {
1554
4
    AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
1555
4
    AllocaCASExpected->setAlignment(AllocaAlignment);
1556
4
    AllocaCASExpected_i8 =
1557
4
        Builder.CreateBitCast(AllocaCASExpected, Type::getInt8PtrTy(Ctx));
1558
4
    Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
1559
4
    Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
1560
4
    Args.push_back(AllocaCASExpected_i8);
1561
4
  }
1562
16
1563
16
  // 'val' argument ('desired' for cas), if present.
1564
16
  if (ValueOperand) {
1565
12
    if (UseSizedLibcall) {
1566
6
      Value *IntValue =
1567
6
          Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
1568
6
      Args.push_back(IntValue);
1569
12
    } else {
1570
6
      AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
1571
6
      AllocaValue->setAlignment(AllocaAlignment);
1572
6
      AllocaValue_i8 =
1573
6
          Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
1574
6
      Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
1575
6
      Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
1576
6
      Args.push_back(AllocaValue_i8);
1577
6
    }
1578
12
  }
1579
16
1580
16
  // 'ret' argument.
1581
16
  if (!CASExpected && HasResult && !UseSizedLibcall) {
1582
3
    AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
1583
3
    AllocaResult->setAlignment(AllocaAlignment);
1584
3
    AllocaResult_i8 =
1585
3
        Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx));
1586
3
    Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
1587
3
    Args.push_back(AllocaResult_i8);
1588
3
  }
1589
16
1590
16
  // 'ordering' ('success_order' for cas) argument.
1591
16
  Args.push_back(OrderingVal);
1592
16
1593
16
  // 'failure_order' argument, if present.
1594
16
  if (Ordering2Val)
1595
4
    Args.push_back(Ordering2Val);
1596
16
1597
16
  // Now, the return type.
1598
16
  if (CASExpected) {
1599
4
    ResultTy = Type::getInt1Ty(Ctx);
1600
4
    Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt);
1601
16
  } else if (HasResult && UseSizedLibcall)
1602
4
    ResultTy = SizedIntTy;
1603
12
  else
1604
8
    ResultTy = Type::getVoidTy(Ctx);
1605
16
1606
16
  // Done with setting up arguments and return types, create the call:
1607
16
  SmallVector<Type *, 6> ArgTys;
1608
16
  for (Value *Arg : Args)
1609
63
    ArgTys.push_back(Arg->getType());
1610
16
  FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
1611
16
  Constant *LibcallFn =
1612
16
      M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
1613
16
  CallInst *Call = Builder.CreateCall(LibcallFn, Args);
1614
16
  Call->setAttributes(Attr);
1615
16
  Value *Result = Call;
1616
16
1617
16
  // And then, extract the results...
1618
16
  if (ValueOperand && !UseSizedLibcall)
1619
6
    Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);
1620
16
1621
16
  if (CASExpected) {
1622
4
    // The final result from the CAS is {load of 'expected' alloca, bool result
1623
4
    // from call}
1624
4
    Type *FinalResultTy = I->getType();
1625
4
    Value *V = UndefValue::get(FinalResultTy);
1626
4
    Value *ExpectedOut =
1627
4
        Builder.CreateAlignedLoad(AllocaCASExpected, AllocaAlignment);
1628
4
    Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
1629
4
    V = Builder.CreateInsertValue(V, ExpectedOut, 0);
1630
4
    V = Builder.CreateInsertValue(V, Result, 1);
1631
4
    I->replaceAllUsesWith(V);
1632
16
  } else if (HasResult) {
1633
7
    Value *V;
1634
7
    if (UseSizedLibcall)
1635
4
      V = Builder.CreateBitOrPointerCast(Result, I->getType());
1636
3
    else {
1637
3
      V = Builder.CreateAlignedLoad(AllocaResult, AllocaAlignment);
1638
3
      Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
1639
3
    }
1640
12
    I->replaceAllUsesWith(V);
1641
12
  }
1642
17
  I->eraseFromParent();
1643
17
  return true;
1644
17
}
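As the comments inside expandAtomicOpToLibcall note, non-integer payloads travel through the sized calls as same-width integers, bitcast on the way in and out. A source-level analogy for a 4-byte float, using the GCC/Clang __atomic_load_n builtin as a stand-in for the emitted __atomic_load_4 libcall (illustrative only; strict-aliasing caveats apply, and real C++ code would prefer std::atomic<float>):

#include <cstdint>
#include <cstring>
#include <cstdio>

float atomicLoadFloat(const float *P) {
  // The atomic access itself is performed on a 32-bit integer view...
  std::uint32_t Raw =
      __atomic_load_n(reinterpret_cast<const std::uint32_t *>(P),
                      __ATOMIC_SEQ_CST);
  // ...and the result is "bitcast" back to float on the way out.
  float F;
  std::memcpy(&F, &Raw, sizeof F);
  return F;
}

int main() {
  float X = 3.5f;
  std::printf("%f\n", atomicLoadFloat(&X));
}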