/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/CodeGen/AtomicExpandPass.cpp
Line | Count | Source |
1 | | //===- AtomicExpandPass.cpp - Expand atomic instructions ------------------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // This file contains a pass (at IR level) to replace atomic instructions with |
11 | | // __atomic_* library calls, or target specific instruction which implement the |
12 | | // same semantics in a way which better fits the target backend. This can |
13 | | // include the use of (intrinsic-based) load-linked/store-conditional loops, |
14 | | // AtomicCmpXchg, or type coercions. |
15 | | // |
16 | | //===----------------------------------------------------------------------===// |
17 | | |
18 | | #include "llvm/ADT/ArrayRef.h" |
19 | | #include "llvm/ADT/STLExtras.h" |
20 | | #include "llvm/ADT/SmallVector.h" |
21 | | #include "llvm/CodeGen/AtomicExpandUtils.h" |
22 | | #include "llvm/CodeGen/RuntimeLibcalls.h" |
23 | | #include "llvm/CodeGen/TargetPassConfig.h" |
24 | | #include "llvm/CodeGen/ValueTypes.h" |
25 | | #include "llvm/IR/Attributes.h" |
26 | | #include "llvm/IR/BasicBlock.h" |
27 | | #include "llvm/IR/Constant.h" |
28 | | #include "llvm/IR/Constants.h" |
29 | | #include "llvm/IR/DataLayout.h" |
30 | | #include "llvm/IR/DerivedTypes.h" |
31 | | #include "llvm/IR/Function.h" |
32 | | #include "llvm/IR/IRBuilder.h" |
33 | | #include "llvm/IR/InstIterator.h" |
34 | | #include "llvm/IR/Instruction.h" |
35 | | #include "llvm/IR/Instructions.h" |
36 | | #include "llvm/IR/Module.h" |
37 | | #include "llvm/IR/Type.h" |
38 | | #include "llvm/IR/User.h" |
39 | | #include "llvm/IR/Value.h" |
40 | | #include "llvm/Pass.h" |
41 | | #include "llvm/Support/AtomicOrdering.h" |
42 | | #include "llvm/Support/Casting.h" |
43 | | #include "llvm/Support/Debug.h" |
44 | | #include "llvm/Support/ErrorHandling.h" |
45 | | #include "llvm/Support/raw_ostream.h" |
46 | | #include "llvm/Target/TargetLowering.h" |
47 | | #include "llvm/Target/TargetMachine.h" |
48 | | #include "llvm/Target/TargetSubtargetInfo.h" |
49 | | #include <cassert> |
50 | | #include <cstdint> |
51 | | #include <iterator> |
52 | | |
53 | | using namespace llvm; |
54 | | |
55 | | #define DEBUG_TYPE "atomic-expand" |
56 | | |
57 | | namespace { |
58 | | |
59 | | class AtomicExpand: public FunctionPass { |
60 | | const TargetLowering *TLI = nullptr; |
61 | | |
62 | | public: |
63 | | static char ID; // Pass identification, replacement for typeid |
64 | | |
65 | 30.6k | AtomicExpand() : FunctionPass(ID) { |
66 | 30.6k | initializeAtomicExpandPass(*PassRegistry::getPassRegistry()); |
67 | 30.6k | } |
68 | | |
69 | | bool runOnFunction(Function &F) override; |
70 | | |
71 | | private: |
72 | | bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); |
73 | | IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); |
74 | | LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); |
75 | | bool tryExpandAtomicLoad(LoadInst *LI); |
76 | | bool expandAtomicLoadToLL(LoadInst *LI); |
77 | | bool expandAtomicLoadToCmpXchg(LoadInst *LI); |
78 | | StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI); |
79 | | bool expandAtomicStore(StoreInst *SI); |
80 | | bool tryExpandAtomicRMW(AtomicRMWInst *AI); |
81 | | Value * |
82 | | insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr, |
83 | | AtomicOrdering MemOpOrder, |
84 | | function_ref<Value *(IRBuilder<> &, Value *)> PerformOp); |
85 | | void expandAtomicOpToLLSC( |
86 | | Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder, |
87 | | function_ref<Value *(IRBuilder<> &, Value *)> PerformOp); |
88 | | void expandPartwordAtomicRMW( |
89 | | AtomicRMWInst *I, |
90 | | TargetLoweringBase::AtomicExpansionKind ExpansionKind); |
91 | | void expandPartwordCmpXchg(AtomicCmpXchgInst *I); |
92 | | |
93 | | AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI); |
94 | | static Value *insertRMWCmpXchgLoop( |
95 | | IRBuilder<> &Builder, Type *ResultType, Value *Addr, |
96 | | AtomicOrdering MemOpOrder, |
97 | | function_ref<Value *(IRBuilder<> &, Value *)> PerformOp, |
98 | | CreateCmpXchgInstFun CreateCmpXchg); |
99 | | |
100 | | bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); |
101 | | bool isIdempotentRMW(AtomicRMWInst *AI); |
102 | | bool simplifyIdempotentRMW(AtomicRMWInst *AI); |
103 | | |
104 | | bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align, |
105 | | Value *PointerOperand, Value *ValueOperand, |
106 | | Value *CASExpected, AtomicOrdering Ordering, |
107 | | AtomicOrdering Ordering2, |
108 | | ArrayRef<RTLIB::Libcall> Libcalls); |
109 | | void expandAtomicLoadToLibcall(LoadInst *LI); |
110 | | void expandAtomicStoreToLibcall(StoreInst *LI); |
111 | | void expandAtomicRMWToLibcall(AtomicRMWInst *I); |
112 | | void expandAtomicCASToLibcall(AtomicCmpXchgInst *I); |
113 | | |
114 | | friend bool |
115 | | llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, |
116 | | CreateCmpXchgInstFun CreateCmpXchg); |
117 | | }; |
118 | | |
119 | | } // end anonymous namespace |
120 | | |
121 | | char AtomicExpand::ID = 0; |
122 | | |
123 | | char &llvm::AtomicExpandID = AtomicExpand::ID; |
124 | | |
125 | | INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", |
126 | | false, false) |
127 | | |
128 | 30.6k | FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); } |
129 | | |
130 | | // Helper functions to retrieve the size of atomic instructions. |
131 | 1.00k | static unsigned getAtomicOpSize(LoadInst *LI) { |
132 | 1.00k | const DataLayout &DL = LI->getModule()->getDataLayout(); |
133 | 1.00k | return DL.getTypeStoreSize(LI->getType()); |
134 | 1.00k | } |
135 | | |
136 | 12.1k | static unsigned getAtomicOpSize(StoreInst *SI) { |
137 | 12.1k | const DataLayout &DL = SI->getModule()->getDataLayout(); |
138 | 12.1k | return DL.getTypeStoreSize(SI->getValueOperand()->getType()); |
139 | 12.1k | } |
140 | | |
141 | 81.4k | static unsigned getAtomicOpSize(AtomicRMWInst *RMWI) { |
142 | 81.4k | const DataLayout &DL = RMWI->getModule()->getDataLayout(); |
143 | 81.4k | return DL.getTypeStoreSize(RMWI->getValOperand()->getType()); |
144 | 81.4k | } |
145 | | |
146 | 20.0k | static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) { |
147 | 20.0k | const DataLayout &DL = CASI->getModule()->getDataLayout(); |
148 | 20.0k | return DL.getTypeStoreSize(CASI->getCompareOperand()->getType()); |
149 | 20.0k | } |
150 | | |
151 | | // Helper functions to retrieve the alignment of atomic instructions. |
152 | 1.00k | static unsigned getAtomicOpAlign(LoadInst *LI) { |
153 | 1.00k | unsigned Align = LI->getAlignment(); |
154 | 1.00k | // In the future, if this IR restriction is relaxed, we should |
155 | 1.00k | // return DataLayout::getABITypeAlignment when there's no align |
156 | 1.00k | // value. |
157 | 1.00k | assert(Align != 0 && "An atomic LoadInst always has an explicit alignment"); |
158 | 1.00k | return Align; |
159 | 1.00k | } |
160 | | |
161 | 12.1k | static unsigned getAtomicOpAlign(StoreInst *SI) { |
162 | 12.1k | unsigned Align = SI->getAlignment(); |
163 | 12.1k | // In the future, if this IR restriction is relaxed, we should |
164 | 12.1k | // return DataLayout::getABITypeAlignment when there's no align |
165 | 12.1k | // value. |
166 | 12.1k | assert(Align != 0 && "An atomic StoreInst always has an explicit alignment"); |
167 | 12.1k | return Align; |
168 | 12.1k | } |
169 | | |
170 | 42.2k | static unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) { |
171 | 42.2k | // TODO(PR27168): This instruction has no alignment attribute, but unlike the |
172 | 42.2k | // default alignment for load/store, the default here is to assume |
173 | 42.2k | // it has NATURAL alignment, not DataLayout-specified alignment. |
174 | 42.2k | const DataLayout &DL = RMWI->getModule()->getDataLayout(); |
175 | 42.2k | return DL.getTypeStoreSize(RMWI->getValOperand()->getType()); |
176 | 42.2k | } |
177 | | |
178 | 10.0k | static unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) { |
179 | 10.0k | // TODO(PR27168): same comment as above. |
180 | 10.0k | const DataLayout &DL = CASI->getModule()->getDataLayout(); |
181 | 10.0k | return DL.getTypeStoreSize(CASI->getCompareOperand()->getType()); |
182 | 10.0k | } |
183 | | |
184 | | // Determine if a particular atomic operation has a supported size, |
185 | | // and is of appropriate alignment, to be passed through for target |
186 | | // lowering. (Versus turning into a __atomic libcall) |
187 | | template <typename Inst> |
188 | 65.4k | static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) { |
189 | 65.4k | unsigned Size = getAtomicOpSize(I); |
190 | 65.4k | unsigned Align = getAtomicOpAlign(I); |
191 | 65.4k | return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8; |
192 | 65.4k | } |
 | | [Per-instantiation coverage for atomicSizeSupported (source lines 188-192): LoadInst 999, StoreInst 12.1k, AtomicRMWInst 42.2k, AtomicCmpXchgInst 10.0k executions] |
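To make the predicate above concrete, here is a minimal standalone sketch of the same arithmetic with plain integers (the 64-bit limit is an assumed example value for getMaxAtomicSizeInBitsSupported(), not something this file fixes): a naturally aligned i64 operation passes the check, while an i128 or under-aligned operation on the same target is routed to the __atomic_* libcalls instead.

    #include <cstdio>

    // Mirrors atomicSizeSupported(): keep the operation for target lowering only
    // if it is at least naturally aligned and no wider than the target's maximum
    // lock-free atomic width.
    static bool sizeSupported(unsigned SizeBytes, unsigned AlignBytes,
                              unsigned MaxAtomicSizeInBits) {
      return AlignBytes >= SizeBytes && SizeBytes <= MaxAtomicSizeInBits / 8;
    }

    int main() {
      std::printf("%d\n", sizeSupported(8, 8, 64));   // i64, 8-byte aligned -> 1 (kept)
      std::printf("%d\n", sizeSupported(16, 16, 64)); // i128 -> 0 (expanded to a libcall)
      std::printf("%d\n", sizeSupported(4, 1, 64));   // under-aligned i32 -> 0 (libcall)
      return 0;
    }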
193 | | |
194 | 570k | bool AtomicExpand::runOnFunction(Function &F) { |
195 | 570k | auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); |
196 | 570k | if (!TPC) |
197 | 0 | return false; |
198 | 570k | |
199 | 570k | auto &TM = TPC->getTM<TargetMachine>(); |
200 | 570k | if (!TM.getSubtargetImpl(F)->enableAtomicExpand()) |
201 | 4.07k | return false; |
202 | 566k | TLI = TM.getSubtargetImpl(F)->getTargetLowering(); |
203 | 566k | |
204 | 566k | SmallVector<Instruction *, 1> AtomicInsts; |
205 | 566k | |
206 | 566k | // Changing control-flow while iterating through it is a bad idea, so gather a |
207 | 566k | // list of all atomic instructions before we start. |
208 | 20.0M | for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E20.0M ; ++II19.5M ) { |
209 | 19.5M | Instruction *I = &*II; |
210 | 19.5M | if (I->isAtomic() && 19.5M !isa<FenceInst>(I)68.9k ) |
211 | 65.4k | AtomicInsts.push_back(I); |
212 | 19.5M | } |
213 | 566k | |
214 | 566k | bool MadeChange = false; |
215 | 65.4k | for (auto I : AtomicInsts) { |
216 | 65.4k | auto LI = dyn_cast<LoadInst>(I); |
217 | 65.4k | auto SI = dyn_cast<StoreInst>(I); |
218 | 65.4k | auto RMWI = dyn_cast<AtomicRMWInst>(I); |
219 | 65.4k | auto CASI = dyn_cast<AtomicCmpXchgInst>(I); |
220 | 65.4k | assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction"); |
221 | 65.4k | |
222 | 65.4k | // If the Size/Alignment is not supported, replace with a libcall. |
223 | 65.4k | if (LI65.4k ) { |
224 | 999 | if (!atomicSizeSupported(TLI, LI)999 ) { |
225 | 4 | expandAtomicLoadToLibcall(LI); |
226 | 4 | MadeChange = true; |
227 | 4 | continue; |
228 | 4 | } |
229 | 64.4k | } else if (64.4k SI64.4k ) { |
230 | 12.1k | if (!atomicSizeSupported(TLI, SI)12.1k ) { |
231 | 5 | expandAtomicStoreToLibcall(SI); |
232 | 5 | MadeChange = true; |
233 | 5 | continue; |
234 | 5 | } |
235 | 52.2k | } else if (52.2k RMWI52.2k ) { |
236 | 42.2k | if (!atomicSizeSupported(TLI, RMWI)42.2k ) { |
237 | 4 | expandAtomicRMWToLibcall(RMWI); |
238 | 4 | MadeChange = true; |
239 | 4 | continue; |
240 | 4 | } |
241 | 10.0k | } else if (10.0k CASI10.0k ) { |
242 | 10.0k | if (!atomicSizeSupported(TLI, CASI)10.0k ) { |
243 | 3 | expandAtomicCASToLibcall(CASI); |
244 | 3 | MadeChange = true; |
245 | 3 | continue; |
246 | 3 | } |
247 | 65.3k | } |
248 | 65.3k | |
249 | 65.3k | if (65.3k TLI->shouldInsertFencesForAtomic(I)65.3k ) { |
250 | 3.07k | auto FenceOrdering = AtomicOrdering::Monotonic; |
251 | 3.07k | if (LI && 3.07k isAcquireOrStronger(LI->getOrdering())260 ) { |
252 | 191 | FenceOrdering = LI->getOrdering(); |
253 | 191 | LI->setOrdering(AtomicOrdering::Monotonic); |
254 | 3.07k | } else if (2.88k SI && 2.88k isReleaseOrStronger(SI->getOrdering())320 ) { |
255 | 233 | FenceOrdering = SI->getOrdering(); |
256 | 233 | SI->setOrdering(AtomicOrdering::Monotonic); |
257 | 2.88k | } else if (2.65k RMWI && 2.65k (isReleaseOrStronger(RMWI->getOrdering()) || |
258 | 2.65k | isAcquireOrStronger(RMWI->getOrdering())764 )) { |
259 | 1.37k | FenceOrdering = RMWI->getOrdering(); |
260 | 1.37k | RMWI->setOrdering(AtomicOrdering::Monotonic); |
261 | 2.65k | } else if (1.28k CASI && 1.28k !TLI->shouldExpandAtomicCmpXchgInIR(CASI)632 && |
262 | 188 | (isReleaseOrStronger(CASI->getSuccessOrdering()) || |
263 | 1.28k | isAcquireOrStronger(CASI->getSuccessOrdering())80 )) { |
264 | 127 | // If a compare and swap is lowered to LL/SC, we can do smarter fence |
265 | 127 | // insertion, with a stronger one on the success path than on the |
266 | 127 | // failure path. As a result, fence insertion is directly done by |
267 | 127 | // expandAtomicCmpXchg in that case. |
268 | 127 | FenceOrdering = CASI->getSuccessOrdering(); |
269 | 127 | CASI->setSuccessOrdering(AtomicOrdering::Monotonic); |
270 | 127 | CASI->setFailureOrdering(AtomicOrdering::Monotonic); |
271 | 127 | } |
272 | 3.07k | |
273 | 3.07k | if (FenceOrdering != AtomicOrdering::Monotonic3.07k ) { |
274 | 1.92k | MadeChange |= bracketInstWithFences(I, FenceOrdering); |
275 | 1.92k | } |
276 | 3.07k | } |
277 | 65.3k | |
278 | 65.3k | if (LI65.3k ) { |
279 | 995 | if (LI->getType()->isFloatingPointTy()995 ) { |
280 | 14 | // TODO: add a TLI hook to control this so that each target can |
281 | 14 | // convert to lowering the original type one at a time. |
282 | 14 | LI = convertAtomicLoadToIntegerType(LI); |
283 | 14 | assert(LI->getType()->isIntegerTy() && "invariant broken"); |
284 | 14 | MadeChange = true; |
285 | 14 | } |
286 | 995 | |
287 | 995 | MadeChange |= tryExpandAtomicLoad(LI); |
288 | 65.3k | } else if (64.3k SI64.3k ) { |
289 | 12.1k | if (SI->getValueOperand()->getType()->isFloatingPointTy()12.1k ) { |
290 | 14 | // TODO: add a TLI hook to control this so that each target can |
291 | 14 | // convert to lowering the original type one at a time. |
292 | 14 | SI = convertAtomicStoreToIntegerType(SI); |
293 | 14 | assert(SI->getValueOperand()->getType()->isIntegerTy() && |
294 | 14 | "invariant broken"); |
295 | 14 | MadeChange = true; |
296 | 14 | } |
297 | 12.1k | |
298 | 12.1k | if (TLI->shouldExpandAtomicStoreInIR(SI)) |
299 | 101 | MadeChange |= expandAtomicStore(SI); |
300 | 64.3k | } else if (52.2k RMWI52.2k ) { |
301 | 42.2k | // There are two different ways of expanding RMW instructions: |
302 | 42.2k | // - into a load if it is idempotent |
303 | 42.2k | // - into a Cmpxchg/LL-SC loop otherwise |
304 | 42.2k | // we try them in that order. |
305 | 42.2k | |
306 | 42.2k | if (isIdempotentRMW(RMWI) && 42.2k simplifyIdempotentRMW(RMWI)14 ) { |
307 | 11 | MadeChange = true; |
308 | 42.2k | } else { |
309 | 42.2k | MadeChange |= tryExpandAtomicRMW(RMWI); |
310 | 42.2k | } |
311 | 52.2k | } else if (10.0k CASI10.0k ) { |
312 | 10.0k | // TODO: when we're ready to make the change at the IR level, we can |
313 | 10.0k | // extend convertCmpXchgToInteger for floating point too. |
314 | 10.0k | assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() && |
315 | 10.0k | "unimplemented - floating point not legal at IR level"); |
316 | 10.0k | if (CASI->getCompareOperand()->getType()->isPointerTy()10.0k ) { |
317 | 6 | // TODO: add a TLI hook to control this so that each target can |
318 | 6 | // convert to lowering the original type one at a time. |
319 | 6 | CASI = convertCmpXchgToIntegerType(CASI); |
320 | 6 | assert(CASI->getCompareOperand()->getType()->isIntegerTy() && |
321 | 6 | "invariant broken"); |
322 | 6 | MadeChange = true; |
323 | 6 | } |
324 | 10.0k | |
325 | 10.0k | unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; |
326 | 10.0k | unsigned ValueSize = getAtomicOpSize(CASI); |
327 | 10.0k | if (ValueSize < MinCASSize10.0k ) { |
328 | 4 | assert(!TLI->shouldExpandAtomicCmpXchgInIR(CASI) && |
329 | 4 | "MinCmpXchgSizeInBits not yet supported for LL/SC expansions."); |
330 | 4 | expandPartwordCmpXchg(CASI); |
331 | 10.0k | } else { |
332 | 10.0k | if (TLI->shouldExpandAtomicCmpXchgInIR(CASI)) |
333 | 8.96k | MadeChange |= expandAtomicCmpXchg(CASI); |
334 | 10.0k | } |
335 | 64.3k | } |
336 | 65.4k | } |
337 | 570k | return MadeChange; |
338 | 570k | } |
339 | | |
340 | 1.92k | bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) { |
341 | 1.92k | IRBuilder<> Builder(I); |
342 | 1.92k | |
343 | 1.92k | auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order); |
344 | 1.92k | |
345 | 1.92k | auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order); |
346 | 1.92k | // We have a guard here because not every atomic operation generates a |
347 | 1.92k | // trailing fence. |
348 | 1.92k | if (TrailingFence) |
349 | 1.51k | TrailingFence->moveAfter(I); |
350 | 1.92k | |
351 | 471 | return (LeadingFence || TrailingFence); |
352 | 1.92k | } |
353 | | |
354 | | /// Get the iX type with the same bitwidth as T. |
355 | | IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T, |
356 | 34 | const DataLayout &DL) { |
357 | 34 | EVT VT = TLI->getValueType(DL, T); |
358 | 34 | unsigned BitWidth = VT.getStoreSizeInBits(); |
359 | 34 | assert(BitWidth == VT.getSizeInBits() && "must be a power of two"); |
360 | 34 | return IntegerType::get(T->getContext(), BitWidth); |
361 | 34 | } |
362 | | |
363 | | /// Convert an atomic load of a non-integral type to an integer load of the |
364 | | /// equivalent bitwidth. See the function comment on |
365 | | /// convertAtomicStoreToIntegerType for background. |
366 | 14 | LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) { |
367 | 14 | auto *M = LI->getModule(); |
368 | 14 | Type *NewTy = getCorrespondingIntegerType(LI->getType(), |
369 | 14 | M->getDataLayout()); |
370 | 14 | |
371 | 14 | IRBuilder<> Builder(LI); |
372 | 14 | |
373 | 14 | Value *Addr = LI->getPointerOperand(); |
374 | 14 | Type *PT = PointerType::get(NewTy, |
375 | 14 | Addr->getType()->getPointerAddressSpace()); |
376 | 14 | Value *NewAddr = Builder.CreateBitCast(Addr, PT); |
377 | 14 | |
378 | 14 | auto *NewLI = Builder.CreateLoad(NewAddr); |
379 | 14 | NewLI->setAlignment(LI->getAlignment()); |
380 | 14 | NewLI->setVolatile(LI->isVolatile()); |
381 | 14 | NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID()); |
382 | 14 | DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n"); |
383 | 14 | |
384 | 14 | Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType()); |
385 | 14 | LI->replaceAllUsesWith(NewVal); |
386 | 14 | LI->eraseFromParent(); |
387 | 14 | return NewLI; |
388 | 14 | } |
389 | | |
390 | 1.00k | bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) { |
391 | 1.00k | switch (TLI->shouldExpandAtomicLoadInIR(LI)) { |
392 | 911 | case TargetLoweringBase::AtomicExpansionKind::None: |
393 | 911 | return false; |
394 | 2 | case TargetLoweringBase::AtomicExpansionKind::LLSC: |
395 | 2 | expandAtomicOpToLLSC( |
396 | 2 | LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(), |
397 | 2 | [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; }); |
398 | 2 | return true; |
399 | 50 | case TargetLoweringBase::AtomicExpansionKind::LLOnly: |
400 | 50 | return expandAtomicLoadToLL(LI); |
401 | 43 | case TargetLoweringBase::AtomicExpansionKind::CmpXChg: |
402 | 43 | return expandAtomicLoadToCmpXchg(LI); |
403 | 0 | } |
404 | 0 | llvm_unreachable0 ("Unhandled case in tryExpandAtomicLoad"); |
405 | 0 | } |
406 | | |
407 | 50 | bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) { |
408 | 50 | IRBuilder<> Builder(LI); |
409 | 50 | |
410 | 50 | // On some architectures, load-linked instructions are atomic for larger |
411 | 50 | // sizes than normal loads. For example, the only 64-bit load guaranteed |
412 | 50 | // to be single-copy atomic by ARM is an ldrexd (A3.5.3). |
413 | 50 | Value *Val = |
414 | 50 | TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering()); |
415 | 50 | TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder); |
416 | 50 | |
417 | 50 | LI->replaceAllUsesWith(Val); |
418 | 50 | LI->eraseFromParent(); |
419 | 50 | |
420 | 50 | return true; |
421 | 50 | } |
422 | | |
423 | 43 | bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) { |
424 | 43 | IRBuilder<> Builder(LI); |
425 | 43 | AtomicOrdering Order = LI->getOrdering(); |
426 | 43 | Value *Addr = LI->getPointerOperand(); |
427 | 43 | Type *Ty = cast<PointerType>(Addr->getType())->getElementType(); |
428 | 43 | Constant *DummyVal = Constant::getNullValue(Ty); |
429 | 43 | |
430 | 43 | Value *Pair = Builder.CreateAtomicCmpXchg( |
431 | 43 | Addr, DummyVal, DummyVal, Order, |
432 | 43 | AtomicCmpXchgInst::getStrongestFailureOrdering(Order)); |
433 | 43 | Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded"); |
434 | 43 | |
435 | 43 | LI->replaceAllUsesWith(Loaded); |
436 | 43 | LI->eraseFromParent(); |
437 | 43 | |
438 | 43 | return true; |
439 | 43 | } |
440 | | |
441 | | /// Convert an atomic store of a non-integral type to an integer store of the |
442 | | /// equivalent bitwidth. We used to not support floating point or vector |
443 | | /// atomics in the IR at all. The backends learned to deal with the bitcast |
444 | | /// idiom because that was the only way of expressing the notion of an atomic |
445 | | /// float or vector store. The long term plan is to teach each backend to |
446 | | /// instruction select from the original atomic store, but as a migration |
447 | | /// mechanism, we convert back to the old format which the backends understand. |
448 | | /// Each backend will need individual work to recognize the new format. |
449 | 14 | StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) { |
450 | 14 | IRBuilder<> Builder(SI); |
451 | 14 | auto *M = SI->getModule(); |
452 | 14 | Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(), |
453 | 14 | M->getDataLayout()); |
454 | 14 | Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy); |
455 | 14 | |
456 | 14 | Value *Addr = SI->getPointerOperand(); |
457 | 14 | Type *PT = PointerType::get(NewTy, |
458 | 14 | Addr->getType()->getPointerAddressSpace()); |
459 | 14 | Value *NewAddr = Builder.CreateBitCast(Addr, PT); |
460 | 14 | |
461 | 14 | StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr); |
462 | 14 | NewSI->setAlignment(SI->getAlignment()); |
463 | 14 | NewSI->setVolatile(SI->isVolatile()); |
464 | 14 | NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); |
465 | 14 | DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n"); |
466 | 14 | SI->eraseFromParent(); |
467 | 14 | return NewSI; |
468 | 14 | } |
469 | | |
470 | 101 | bool AtomicExpand::expandAtomicStore(StoreInst *SI) { |
471 | 101 | // This function is only called on atomic stores that are too large to be |
472 | 101 | // atomic if implemented as a native store. So we replace them by an |
473 | 101 | // atomic swap, that can be implemented for example as a ldrex/strex on ARM |
474 | 101 | // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes. |
475 | 101 | // It is the responsibility of the target to only signal expansion via |
476 | 101 | // shouldExpandAtomicRMW in cases where this is required and possible. |
477 | 101 | IRBuilder<> Builder(SI); |
478 | 101 | AtomicRMWInst *AI = |
479 | 101 | Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), |
480 | 101 | SI->getValueOperand(), SI->getOrdering()); |
481 | 101 | SI->eraseFromParent(); |
482 | 101 | |
483 | 101 | // Now we have an appropriate swap instruction, lower it as usual. |
484 | 101 | return tryExpandAtomicRMW(AI); |
485 | 101 | } |
486 | | |
487 | | static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr, |
488 | | Value *Loaded, Value *NewVal, |
489 | | AtomicOrdering MemOpOrder, |
490 | 1.10k | Value *&Success, Value *&NewLoaded) { |
491 | 1.10k | Value* Pair = Builder.CreateAtomicCmpXchg( |
492 | 1.10k | Addr, Loaded, NewVal, MemOpOrder, |
493 | 1.10k | AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); |
494 | 1.10k | Success = Builder.CreateExtractValue(Pair, 1, "success"); |
495 | 1.10k | NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); |
496 | 1.10k | } |
497 | | |
498 | | /// Emit IR to implement the given atomicrmw operation on values in registers, |
499 | | /// returning the new value. |
500 | | static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, |
501 | 39.1k | Value *Loaded, Value *Inc) { |
502 | 39.1k | Value *NewVal; |
503 | 39.1k | switch (Op) { |
504 | 11.9k | case AtomicRMWInst::Xchg: |
505 | 11.9k | return Inc; |
506 | 19.7k | case AtomicRMWInst::Add: |
507 | 19.7k | return Builder.CreateAdd(Loaded, Inc, "new"); |
508 | 5.81k | case AtomicRMWInst::Sub: |
509 | 5.81k | return Builder.CreateSub(Loaded, Inc, "new"); |
510 | 455 | case AtomicRMWInst::And: |
511 | 455 | return Builder.CreateAnd(Loaded, Inc, "new"); |
512 | 83 | case AtomicRMWInst::Nand: |
513 | 83 | return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); |
514 | 457 | case AtomicRMWInst::Or: |
515 | 457 | return Builder.CreateOr(Loaded, Inc, "new"); |
516 | 454 | case AtomicRMWInst::Xor: |
517 | 454 | return Builder.CreateXor(Loaded, Inc, "new"); |
518 | 51 | case AtomicRMWInst::Max: |
519 | 51 | NewVal = Builder.CreateICmpSGT(Loaded, Inc); |
520 | 51 | return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); |
521 | 59 | case AtomicRMWInst::Min: |
522 | 59 | NewVal = Builder.CreateICmpSLE(Loaded, Inc); |
523 | 59 | return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); |
524 | 58 | case AtomicRMWInst::UMax: |
525 | 58 | NewVal = Builder.CreateICmpUGT(Loaded, Inc); |
526 | 58 | return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); |
527 | 59 | case AtomicRMWInst::UMin: |
528 | 59 | NewVal = Builder.CreateICmpULE(Loaded, Inc); |
529 | 59 | return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); |
530 | 0 | default: |
531 | 0 | llvm_unreachable("Unknown atomic op"); |
532 | 0 | } |
533 | 0 | } |
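As a plain-integer illustration of what a few of the less obvious cases above compute once lowered (a standalone sketch; the pass of course emits the corresponding IR instructions rather than running host code like this), note in particular how signed Max and unsigned UMin diverge on a negative operand:

    #include <cstdint>
    #include <cstdio>

    int main() {
      int32_t Loaded = -5, Inc = 3;
      int32_t Nand = ~(Loaded & Inc);                      // Nand: not(and(Loaded, Inc))
      int32_t Max = (Loaded > Inc) ? Loaded : Inc;         // Max: icmp sgt + select
      uint32_t UMin = ((uint32_t)Loaded <= (uint32_t)Inc)  // UMin: icmp ule + select,
                          ? (uint32_t)Loaded               // comparing the unsigned
                          : (uint32_t)Inc;                 // bit patterns
      std::printf("nand=%d max=%d umin=%u\n", Nand, Max, UMin); // nand=-4 max=3 umin=3
      return 0;
    }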
534 | | |
535 | 42.3k | bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { |
536 | 42.3k | switch (TLI->shouldExpandAtomicRMWInIR(AI)) { |
537 | 3.16k | case TargetLoweringBase::AtomicExpansionKind::None: |
538 | 3.16k | return false; |
539 | 38.0k | case TargetLoweringBase::AtomicExpansionKind::LLSC: { |
540 | 38.0k | unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; |
541 | 38.0k | unsigned ValueSize = getAtomicOpSize(AI); |
542 | 38.0k | if (ValueSize < MinCASSize38.0k ) { |
543 | 0 | llvm_unreachable( |
544 | 0 | "MinCmpXchgSizeInBits not yet supported for LL/SC architectures."); |
545 | 0 | } else { |
546 | 38.0k | auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) { |
547 | 38.0k | return performAtomicOp(AI->getOperation(), Builder, Loaded, |
548 | 38.0k | AI->getValOperand()); |
549 | 38.0k | }; |
550 | 38.0k | expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(), |
551 | 38.0k | AI->getOrdering(), PerformOp); |
552 | 38.0k | } |
553 | 38.0k | return true; |
554 | 38.0k | } |
555 | 1.10k | case TargetLoweringBase::AtomicExpansionKind::CmpXChg: { |
556 | 1.10k | unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; |
557 | 1.10k | unsigned ValueSize = getAtomicOpSize(AI); |
558 | 1.10k | if (ValueSize < MinCASSize1.10k ) { |
559 | 7 | expandPartwordAtomicRMW(AI, |
560 | 7 | TargetLoweringBase::AtomicExpansionKind::CmpXChg); |
561 | 1.10k | } else { |
562 | 1.09k | expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); |
563 | 1.09k | } |
564 | 1.10k | return true; |
565 | 38.0k | } |
566 | 0 | default: |
567 | 0 | llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); |
568 | 0 | } |
569 | 0 | } |
570 | | |
571 | | namespace { |
572 | | |
573 | | /// Result values from createMaskInstrs helper. |
574 | | struct PartwordMaskValues { |
575 | | Type *WordType; |
576 | | Type *ValueType; |
577 | | Value *AlignedAddr; |
578 | | Value *ShiftAmt; |
579 | | Value *Mask; |
580 | | Value *Inv_Mask; |
581 | | }; |
582 | | |
583 | | } // end anonymous namespace |
584 | | |
585 | | /// This is a helper function which builds instructions to provide |
586 | | /// values necessary for partword atomic operations. It takes an |
587 | | /// incoming address, Addr, and ValueType, and constructs the address, |
588 | | /// shift-amounts and masks needed to work with a larger value of size |
589 | | /// WordSize. |
590 | | /// |
591 | | /// AlignedAddr: Addr rounded down to a multiple of WordSize |
592 | | /// |
593 | | /// ShiftAmt: Number of bits to right-shift a WordSize value loaded |
594 | | /// from AlignedAddr for it to have the same value as if |
595 | | /// ValueType was loaded from Addr. |
596 | | /// |
597 | | /// Mask: Value to mask with the value loaded from AlignedAddr to |
598 | | /// include only the part that would've been loaded from Addr. |
599 | | /// |
600 | | /// Inv_Mask: The inverse of Mask. |
601 | | static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I, |
602 | | Type *ValueType, Value *Addr, |
603 | 11 | unsigned WordSize) { |
604 | 11 | PartwordMaskValues Ret; |
605 | 11 | |
606 | 11 | BasicBlock *BB = I->getParent(); |
607 | 11 | Function *F = BB->getParent(); |
608 | 11 | Module *M = I->getModule(); |
609 | 11 | |
610 | 11 | LLVMContext &Ctx = F->getContext(); |
611 | 11 | const DataLayout &DL = M->getDataLayout(); |
612 | 11 | |
613 | 11 | unsigned ValueSize = DL.getTypeStoreSize(ValueType); |
614 | 11 | |
615 | 11 | assert(ValueSize < WordSize); |
616 | 11 | |
617 | 11 | Ret.ValueType = ValueType; |
618 | 11 | Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8); |
619 | 11 | |
620 | 11 | Type *WordPtrType = |
621 | 11 | Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace()); |
622 | 11 | |
623 | 11 | Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx)); |
624 | 11 | Ret.AlignedAddr = Builder.CreateIntToPtr( |
625 | 11 | Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType, |
626 | 11 | "AlignedAddr"); |
627 | 11 | |
628 | 11 | Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB"); |
629 | 11 | if (DL.isLittleEndian()11 ) { |
630 | 0 | // turn bytes into bits |
631 | 0 | Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3); |
632 | 11 | } else { |
633 | 11 | // turn bytes into bits, and count from the other side. |
634 | 11 | Ret.ShiftAmt = |
635 | 11 | Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3); |
636 | 11 | } |
637 | 11 | |
638 | 11 | Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt"); |
639 | 11 | Ret.Mask = Builder.CreateShl( |
640 | 11 | ConstantInt::get(Ret.WordType, (1 << ValueSize * 8) - 1), Ret.ShiftAmt, |
641 | 11 | "Mask"); |
642 | 11 | Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask"); |
643 | 11 | |
644 | 11 | return Ret; |
645 | 11 | } |
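To see the address/shift/mask construction above with concrete numbers, here is a minimal standalone sketch of the same arithmetic for a 1-byte value at byte offset 2 within a 4-byte word (the address, sizes, and endianness are example values; the pass builds the equivalent IR instructions instead):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t Addr = 0x1002;  // example address of the i8 being accessed
      const unsigned WordSize = 4;   // e.g. TLI->getMinCmpXchgSizeInBits() / 8
      const unsigned ValueSize = 1;  // store size of the original value type
      const bool IsLittleEndian = false;

      uint64_t AlignedAddr = Addr & ~(uint64_t)(WordSize - 1);  // 0x1000
      uint64_t PtrLSB = Addr & (WordSize - 1);                  // 2
      // Bytes to bits; big-endian counts the offset from the other end of the word.
      uint64_t ShiftAmt = (IsLittleEndian ? PtrLSB
                                          : (PtrLSB ^ (WordSize - ValueSize))) * 8;  // 8
      uint64_t Mask = (uint64_t)((1u << (ValueSize * 8)) - 1) << ShiftAmt;           // 0xFF00
      uint64_t Inv_Mask = ~Mask & 0xFFFFFFFFu;                                       // 0xFFFF00FF

      std::printf("AlignedAddr=%#llx ShiftAmt=%llu Mask=%#llx Inv_Mask=%#llx\n",
                  (unsigned long long)AlignedAddr, (unsigned long long)ShiftAmt,
                  (unsigned long long)Mask, (unsigned long long)Inv_Mask);
      return 0;
    }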
646 | | |
647 | | /// Emit IR to implement a masked version of a given atomicrmw |
648 | | /// operation. (That is, only the bits under the Mask should be |
649 | | /// affected by the operation) |
650 | | static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op, |
651 | | IRBuilder<> &Builder, Value *Loaded, |
652 | | Value *Shifted_Inc, Value *Inc, |
653 | 7 | const PartwordMaskValues &PMV) { |
654 | 7 | switch (Op) { |
655 | 2 | case AtomicRMWInst::Xchg: { |
656 | 2 | Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask); |
657 | 2 | Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc); |
658 | 2 | return FinalVal; |
659 | 7 | } |
660 | 1 | case AtomicRMWInst::Or: |
661 | 1 | case AtomicRMWInst::Xor: |
662 | 1 | // Or/Xor won't affect any other bits, so can just be done |
663 | 1 | // directly. |
664 | 1 | return performAtomicOp(Op, Builder, Loaded, Shifted_Inc); |
665 | 3 | case AtomicRMWInst::Add: |
666 | 3 | case AtomicRMWInst::Sub: |
667 | 3 | case AtomicRMWInst::And: |
668 | 3 | case AtomicRMWInst::Nand: { |
669 | 3 | // The other arithmetic ops need to be masked into place. |
670 | 3 | Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc); |
671 | 3 | Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask); |
672 | 3 | Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask); |
673 | 3 | Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked); |
674 | 3 | return FinalVal; |
675 | 3 | } |
676 | 1 | case AtomicRMWInst::Max: |
677 | 1 | case AtomicRMWInst::Min: |
678 | 1 | case AtomicRMWInst::UMax: |
679 | 1 | case AtomicRMWInst::UMin: { |
680 | 1 | // Finally, comparison ops will operate on the full value, so |
681 | 1 | // truncate down to the original size, and expand out again after |
682 | 1 | // doing the operation. |
683 | 1 | Value *Loaded_Shiftdown = Builder.CreateTrunc( |
684 | 1 | Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType); |
685 | 1 | Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc); |
686 | 1 | Value *NewVal_Shiftup = Builder.CreateShl( |
687 | 1 | Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt); |
688 | 1 | Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask); |
689 | 1 | Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup); |
690 | 1 | return FinalVal; |
691 | 1 | } |
692 | 0 | default: |
693 | 0 | llvm_unreachable("Unknown atomic op"); |
694 | 0 | } |
695 | 0 | } |
696 | | |
697 | | /// Expand a sub-word atomicrmw operation into an appropriate |
698 | | /// word-sized operation. |
699 | | /// |
700 | | /// It will create an LL/SC or cmpxchg loop, as appropriate, the same |
701 | | /// way as a typical atomicrmw expansion. The only difference here is |
702 | | /// that the operation inside of the loop must operate only upon a |
703 | | /// part of the value. |
704 | | void AtomicExpand::expandPartwordAtomicRMW( |
705 | 7 | AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) { |
706 | 7 | assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg); |
707 | 7 | |
708 | 7 | AtomicOrdering MemOpOrder = AI->getOrdering(); |
709 | 7 | |
710 | 7 | IRBuilder<> Builder(AI); |
711 | 7 | |
712 | 7 | PartwordMaskValues PMV = |
713 | 7 | createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(), |
714 | 7 | TLI->getMinCmpXchgSizeInBits() / 8); |
715 | 7 | |
716 | 7 | Value *ValOperand_Shifted = |
717 | 7 | Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType), |
718 | 7 | PMV.ShiftAmt, "ValOperand_Shifted"); |
719 | 7 | |
720 | 7 | auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) { |
721 | 7 | return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded, |
722 | 7 | ValOperand_Shifted, AI->getValOperand(), PMV); |
723 | 7 | }; |
724 | 7 | |
725 | 7 | // TODO: When we're ready to support LLSC conversions too, use |
726 | 7 | // insertRMWLLSCLoop here for ExpansionKind==LLSC. |
727 | 7 | Value *OldResult = |
728 | 7 | insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder, |
729 | 7 | PerformPartwordOp, createCmpXchgInstFun); |
730 | 7 | Value *FinalOldResult = Builder.CreateTrunc( |
731 | 7 | Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType); |
732 | 7 | AI->replaceAllUsesWith(FinalOldResult); |
733 | 7 | AI->eraseFromParent(); |
734 | 7 | } |
735 | | |
736 | 4 | void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) { |
737 | 4 | // The basic idea here is that we're expanding a cmpxchg of a |
738 | 4 | // smaller memory size up to a word-sized cmpxchg. To do this, we |
739 | 4 | // need to add a retry-loop for strong cmpxchg, so that |
740 | 4 | // modifications to other parts of the word don't cause a spurious |
741 | 4 | // failure. |
742 | 4 | |
743 | 4 | // This generates code like the following: |
744 | 4 | // [[Setup mask values PMV.*]] |
745 | 4 | // %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt |
746 | 4 | // %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt |
747 | 4 | // %InitLoaded = load i32* %addr |
748 | 4 | // %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask |
749 | 4 | // br partword.cmpxchg.loop |
750 | 4 | // partword.cmpxchg.loop: |
751 | 4 | // %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ], |
752 | 4 | // [ %OldVal_MaskOut, %partword.cmpxchg.failure ] |
753 | 4 | // %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted |
754 | 4 | // %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted |
755 | 4 | // %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp, |
756 | 4 | // i32 %FullWord_NewVal success_ordering failure_ordering |
757 | 4 | // %OldVal = extractvalue { i32, i1 } %NewCI, 0 |
758 | 4 | // %Success = extractvalue { i32, i1 } %NewCI, 1 |
759 | 4 | // br i1 %Success, label %partword.cmpxchg.end, |
760 | 4 | // label %partword.cmpxchg.failure |
761 | 4 | // partword.cmpxchg.failure: |
762 | 4 | // %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask |
763 | 4 | // %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut |
764 | 4 | // br i1 %ShouldContinue, label %partword.cmpxchg.loop, |
765 | 4 | // label %partword.cmpxchg.end |
766 | 4 | // partword.cmpxchg.end: |
767 | 4 | // %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt |
768 | 4 | // %FinalOldVal = trunc i32 %tmp1 to i8 |
769 | 4 | // %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0 |
770 | 4 | // %Res = insertvalue { i8, i1 } %tmp2, i1 %Success, 1 |
771 | 4 | |
772 | 4 | Value *Addr = CI->getPointerOperand(); |
773 | 4 | Value *Cmp = CI->getCompareOperand(); |
774 | 4 | Value *NewVal = CI->getNewValOperand(); |
775 | 4 | |
776 | 4 | BasicBlock *BB = CI->getParent(); |
777 | 4 | Function *F = BB->getParent(); |
778 | 4 | IRBuilder<> Builder(CI); |
779 | 4 | LLVMContext &Ctx = Builder.getContext(); |
780 | 4 | |
781 | 4 | const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8; |
782 | 4 | |
783 | 4 | BasicBlock *EndBB = |
784 | 4 | BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end"); |
785 | 4 | auto FailureBB = |
786 | 4 | BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB); |
787 | 4 | auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB); |
788 | 4 | |
789 | 4 | // The split call above "helpfully" added a branch at the end of BB |
790 | 4 | // (to the wrong place). |
791 | 4 | std::prev(BB->end())->eraseFromParent(); |
792 | 4 | Builder.SetInsertPoint(BB); |
793 | 4 | |
794 | 4 | PartwordMaskValues PMV = createMaskInstrs( |
795 | 4 | Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize); |
796 | 4 | |
797 | 4 | // Shift the incoming values over, into the right location in the word. |
798 | 4 | Value *NewVal_Shifted = |
799 | 4 | Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt); |
800 | 4 | Value *Cmp_Shifted = |
801 | 4 | Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt); |
802 | 4 | |
803 | 4 | // Load the entire current word, and mask into place the expected and new |
804 | 4 | // values |
805 | 4 | LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr); |
806 | 4 | InitLoaded->setVolatile(CI->isVolatile()); |
807 | 4 | Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask); |
808 | 4 | Builder.CreateBr(LoopBB); |
809 | 4 | |
810 | 4 | // partword.cmpxchg.loop: |
811 | 4 | Builder.SetInsertPoint(LoopBB); |
812 | 4 | PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2); |
813 | 4 | Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB); |
814 | 4 | |
815 | 4 | // Mask/Or the expected and new values into place in the loaded word. |
816 | 4 | Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted); |
817 | 4 | Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted); |
818 | 4 | AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg( |
819 | 4 | PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(), |
820 | 4 | CI->getFailureOrdering(), CI->getSyncScopeID()); |
821 | 4 | NewCI->setVolatile(CI->isVolatile()); |
822 | 4 | // When we're building a strong cmpxchg, we need a loop, so you |
823 | 4 | // might think we could use a weak cmpxchg inside. But, using strong |
824 | 4 | // allows the below comparison for ShouldContinue, and we're |
825 | 4 | // expecting the underlying cmpxchg to be a machine instruction, |
826 | 4 | // which is strong anyways. |
827 | 4 | NewCI->setWeak(CI->isWeak()); |
828 | 4 | |
829 | 4 | Value *OldVal = Builder.CreateExtractValue(NewCI, 0); |
830 | 4 | Value *Success = Builder.CreateExtractValue(NewCI, 1); |
831 | 4 | |
832 | 4 | if (CI->isWeak()) |
833 | 0 | Builder.CreateBr(EndBB); |
834 | 4 | else |
835 | 4 | Builder.CreateCondBr(Success, EndBB, FailureBB); |
836 | 4 | |
837 | 4 | // partword.cmpxchg.failure: |
838 | 4 | Builder.SetInsertPoint(FailureBB); |
839 | 4 | // Upon failure, verify that the masked-out part of the loaded value |
840 | 4 | // has been modified. If it hasn't, abort the cmpxchg, since the |
841 | 4 | // failure must have come from the masked-in part. |
842 | 4 | Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask); |
843 | 4 | Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut); |
844 | 4 | Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB); |
845 | 4 | |
846 | 4 | // Add the second value to the phi from above |
847 | 4 | Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB); |
848 | 4 | |
849 | 4 | // partword.cmpxchg.end: |
850 | 4 | Builder.SetInsertPoint(CI); |
851 | 4 | |
852 | 4 | Value *FinalOldVal = Builder.CreateTrunc( |
853 | 4 | Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType); |
854 | 4 | Value *Res = UndefValue::get(CI->getType()); |
855 | 4 | Res = Builder.CreateInsertValue(Res, FinalOldVal, 0); |
856 | 4 | Res = Builder.CreateInsertValue(Res, Success, 1); |
857 | 4 | |
858 | 4 | CI->replaceAllUsesWith(Res); |
859 | 4 | CI->eraseFromParent(); |
860 | 4 | } |
861 | | |
862 | | void AtomicExpand::expandAtomicOpToLLSC( |
863 | | Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder, |
864 | 38.0k | function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) { |
865 | 38.0k | IRBuilder<> Builder(I); |
866 | 38.0k | Value *Loaded = |
867 | 38.0k | insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp); |
868 | 38.0k | |
869 | 38.0k | I->replaceAllUsesWith(Loaded); |
870 | 38.0k | I->eraseFromParent(); |
871 | 38.0k | } |
872 | | |
873 | | Value *AtomicExpand::insertRMWLLSCLoop( |
874 | | IRBuilder<> &Builder, Type *ResultTy, Value *Addr, |
875 | | AtomicOrdering MemOpOrder, |
876 | 38.0k | function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) { |
877 | 38.0k | LLVMContext &Ctx = Builder.getContext(); |
878 | 38.0k | BasicBlock *BB = Builder.GetInsertBlock(); |
879 | 38.0k | Function *F = BB->getParent(); |
880 | 38.0k | |
881 | 38.0k | // Given: atomicrmw some_op iN* %addr, iN %incr ordering |
882 | 38.0k | // |
883 | 38.0k | // The standard expansion we produce is: |
884 | 38.0k | // [...] |
885 | 38.0k | // atomicrmw.start: |
886 | 38.0k | // %loaded = @load.linked(%addr) |
887 | 38.0k | // %new = some_op iN %loaded, %incr |
888 | 38.0k | // %stored = @store_conditional(%new, %addr) |
889 | 38.0k | // %try_again = icmp i32 ne %stored, 0 |
890 | 38.0k | // br i1 %try_again, label %loop, label %atomicrmw.end |
891 | 38.0k | // atomicrmw.end: |
892 | 38.0k | // [...] |
893 | 38.0k | BasicBlock *ExitBB = |
894 | 38.0k | BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); |
895 | 38.0k | BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); |
896 | 38.0k | |
897 | 38.0k | // The split call above "helpfully" added a branch at the end of BB (to the |
898 | 38.0k | // wrong place). |
899 | 38.0k | std::prev(BB->end())->eraseFromParent(); |
900 | 38.0k | Builder.SetInsertPoint(BB); |
901 | 38.0k | Builder.CreateBr(LoopBB); |
902 | 38.0k | |
903 | 38.0k | // Start the main loop block now that we've taken care of the preliminaries. |
904 | 38.0k | Builder.SetInsertPoint(LoopBB); |
905 | 38.0k | Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); |
906 | 38.0k | |
907 | 38.0k | Value *NewVal = PerformOp(Builder, Loaded); |
908 | 38.0k | |
909 | 38.0k | Value *StoreSuccess = |
910 | 38.0k | TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); |
911 | 38.0k | Value *TryAgain = Builder.CreateICmpNE( |
912 | 38.0k | StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); |
913 | 38.0k | Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); |
914 | 38.0k | |
915 | 38.0k | Builder.SetInsertPoint(ExitBB, ExitBB->begin()); |
916 | 38.0k | return Loaded; |
917 | 38.0k | } |
918 | | |
919 | | /// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of |
920 | | /// the equivalent bitwidth. We used to not support pointer cmpxchg in the |
922 | | /// IR. As a migration step, we convert back to what used to be the standard |
922 | | /// way to represent a pointer cmpxchg so that we can update backends one by |
923 | | /// one. |
924 | 6 | AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) { |
925 | 6 | auto *M = CI->getModule(); |
926 | 6 | Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(), |
927 | 6 | M->getDataLayout()); |
928 | 6 | |
929 | 6 | IRBuilder<> Builder(CI); |
930 | 6 | |
931 | 6 | Value *Addr = CI->getPointerOperand(); |
932 | 6 | Type *PT = PointerType::get(NewTy, |
933 | 6 | Addr->getType()->getPointerAddressSpace()); |
934 | 6 | Value *NewAddr = Builder.CreateBitCast(Addr, PT); |
935 | 6 | |
936 | 6 | Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy); |
937 | 6 | Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy); |
938 | 6 | |
939 | 6 | |
940 | 6 | auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal, |
941 | 6 | CI->getSuccessOrdering(), |
942 | 6 | CI->getFailureOrdering(), |
943 | 6 | CI->getSyncScopeID()); |
944 | 6 | NewCI->setVolatile(CI->isVolatile()); |
945 | 6 | NewCI->setWeak(CI->isWeak()); |
946 | 6 | DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n"); |
947 | 6 | |
948 | 6 | Value *OldVal = Builder.CreateExtractValue(NewCI, 0); |
949 | 6 | Value *Succ = Builder.CreateExtractValue(NewCI, 1); |
950 | 6 | |
951 | 6 | OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType()); |
952 | 6 | |
953 | 6 | Value *Res = UndefValue::get(CI->getType()); |
954 | 6 | Res = Builder.CreateInsertValue(Res, OldVal, 0); |
955 | 6 | Res = Builder.CreateInsertValue(Res, Succ, 1); |
956 | 6 | |
957 | 6 | CI->replaceAllUsesWith(Res); |
958 | 6 | CI->eraseFromParent(); |
959 | 6 | return NewCI; |
960 | 6 | } |
961 | | |
962 | 8.96k | bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { |
963 | 8.96k | AtomicOrdering SuccessOrder = CI->getSuccessOrdering(); |
964 | 8.96k | AtomicOrdering FailureOrder = CI->getFailureOrdering(); |
965 | 8.96k | Value *Addr = CI->getPointerOperand(); |
966 | 8.96k | BasicBlock *BB = CI->getParent(); |
967 | 8.96k | Function *F = BB->getParent(); |
968 | 8.96k | LLVMContext &Ctx = F->getContext(); |
969 | 8.96k | // If shouldInsertFencesForAtomic() returns true, then the target does not |
970 | 8.96k | // want to deal with memory orders, and emitLeading/TrailingFence should take |
971 | 8.96k | // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we |
972 | 8.96k | // should preserve the ordering. |
973 | 8.96k | bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI); |
974 | 8.96k | AtomicOrdering MemOpOrder = |
975 | 8.96k | ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic444 : SuccessOrder8.52k ; |
976 | 8.96k | |
977 | 8.96k | // In implementations which use a barrier to achieve release semantics, we can |
978 | 8.96k | // delay emitting this barrier until we know a store is actually going to be |
979 | 8.96k | // attempted. The cost of this delay is that we need 2 copies of the block |
980 | 8.96k | // emitting the load-linked, affecting code size. |
981 | 8.96k | // |
982 | 8.96k | // Ideally, this logic would be unconditional except for the minsize check |
983 | 8.96k | // since in other cases the extra blocks naturally collapse down to the |
984 | 8.96k | // minimal loop. Unfortunately, this puts too much stress on later |
985 | 8.96k | // optimisations so we avoid emitting the extra logic in those cases too. |
986 | 8.95k | bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic && |
987 | 438 | SuccessOrder != AtomicOrdering::Monotonic && |
988 | 390 | SuccessOrder != AtomicOrdering::Acquire && |
989 | 303 | !F->optForMinSize(); |
990 | 8.96k | |
991 | 8.96k | // There's no overhead for sinking the release barrier in a weak cmpxchg, so |
992 | 8.96k | // do it even on minsize. |
993 | 2 | bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak(); |
994 | 8.96k | |
995 | 8.96k | // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord |
996 | 8.96k | // |
997 | 8.96k | // The full expansion we produce is: |
998 | 8.96k | // [...] |
999 | 8.96k | // cmpxchg.start: |
1000 | 8.96k | // %unreleasedload = @load.linked(%addr) |
1001 | 8.96k | // %should_store = icmp eq %unreleasedload, %desired |
1002 | 8.96k | // br i1 %should_store, label %cmpxchg.fencedstore, |
1003 | 8.96k | // label %cmpxchg.nostore |
1004 | 8.96k | // cmpxchg.releasingstore: |
1005 | 8.96k | // fence? |
1006 | 8.96k | // br label cmpxchg.trystore |
1007 | 8.96k | // cmpxchg.trystore: |
1008 | 8.96k | // %loaded.trystore = phi [%unreleasedload, %releasingstore], |
1009 | 8.96k | // [%releasedload, %cmpxchg.releasedload] |
1010 | 8.96k | // %stored = @store_conditional(%new, %addr) |
1011 | 8.96k | // %success = icmp eq i32 %stored, 0 |
1012 | 8.96k | // br i1 %success, label %cmpxchg.success, |
1013 | 8.96k | // label %cmpxchg.releasedload/%cmpxchg.failure |
1014 | 8.96k | // cmpxchg.releasedload: |
1015 | 8.96k | // %releasedload = @load.linked(%addr) |
1016 | 8.96k | // %should_store = icmp eq %releasedload, %desired |
1017 | 8.96k | // br i1 %should_store, label %cmpxchg.trystore, |
1018 | 8.96k | // label %cmpxchg.failure |
1019 | 8.96k | // cmpxchg.success: |
1020 | 8.96k | // fence? |
1021 | 8.96k | // br label %cmpxchg.end |
1022 | 8.96k | // cmpxchg.nostore: |
1023 | 8.96k | // %loaded.nostore = phi [%unreleasedload, %cmpxchg.start], |
1024 | 8.96k | // [%releasedload, |
1025 | 8.96k | // %cmpxchg.releasedload/%cmpxchg.trystore] |
1026 | 8.96k | // @load_linked_fail_balance()? |
1027 | 8.96k | // br label %cmpxchg.failure |
1028 | 8.96k | // cmpxchg.failure: |
1029 | 8.96k | // fence? |
1030 | 8.96k | // br label %cmpxchg.end |
1031 | 8.96k | // cmpxchg.end: |
1032 | 8.96k | // %loaded = phi [%loaded.nostore, %cmpxchg.failure], |
1033 | 8.96k | // [%loaded.trystore, %cmpxchg.trystore] |
1034 | 8.96k | // %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure] |
1035 | 8.96k | // %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0 |
1036 | 8.96k | // %res = insertvalue { iN, i1 } %restmp, i1 %success, 1 |
1037 | 8.96k | // [...] |
1038 | 8.96k | BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end"); |
1039 | 8.96k | auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB); |
1040 | 8.96k | auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB); |
1041 | 8.96k | auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB); |
1042 | 8.96k | auto ReleasedLoadBB = |
1043 | 8.96k | BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB); |
1044 | 8.96k | auto TryStoreBB = |
1045 | 8.96k | BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB); |
1046 | 8.96k | auto ReleasingStoreBB = |
1047 | 8.96k | BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB); |
1048 | 8.96k | auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB); |
1049 | 8.96k | |
1050 | 8.96k | // This grabs the DebugLoc from CI |
1051 | 8.96k | IRBuilder<> Builder(CI); |
1052 | 8.96k | |
1053 | 8.96k | // The split call above "helpfully" added a branch at the end of BB (to the |
1054 | 8.96k | // wrong place), but we might want a fence too. It's easiest to just remove |
1055 | 8.96k | // the branch entirely. |
1056 | 8.96k | std::prev(BB->end())->eraseFromParent(); |
1057 | 8.96k | Builder.SetInsertPoint(BB); |
1058 | 8.96k | if (ShouldInsertFencesForAtomic && 8.96k UseUnconditionalReleaseBarrier444 ) |
1059 | 1 | TLI->emitLeadingFence(Builder, CI, SuccessOrder); |
1060 | 8.96k | Builder.CreateBr(StartBB); |
1061 | 8.96k | |
1062 | 8.96k | // Start the main loop block now that we've taken care of the preliminaries. |
1063 | 8.96k | Builder.SetInsertPoint(StartBB); |
1064 | 8.96k | Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); |
1065 | 8.96k | Value *ShouldStore = Builder.CreateICmpEQ( |
1066 | 8.96k | UnreleasedLoad, CI->getCompareOperand(), "should_store"); |
1067 | 8.96k | |
1068 | 8.96k | // If the cmpxchg doesn't actually need any ordering when it fails, we can |
1069 | 8.96k | // jump straight past that fence instruction (if it exists). |
1070 | 8.96k | Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB); |
1071 | 8.96k | |
1072 | 8.96k | Builder.SetInsertPoint(ReleasingStoreBB); |
1073 | 8.96k | if (ShouldInsertFencesForAtomic && 8.96k !UseUnconditionalReleaseBarrier444 ) |
1074 | 443 | TLI->emitLeadingFence(Builder, CI, SuccessOrder); |
1075 | 8.96k | Builder.CreateBr(TryStoreBB); |
1076 | 8.96k | |
1077 | 8.96k | Builder.SetInsertPoint(TryStoreBB); |
1078 | 8.96k | Value *StoreSuccess = TLI->emitStoreConditional( |
1079 | 8.96k | Builder, CI->getNewValOperand(), Addr, MemOpOrder); |
1080 | 8.96k | StoreSuccess = Builder.CreateICmpEQ( |
1081 | 8.96k | StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); |
1082 | 8.96k | BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB302 : StartBB8.66k ; |
1083 | 8.96k | Builder.CreateCondBr(StoreSuccess, SuccessBB, |
1084 | 8.96k | CI->isWeak() ? FailureBB7 : RetryBB8.95k ); |
1085 | 8.96k | |
1086 | 8.96k | Builder.SetInsertPoint(ReleasedLoadBB); |
1087 | 8.96k | Value *SecondLoad; |
1088 | 8.96k | if (HasReleasedLoadBB8.96k ) { |
1089 | 302 | SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); |
1090 | 302 | ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(), |
1091 | 302 | "should_store"); |
1092 | 302 | |
1093 | 302 | // If the cmpxchg doesn't actually need any ordering when it fails, we can |
1094 | 302 | // jump straight past that fence instruction (if it exists). |
1095 | 302 | Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB); |
1096 | 302 | } else |
1097 | 8.66k | Builder.CreateUnreachable(); |
1098 | 8.96k | |
1099 | 8.96k | // Make sure later instructions don't get reordered with a fence if |
1100 | 8.96k | // necessary. |
1101 | 8.96k | Builder.SetInsertPoint(SuccessBB); |
1102 | 8.96k | if (ShouldInsertFencesForAtomic) |
1103 | 444 | TLI->emitTrailingFence(Builder, CI, SuccessOrder); |
1104 | 8.96k | Builder.CreateBr(ExitBB); |
1105 | 8.96k | |
1106 | 8.96k | Builder.SetInsertPoint(NoStoreBB); |
1107 | 8.96k | // In the failing case, where we don't execute the store-conditional, the |
1108 | 8.96k | // target might want to balance out the load-linked with a dedicated |
1109 | 8.96k | // instruction (e.g., on ARM, clearing the exclusive monitor). |
1110 | 8.96k | TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder); |
1111 | 8.96k | Builder.CreateBr(FailureBB); |
1112 | 8.96k | |
1113 | 8.96k | Builder.SetInsertPoint(FailureBB); |
1114 | 8.96k | if (ShouldInsertFencesForAtomic) |
1115 | 444 | TLI->emitTrailingFence(Builder, CI, FailureOrder); |
1116 | 8.96k | Builder.CreateBr(ExitBB); |
1117 | 8.96k | |
1118 | 8.96k | // Finally, we have control-flow based knowledge of whether the cmpxchg |
1119 | 8.96k | // succeeded or not. We expose this to later passes by converting any |
1120 | 8.96k | // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate |
1121 | 8.96k | // PHI. |
1122 | 8.96k | Builder.SetInsertPoint(ExitBB, ExitBB->begin()); |
1123 | 8.96k | PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2); |
1124 | 8.96k | Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB); |
1125 | 8.96k | Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB); |
1126 | 8.96k | |
1127 | 8.96k | // Setup the builder so we can create any PHIs we need. |
1128 | 8.96k | Value *Loaded; |
1129 | 8.96k | if (!HasReleasedLoadBB) |
1130 | 8.66k | Loaded = UnreleasedLoad; |
1131 | 302 | else { |
1132 | 302 | Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin()); |
1133 | 302 | PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2); |
1134 | 302 | TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB); |
1135 | 302 | TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB); |
1136 | 302 | |
1137 | 302 | Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin()); |
1138 | 302 | PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2); |
1139 | 302 | NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB); |
1140 | 302 | NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB); |
1141 | 302 | |
1142 | 302 | Builder.SetInsertPoint(ExitBB, ++ExitBB->begin()); |
1143 | 302 | PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2); |
1144 | 302 | ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB); |
1145 | 302 | ExitLoaded->addIncoming(NoStoreLoaded, FailureBB); |
1146 | 302 | |
1147 | 302 | Loaded = ExitLoaded; |
1148 | 302 | } |
1149 | 8.96k | |
1150 | 8.96k | // Look for extractvalue users of the cmpxchg and replace them with the
1151 | 8.96k | // CFG-derived 'Loaded' value or 'Success' flag, as appropriate.
1152 | 8.96k | SmallVector<ExtractValueInst *, 2> PrunedInsts; |
1153 | 9.47k | for (auto User : CI->users()) { |
1154 | 9.47k | ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User); |
1155 | 9.47k | if (!EV) |
1156 | 8 | continue; |
1157 | 9.46k | |
1158 | 9.47k | assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 && |
1159 | 9.46k | "weird extraction from { iN, i1 }"); |
1160 | 9.46k | |
1161 | 9.46k | if (EV->getIndices()[0] == 0) |
1162 | 8.90k | EV->replaceAllUsesWith(Loaded); |
1163 | 9.46k | else |
1164 | 567 | EV->replaceAllUsesWith(Success); |
1165 | 9.47k | |
1166 | 9.47k | PrunedInsts.push_back(EV); |
1167 | 9.47k | } |
1168 | 8.96k | |
1169 | 8.96k | // We can remove the instructions now we're no longer iterating through them. |
1170 | 8.96k | for (auto EV : PrunedInsts) |
1171 | 9.46k | EV->eraseFromParent(); |
1172 | 8.96k | |
1173 | 8.96k | if (!CI->use_empty()) {
1174 | 8 | // Some use of the full struct return that we don't understand has happened, |
1175 | 8 | // so we've got to reconstruct it properly. |
1176 | 8 | Value *Res; |
1177 | 8 | Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0); |
1178 | 8 | Res = Builder.CreateInsertValue(Res, Success, 1); |
1179 | 8 | |
1180 | 8 | CI->replaceAllUsesWith(Res); |
1181 | 8 | } |
1182 | 8.96k | |
1183 | 8.96k | CI->eraseFromParent(); |
1184 | 8.96k | return true; |
1185 | 8.96k | } |
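
A rough sketch of the control flow built above, for a strong cmpxchg when no separate released-load block is needed (HasReleasedLoadBB == false). The names of the blocks created outside this excerpt are assumed, and the load-linked/store-conditional operations stand in for the target's intrinsics:

  // cmpxchg.start:
  //   %unreleased = <load-linked> %addr
  //   %should_store = icmp eq %unreleased, %cmp
  //   br i1 %should_store, label %cmpxchg.fencedstore, label %cmpxchg.nostore
  // cmpxchg.fencedstore:                 ; leading fence here, if required
  //   br label %cmpxchg.trystore
  // cmpxchg.trystore:
  //   %stcond = <store-conditional> %new, %addr
  //   %success = icmp eq i32 %stcond, 0
  //   br i1 %success, label %cmpxchg.success, label %cmpxchg.start
  // cmpxchg.success:                     ; trailing fence here, if required
  //   br label %cmpxchg.end
  // cmpxchg.nostore:                     ; LL/SC balance hook runs here
  //   br label %cmpxchg.failure
  // cmpxchg.failure:                     ; trailing fence here, if required
  //   br label %cmpxchg.end
  // cmpxchg.end:
  //   %success.phi = phi i1 [ true, %cmpxchg.success ], [ false, %cmpxchg.failure ]
  //   ; in this simple case 'Loaded' is just %unreleased; extractvalue users of
  //   ; the original cmpxchg are rewritten against %unreleased and %success.phi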
1186 | | |
1187 | 42.2k | bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) { |
1188 | 42.2k | auto C = dyn_cast<ConstantInt>(RMWI->getValOperand()); |
1189 | 42.2k | if(!C) |
1190 | 21.4k | return false; |
1191 | 20.8k | |
1192 | 20.8k | AtomicRMWInst::BinOp Op = RMWI->getOperation(); |
1193 | 20.8k | switch(Op) { |
1194 | 8.80k | case AtomicRMWInst::Add: |
1195 | 8.80k | case AtomicRMWInst::Sub: |
1196 | 8.80k | case AtomicRMWInst::Or: |
1197 | 8.80k | case AtomicRMWInst::Xor: |
1198 | 8.80k | return C->isZero(); |
1199 | 71 | case AtomicRMWInst::And: |
1200 | 71 | return C->isMinusOne(); |
1201 | 8.80k | // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/... |
1202 | 11.9k | default: |
1203 | 11.9k | return false; |
1204 | 0 | } |
1205 | 0 | } |
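
For illustration, a few cases for the predicate above (IR sketches, not from this file): an RMW is idempotent when the stored value is guaranteed to equal the loaded value, so only the ordering effect remains:

  //   atomicrmw add  i32* %p, i32 0  seq_cst   ; x + 0  == x  -> idempotent
  //   atomicrmw or   i32* %p, i32 0  seq_cst   ; x | 0  == x  -> idempotent
  //   atomicrmw and  i32* %p, i32 -1 seq_cst   ; x & -1 == x  -> idempotent
  //   atomicrmw xchg i32* %p, i32 0  seq_cst   ; stores 0 unconditionally -> not idempotent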
1206 | | |
1207 | 14 | bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) { |
1208 | 14 | if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
1209 | 11 | tryExpandAtomicLoad(ResultingLoad); |
1210 | 11 | return true; |
1211 | 11 | } |
1212 | 3 | return false; |
1213 | 3 | } |
1214 | | |
1215 | | Value *AtomicExpand::insertRMWCmpXchgLoop( |
1216 | | IRBuilder<> &Builder, Type *ResultTy, Value *Addr, |
1217 | | AtomicOrdering MemOpOrder, |
1218 | | function_ref<Value *(IRBuilder<> &, Value *)> PerformOp, |
1219 | 1.10k | CreateCmpXchgInstFun CreateCmpXchg) { |
1220 | 1.10k | LLVMContext &Ctx = Builder.getContext(); |
1221 | 1.10k | BasicBlock *BB = Builder.GetInsertBlock(); |
1222 | 1.10k | Function *F = BB->getParent(); |
1223 | 1.10k | |
1224 | 1.10k | // Given: atomicrmw some_op iN* %addr, iN %incr ordering |
1225 | 1.10k | // |
1226 | 1.10k | // The standard expansion we produce is: |
1227 | 1.10k | // [...] |
1228 | 1.10k | // %init_loaded = load atomic iN* %addr |
1229 | 1.10k | // br label %loop |
1230 | 1.10k | // loop: |
1231 | 1.10k | // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] |
1232 | 1.10k | // %new = some_op iN %loaded, %incr |
1233 | 1.10k | // %pair = cmpxchg iN* %addr, iN %loaded, iN %new |
1234 | 1.10k | // %new_loaded = extractvalue { iN, i1 } %pair, 0 |
1235 | 1.10k | // %success = extractvalue { iN, i1 } %pair, 1 |
1236 | 1.10k | // br i1 %success, label %atomicrmw.end, label %loop |
1237 | 1.10k | // atomicrmw.end: |
1238 | 1.10k | // [...] |
1239 | 1.10k | BasicBlock *ExitBB = |
1240 | 1.10k | BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); |
1241 | 1.10k | BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); |
1242 | 1.10k | |
1243 | 1.10k | // The split call above "helpfully" added a branch at the end of BB (to the |
1244 | 1.10k | // wrong place), but we want a load. It's easiest to just remove |
1245 | 1.10k | // the branch entirely. |
1246 | 1.10k | std::prev(BB->end())->eraseFromParent(); |
1247 | 1.10k | Builder.SetInsertPoint(BB); |
1248 | 1.10k | LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr); |
1249 | 1.10k | // Atomics require at least natural alignment. |
1250 | 1.10k | InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8); |
1251 | 1.10k | Builder.CreateBr(LoopBB); |
1252 | 1.10k | |
1253 | 1.10k | // Start the main loop block now that we've taken care of the preliminaries. |
1254 | 1.10k | Builder.SetInsertPoint(LoopBB); |
1255 | 1.10k | PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded"); |
1256 | 1.10k | Loaded->addIncoming(InitLoaded, BB); |
1257 | 1.10k | |
1258 | 1.10k | Value *NewVal = PerformOp(Builder, Loaded); |
1259 | 1.10k | |
1260 | 1.10k | Value *NewLoaded = nullptr; |
1261 | 1.10k | Value *Success = nullptr; |
1262 | 1.10k | |
1263 | 1.10k | CreateCmpXchg(Builder, Addr, Loaded, NewVal, |
1264 | 1.10k | MemOpOrder == AtomicOrdering::Unordered |
1265 | 1 | ? AtomicOrdering::Monotonic |
1266 | 1.10k | : MemOpOrder, |
1267 | 1.10k | Success, NewLoaded); |
1268 | 1.10k | assert(Success && NewLoaded); |
1269 | 1.10k | |
1270 | 1.10k | Loaded->addIncoming(NewLoaded, LoopBB); |
1271 | 1.10k | |
1272 | 1.10k | Builder.CreateCondBr(Success, ExitBB, LoopBB); |
1273 | 1.10k | |
1274 | 1.10k | Builder.SetInsertPoint(ExitBB, ExitBB->begin()); |
1275 | 1.10k | return NewLoaded; |
1276 | 1.10k | } |
1277 | | |
1278 | | // Note: This function is exposed externally by AtomicExpandUtils.h |
1279 | | bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, |
1280 | 1.09k | CreateCmpXchgInstFun CreateCmpXchg) { |
1281 | 1.09k | IRBuilder<> Builder(AI); |
1282 | 1.09k | Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop( |
1283 | 1.09k | Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(), |
1284 | 1.09k | [&](IRBuilder<> &Builder, Value *Loaded) { |
1285 | 1.09k | return performAtomicOp(AI->getOperation(), Builder, Loaded, |
1286 | 1.09k | AI->getValOperand()); |
1287 | 1.09k | }, |
1288 | 1.09k | CreateCmpXchg); |
1289 | 1.09k | |
1290 | 1.09k | AI->replaceAllUsesWith(Loaded); |
1291 | 1.09k | AI->eraseFromParent(); |
1292 | 1.09k | return true; |
1293 | 1.09k | } |
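
Since this is the entry point exposed through AtomicExpandUtils.h, a minimal sketch of the kind of CreateCmpXchgInstFun callback a caller might supply is shown below. It mirrors the lambda used later in expandAtomicRMWToLibcall, but emits a plain cmpxchg instruction rather than a libcall; the helper name is hypothetical:

  static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
                                   Value *Loaded, Value *NewVal,
                                   AtomicOrdering MemOpOrder, Value *&Success,
                                   Value *&NewLoaded) {
    // Materialize a strong cmpxchg with the strongest legal failure ordering.
    AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
        Addr, Loaded, NewVal, MemOpOrder,
        AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
    // Unpack the { value, success } pair for the caller's retry loop.
    Success = Builder.CreateExtractValue(Pair, 1, "success");
    NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
  }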
1294 | | |
1295 | | // In order to use one of the sized library calls such as |
1296 | | // __atomic_fetch_add_4, the alignment must be sufficient, the size |
1297 | | // must be one of the potentially-specialized sizes, and the value |
1298 | | // type must actually exist in C on the target (otherwise, the |
1299 | | // function wouldn't actually be defined.) |
1300 | | static bool canUseSizedAtomicCall(unsigned Size, unsigned Align, |
1301 | 17 | const DataLayout &DL) { |
1302 | 17 | // TODO: "LargestSize" is an approximation for "largest type that |
1303 | 17 | // you can express in C". It seems to be the case that int128 is |
1304 | 17 | // supported on all 64-bit platforms, otherwise only up to 64-bit |
1305 | 17 | // integers are supported. If we get this wrong, then we'll try to |
1306 | 17 | // call a sized libcall that doesn't actually exist. There should |
1307 | 17 | // really be some more reliable way in LLVM of determining integer |
1308 | 17 | // sizes which are valid in the target's C ABI... |
1309 | 17 | unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
1310 | 17 | return Align >= Size &&
1311 | 15 | (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
1312 | 15 | Size <= LargestSize; |
1313 | 17 | } |
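
As a concrete illustration of the predicate above (assuming a target whose largest legal integer type is at least 64 bits, so LargestSize is 8):

  //   Size = 4,  Align = 4   -> true   (a sized call such as __atomic_fetch_add_4 is usable)
  //   Size = 4,  Align = 2   -> false  (underaligned: fall back to the generic libcall)
  //   Size = 3,  Align = 4   -> false  (not one of the specialized sizes)
  //   Size = 16, Align = 16  -> false  (larger than LargestSize on this target)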
1314 | | |
1315 | 4 | void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) { |
1316 | 4 | static const RTLIB::Libcall Libcalls[6] = { |
1317 | 4 | RTLIB::ATOMIC_LOAD, RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2, |
1318 | 4 | RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16}; |
1319 | 4 | unsigned Size = getAtomicOpSize(I); |
1320 | 4 | unsigned Align = getAtomicOpAlign(I); |
1321 | 4 | |
1322 | 4 | bool expanded = expandAtomicOpToLibcall( |
1323 | 4 | I, Size, Align, I->getPointerOperand(), nullptr, nullptr, |
1324 | 4 | I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls); |
1325 | 4 | (void)expanded; |
1326 | 4 | assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Load");
1327 | 4 | } |
1328 | | |
1329 | 5 | void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) { |
1330 | 5 | static const RTLIB::Libcall Libcalls[6] = { |
1331 | 5 | RTLIB::ATOMIC_STORE, RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2, |
1332 | 5 | RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16}; |
1333 | 5 | unsigned Size = getAtomicOpSize(I); |
1334 | 5 | unsigned Align = getAtomicOpAlign(I); |
1335 | 5 | |
1336 | 5 | bool expanded = expandAtomicOpToLibcall( |
1337 | 5 | I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr, |
1338 | 5 | I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls); |
1339 | 5 | (void)expanded; |
1340 | 5 | assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Store");
1341 | 5 | } |
1342 | | |
1343 | 4 | void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) { |
1344 | 4 | static const RTLIB::Libcall Libcalls[6] = { |
1345 | 4 | RTLIB::ATOMIC_COMPARE_EXCHANGE, RTLIB::ATOMIC_COMPARE_EXCHANGE_1, |
1346 | 4 | RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4, |
1347 | 4 | RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16}; |
1348 | 4 | unsigned Size = getAtomicOpSize(I); |
1349 | 4 | unsigned Align = getAtomicOpAlign(I); |
1350 | 4 | |
1351 | 4 | bool expanded = expandAtomicOpToLibcall( |
1352 | 4 | I, Size, Align, I->getPointerOperand(), I->getNewValOperand(), |
1353 | 4 | I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(), |
1354 | 4 | Libcalls); |
1355 | 4 | (void)expanded; |
1356 | 4 | assert(expanded && "expandAtomicOpToLibcall shouldn't fail for CAS");
1357 | 4 | } |
1358 | | |
1359 | 4 | static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) { |
1360 | 4 | static const RTLIB::Libcall LibcallsXchg[6] = { |
1361 | 4 | RTLIB::ATOMIC_EXCHANGE, RTLIB::ATOMIC_EXCHANGE_1, |
1362 | 4 | RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4, |
1363 | 4 | RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16}; |
1364 | 4 | static const RTLIB::Libcall LibcallsAdd[6] = { |
1365 | 4 | RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_ADD_1, |
1366 | 4 | RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4, |
1367 | 4 | RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16}; |
1368 | 4 | static const RTLIB::Libcall LibcallsSub[6] = { |
1369 | 4 | RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_SUB_1, |
1370 | 4 | RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4, |
1371 | 4 | RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16}; |
1372 | 4 | static const RTLIB::Libcall LibcallsAnd[6] = { |
1373 | 4 | RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_AND_1, |
1374 | 4 | RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4, |
1375 | 4 | RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16}; |
1376 | 4 | static const RTLIB::Libcall LibcallsOr[6] = { |
1377 | 4 | RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_OR_1, |
1378 | 4 | RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4, |
1379 | 4 | RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16}; |
1380 | 4 | static const RTLIB::Libcall LibcallsXor[6] = { |
1381 | 4 | RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_XOR_1, |
1382 | 4 | RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4, |
1383 | 4 | RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16}; |
1384 | 4 | static const RTLIB::Libcall LibcallsNand[6] = { |
1385 | 4 | RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_NAND_1, |
1386 | 4 | RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4, |
1387 | 4 | RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16}; |
1388 | 4 | |
1389 | 4 | switch (Op) { |
1390 | 0 | case AtomicRMWInst::BAD_BINOP: |
1391 | 0 | llvm_unreachable("Should not have BAD_BINOP."); |
1392 | 2 | case AtomicRMWInst::Xchg: |
1393 | 2 | return makeArrayRef(LibcallsXchg); |
1394 | 2 | case AtomicRMWInst::Add: |
1395 | 2 | return makeArrayRef(LibcallsAdd); |
1396 | 0 | case AtomicRMWInst::Sub: |
1397 | 0 | return makeArrayRef(LibcallsSub); |
1398 | 0 | case AtomicRMWInst::And: |
1399 | 0 | return makeArrayRef(LibcallsAnd); |
1400 | 0 | case AtomicRMWInst::Or: |
1401 | 0 | return makeArrayRef(LibcallsOr); |
1402 | 0 | case AtomicRMWInst::Xor: |
1403 | 0 | return makeArrayRef(LibcallsXor); |
1404 | 0 | case AtomicRMWInst::Nand: |
1405 | 0 | return makeArrayRef(LibcallsNand); |
1406 | 0 | case AtomicRMWInst::Max: |
1407 | 0 | case AtomicRMWInst::Min: |
1408 | 0 | case AtomicRMWInst::UMax: |
1409 | 0 | case AtomicRMWInst::UMin: |
1410 | 0 | // No atomic libcalls are available for max/min/umax/umin. |
1411 | 0 | return {}; |
1412 | 0 | } |
1413 | 0 | llvm_unreachable("Unexpected AtomicRMW operation.");
1414 | 0 | } |
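
The tables returned here are indexed the same way expandAtomicOpToLibcall consumes them below: slot 0 is the generic call (UNKNOWN_LIBCALL when none exists) and slots 1..5 are the 1/2/4/8/16-byte specializations. For example:

  //   GetRMWLibcall(AtomicRMWInst::Add)[0] == RTLIB::UNKNOWN_LIBCALL    ; no generic fetch_add
  //   GetRMWLibcall(AtomicRMWInst::Add)[3] == RTLIB::ATOMIC_FETCH_ADD_4 ; 4-byte specialization
  //   GetRMWLibcall(AtomicRMWInst::Max) is empty -> the caller falls back to a CAS loop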
1415 | | |
1416 | 4 | void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) { |
1417 | 4 | ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation()); |
1418 | 4 | |
1419 | 4 | unsigned Size = getAtomicOpSize(I); |
1420 | 4 | unsigned Align = getAtomicOpAlign(I); |
1421 | 4 | |
1422 | 4 | bool Success = false; |
1423 | 4 | if (!Libcalls.empty()) |
1424 | 4 | Success = expandAtomicOpToLibcall( |
1425 | 4 | I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr, |
1426 | 4 | I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls); |
1427 | 4 | |
1428 | 4 | // The expansion failed: either there were no libcalls at all for |
1429 | 4 | // the operation (min/max), or there were only size-specialized |
1430 | 4 | // libcalls (add/sub/etc) and we needed a generic. So, expand to a |
1431 | 4 | // CAS libcall, via a CAS loop, instead. |
1432 | 4 | if (!Success) {
1433 | 1 | expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr, |
1434 | 1 | Value *Loaded, Value *NewVal, |
1435 | 1 | AtomicOrdering MemOpOrder, |
1436 | 1 | Value *&Success, Value *&NewLoaded) { |
1437 | 1 | // Create the CAS instruction normally... |
1438 | 1 | AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg( |
1439 | 1 | Addr, Loaded, NewVal, MemOpOrder, |
1440 | 1 | AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); |
1441 | 1 | Success = Builder.CreateExtractValue(Pair, 1, "success"); |
1442 | 1 | NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); |
1443 | 1 | |
1444 | 1 | // ...and then expand the CAS into a libcall. |
1445 | 1 | expandAtomicCASToLibcall(Pair); |
1446 | 1 | }); |
1447 | 1 | } |
1448 | 4 | } |
1449 | | |
1450 | | // A helper routine for the above expandAtomic*ToLibcall functions. |
1451 | | // |
1452 | | // 'Libcalls' contains an array of enum values for the particular |
1453 | | // ATOMIC libcalls to be emitted. All of the other arguments besides |
1454 | | // 'I' are extracted from the Instruction subclass by the |
1455 | | // caller. Depending on the particular call, some will be null. |
1456 | | bool AtomicExpand::expandAtomicOpToLibcall( |
1457 | | Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand, |
1458 | | Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering, |
1459 | 17 | AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) { |
1460 | 17 | assert(Libcalls.size() == 6); |
1461 | 17 | |
1462 | 17 | LLVMContext &Ctx = I->getContext(); |
1463 | 17 | Module *M = I->getModule(); |
1464 | 17 | const DataLayout &DL = M->getDataLayout(); |
1465 | 17 | IRBuilder<> Builder(I); |
1466 | 17 | IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front()); |
1467 | 17 | |
1468 | 17 | bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL); |
1469 | 17 | Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8); |
1470 | 17 | |
1471 | 17 | unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy); |
1472 | 17 | |
1473 | 17 | // TODO: the "order" argument type is "int", not int32. So |
1474 | 17 | // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints. |
1475 | 17 | ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size); |
1476 | 17 | assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO"); |
1477 | 17 | Constant *OrderingVal = |
1478 | 17 | ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering)); |
1479 | 17 | Constant *Ordering2Val = nullptr; |
1480 | 4 | if (CASExpected) {
1481 | 4 | assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO"); |
1482 | 4 | Ordering2Val = |
1483 | 4 | ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2)); |
1484 | 4 | } |
1485 | 17 | bool HasResult = I->getType() != Type::getVoidTy(Ctx); |
1486 | 17 | |
1487 | 17 | RTLIB::Libcall RTLibType; |
1488 | 17 | if (UseSizedLibcall) {
1489 | 8 | switch (Size) { |
1490 | 0 | case 1: RTLibType = Libcalls[1]; break; |
1491 | 5 | case 2: RTLibType = Libcalls[2]; break; |
1492 | 1 | case 4: RTLibType = Libcalls[3]; break; |
1493 | 2 | case 8: RTLibType = Libcalls[4]; break; |
1494 | 0 | case 16: RTLibType = Libcalls[5]; break; |
1495 | 17 | } |
1496 | 9 | } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
1497 | 8 | RTLibType = Libcalls[0]; |
1498 | 9 | } else { |
1499 | 1 | // Can't use sized function, and there's no generic for this |
1500 | 1 | // operation, so give up. |
1501 | 1 | return false; |
1502 | 1 | } |
1503 | 16 | |
1504 | 16 | // Build up the function call. There are two kinds. First, the sized
1505 | 16 | // variants. These calls are going to be one of the following (with |
1506 | 16 | // N=1,2,4,8,16): |
1507 | 16 | // iN __atomic_load_N(iN *ptr, int ordering) |
1508 | 16 | // void __atomic_store_N(iN *ptr, iN val, int ordering) |
1509 | 16 | // iN __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering) |
1510 | 16 | // bool __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired, |
1511 | 16 | // int success_order, int failure_order) |
1512 | 16 | // |
1513 | 16 | // Note that these functions can be used for non-integer atomic |
1514 | 16 | // operations; the values just need to be bitcast to integers on the
1515 | 16 | // way in and out. |
1516 | 16 | // |
1517 | 16 | // And, then, the generic variants. They look like the following: |
1518 | 16 | // void __atomic_load(size_t size, void *ptr, void *ret, int ordering) |
1519 | 16 | // void __atomic_store(size_t size, void *ptr, void *val, int ordering) |
1520 | 16 | // void __atomic_exchange(size_t size, void *ptr, void *val, void *ret, |
1521 | 16 | // int ordering) |
1522 | 16 | // bool __atomic_compare_exchange(size_t size, void *ptr, void *expected, |
1523 | 16 | // void *desired, int success_order, |
1524 | 16 | // int failure_order) |
1525 | 16 | // |
1526 | 16 | // The different signatures are built up depending on the |
1527 | 16 | // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult' |
1528 | 16 | // variables. |
1529 | 16 | |
1530 | 16 | AllocaInst *AllocaCASExpected = nullptr; |
1531 | 16 | Value *AllocaCASExpected_i8 = nullptr; |
1532 | 16 | AllocaInst *AllocaValue = nullptr; |
1533 | 16 | Value *AllocaValue_i8 = nullptr; |
1534 | 16 | AllocaInst *AllocaResult = nullptr; |
1535 | 16 | Value *AllocaResult_i8 = nullptr; |
1536 | 16 | |
1537 | 16 | Type *ResultTy; |
1538 | 16 | SmallVector<Value *, 6> Args; |
1539 | 16 | AttributeList Attr; |
1540 | 16 | |
1541 | 16 | // 'size' argument. |
1542 | 16 | if (!UseSizedLibcall) {
1543 | 8 | // Note, getIntPtrType is assumed equivalent to size_t. |
1544 | 8 | Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size)); |
1545 | 8 | } |
1546 | 16 | |
1547 | 16 | // 'ptr' argument. |
1548 | 16 | Value *PtrVal = |
1549 | 16 | Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx)); |
1550 | 16 | Args.push_back(PtrVal); |
1551 | 16 | |
1552 | 16 | // 'expected' argument, if present. |
1553 | 16 | if (CASExpected) {
1554 | 4 | AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType()); |
1555 | 4 | AllocaCASExpected->setAlignment(AllocaAlignment); |
1556 | 4 | AllocaCASExpected_i8 = |
1557 | 4 | Builder.CreateBitCast(AllocaCASExpected, Type::getInt8PtrTy(Ctx)); |
1558 | 4 | Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64); |
1559 | 4 | Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment); |
1560 | 4 | Args.push_back(AllocaCASExpected_i8); |
1561 | 4 | } |
1562 | 16 | |
1563 | 16 | // 'val' argument ('desired' for cas), if present. |
1564 | 16 | if (ValueOperand) {
1565 | 12 | if (UseSizedLibcall) {
1566 | 6 | Value *IntValue = |
1567 | 6 | Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy); |
1568 | 6 | Args.push_back(IntValue); |
1569 | 12 | } else { |
1570 | 6 | AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType()); |
1571 | 6 | AllocaValue->setAlignment(AllocaAlignment); |
1572 | 6 | AllocaValue_i8 = |
1573 | 6 | Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx)); |
1574 | 6 | Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64); |
1575 | 6 | Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment); |
1576 | 6 | Args.push_back(AllocaValue_i8); |
1577 | 6 | } |
1578 | 12 | } |
1579 | 16 | |
1580 | 16 | // 'ret' argument. |
1581 | 16 | if (!CASExpected && HasResult && !UseSizedLibcall) {
1582 | 3 | AllocaResult = AllocaBuilder.CreateAlloca(I->getType()); |
1583 | 3 | AllocaResult->setAlignment(AllocaAlignment); |
1584 | 3 | AllocaResult_i8 = |
1585 | 3 | Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx)); |
1586 | 3 | Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64); |
1587 | 3 | Args.push_back(AllocaResult_i8); |
1588 | 3 | } |
1589 | 16 | |
1590 | 16 | // 'ordering' ('success_order' for cas) argument. |
1591 | 16 | Args.push_back(OrderingVal); |
1592 | 16 | |
1593 | 16 | // 'failure_order' argument, if present. |
1594 | 16 | if (Ordering2Val) |
1595 | 4 | Args.push_back(Ordering2Val); |
1596 | 16 | |
1597 | 16 | // Now, the return type. |
1598 | 16 | if (CASExpected) {
1599 | 4 | ResultTy = Type::getInt1Ty(Ctx); |
1600 | 4 | Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt); |
1601 | 16 | } else if (HasResult && UseSizedLibcall)
1602 | 4 | ResultTy = SizedIntTy; |
1603 | 12 | else |
1604 | 8 | ResultTy = Type::getVoidTy(Ctx); |
1605 | 16 | |
1606 | 16 | // Done with setting up arguments and return types, create the call: |
1607 | 16 | SmallVector<Type *, 6> ArgTys; |
1608 | 16 | for (Value *Arg : Args) |
1609 | 63 | ArgTys.push_back(Arg->getType()); |
1610 | 16 | FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false); |
1611 | 16 | Constant *LibcallFn = |
1612 | 16 | M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr); |
1613 | 16 | CallInst *Call = Builder.CreateCall(LibcallFn, Args); |
1614 | 16 | Call->setAttributes(Attr); |
1615 | 16 | Value *Result = Call; |
1616 | 16 | |
1617 | 16 | // And then, extract the results... |
1618 | 16 | if (ValueOperand && !UseSizedLibcall)
1619 | 6 | Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64); |
1620 | 16 | |
1621 | 16 | if (CASExpected) {
1622 | 4 | // The final result from the CAS is {load of 'expected' alloca, bool result |
1623 | 4 | // from call} |
1624 | 4 | Type *FinalResultTy = I->getType(); |
1625 | 4 | Value *V = UndefValue::get(FinalResultTy); |
1626 | 4 | Value *ExpectedOut = |
1627 | 4 | Builder.CreateAlignedLoad(AllocaCASExpected, AllocaAlignment); |
1628 | 4 | Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64); |
1629 | 4 | V = Builder.CreateInsertValue(V, ExpectedOut, 0); |
1630 | 4 | V = Builder.CreateInsertValue(V, Result, 1); |
1631 | 4 | I->replaceAllUsesWith(V); |
1632 | 16 | } else if (HasResult) {
1633 | 7 | Value *V; |
1634 | 7 | if (UseSizedLibcall) |
1635 | 4 | V = Builder.CreateBitOrPointerCast(Result, I->getType()); |
1636 | 3 | else { |
1637 | 3 | V = Builder.CreateAlignedLoad(AllocaResult, AllocaAlignment); |
1638 | 3 | Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64); |
1639 | 3 | } |
1640 | 12 | I->replaceAllUsesWith(V); |
1641 | 12 | } |
1642 | 17 | I->eraseFromParent(); |
1643 | 17 | return true; |
1644 | 17 | } |
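
Putting the pieces together, a rough sketch of the IR this routine emits for an i128 cmpxchg that cannot use a sized call, assuming a 64-bit target where size_t is i64; the exact alignments, lifetime-intrinsic spellings, and ordering constants depend on the target and LLVM version:

  //   %expected = alloca i128                      ; in the entry block
  //   %desired  = alloca i128
  //   ...
  //   %ptr.i8      = bitcast i128* %addr to i8*
  //   %expected.i8 = bitcast i128* %expected to i8*
  //   store i128 %cmp, i128* %expected             ; plus lifetime.start markers
  //   %desired.i8  = bitcast i128* %desired to i8*
  //   store i128 %new, i128* %desired
  //   %ok = call zeroext i1 @__atomic_compare_exchange(i64 16, i8* %ptr.i8,
  //                 i8* %expected.i8, i8* %desired.i8, i32 5, i32 5)  ; 5 == seq_cst
  //   %old = load i128, i128* %expected            ; plus lifetime.end markers
  //   %r0 = insertvalue { i128, i1 } undef, i128 %old, 0
  //   %r1 = insertvalue { i128, i1 } %r0, i1 %ok, 1 ; replaces the original cmpxchg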