/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // This file implements a loop-aware load elimination pass. |
11 | | // |
12 | | // It uses LoopAccessAnalysis to identify loop-carried dependences with a |
13 | | // distance of one between stores and loads. These form the candidates for the |
14 | | // transformation. The source value of each store is then propagated to the user |
15 | | // of the corresponding load. This makes the load dead. |
16 | | // |
17 | | // The pass can also version the loop and add memchecks in order to prove that |
18 | | // may-aliasing stores can't change the value in memory before it's read by the |
19 | | // load. |
20 | | // |
21 | | //===----------------------------------------------------------------------===// |
22 | | |
23 | | #include "llvm/Transforms/Scalar/LoopLoadElimination.h" |
24 | | #include "llvm/ADT/APInt.h" |
25 | | #include "llvm/ADT/DenseMap.h" |
26 | | #include "llvm/ADT/DepthFirstIterator.h" |
27 | | #include "llvm/ADT/STLExtras.h" |
28 | | #include "llvm/ADT/SmallSet.h" |
29 | | #include "llvm/ADT/SmallVector.h" |
30 | | #include "llvm/ADT/Statistic.h" |
31 | | #include "llvm/Analysis/GlobalsModRef.h" |
32 | | #include "llvm/Analysis/LoopAccessAnalysis.h" |
33 | | #include "llvm/Analysis/LoopInfo.h" |
34 | | #include "llvm/Analysis/ScalarEvolution.h" |
35 | | #include "llvm/Analysis/ScalarEvolutionExpander.h" |
36 | | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
37 | | #include "llvm/IR/DataLayout.h" |
38 | | #include "llvm/IR/Dominators.h" |
39 | | #include "llvm/IR/Instructions.h" |
40 | | #include "llvm/IR/Module.h" |
41 | | #include "llvm/IR/Type.h" |
42 | | #include "llvm/IR/Value.h" |
43 | | #include "llvm/Pass.h" |
44 | | #include "llvm/Support/Casting.h" |
45 | | #include "llvm/Support/CommandLine.h" |
46 | | #include "llvm/Support/Debug.h" |
47 | | #include "llvm/Transforms/Scalar.h" |
48 | | #include "llvm/Transforms/Utils/LoopVersioning.h" |
49 | | #include <algorithm> |
50 | | #include <cassert> |
51 | | #include <forward_list> |
52 | | #include <set> |
53 | | #include <tuple> |
54 | | #include <utility> |
55 | | |
56 | | #define LLE_OPTION "loop-load-elim" |
57 | | #define DEBUG_TYPE LLE_OPTION |
58 | | |
59 | | using namespace llvm; |
60 | | |
61 | | static cl::opt<unsigned> CheckPerElim( |
62 | | "runtime-check-per-loop-load-elim", cl::Hidden, |
63 | | cl::desc("Max number of memchecks allowed per eliminated load on average"), |
64 | | cl::init(1)); |
65 | | |
66 | | static cl::opt<unsigned> LoadElimSCEVCheckThreshold( |
67 | | "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden, |
68 | | cl::desc("The maximum number of SCEV checks allowed for Loop " |
69 | | "Load Elimination")); |
70 | | |
71 | | STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE"); |
72 | | |
73 | | namespace { |
74 | | |
75 | | /// \brief Represent a store-to-forwarding candidate. |
76 | | struct StoreToLoadForwardingCandidate { |
77 | | LoadInst *Load; |
78 | | StoreInst *Store; |
79 | | |
80 | | StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store) |
81 | 1.43k | : Load(Load), Store(Store) {} |
82 | | |
83 | | /// \brief Return true if the dependence from the store to the load has a |
84 | | /// distance of one. E.g. A[i+1] = A[i] |
85 | | bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, |
86 | 643 | Loop *L) const { |
87 | 643 | Value *LoadPtr = Load->getPointerOperand(); |
88 | 643 | Value *StorePtr = Store->getPointerOperand(); |
89 | 643 | Type *LoadPtrType = LoadPtr->getType(); |
90 | 643 | Type *LoadType = LoadPtrType->getPointerElementType(); |
91 | 643 | |
92 | 643 | assert(LoadPtrType->getPointerAddressSpace() == |
93 | 643 | StorePtr->getType()->getPointerAddressSpace() && |
94 | 643 | LoadType == StorePtr->getType()->getPointerElementType() && |
95 | 643 | "Should be a known dependence"); |
96 | 643 | |
97 | 643 | // Currently we only support accesses with unit stride. FIXME: we should be |
98 | 643 | // able to handle non unit stirde as well as long as the stride is equal to |
99 | 643 | // the dependence distance. |
100 | 643 | if (getPtrStride(PSE, LoadPtr, L) != 1 || |
101 | 506 | getPtrStride(PSE, StorePtr, L) != 1) |
102 | 137 | return false; |
103 | 506 | |
104 | 506 | auto &DL = Load->getParent()->getModule()->getDataLayout(); |
105 | 506 | unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType)); |
106 | 506 | |
107 | 506 | auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr)); |
108 | 506 | auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr)); |
109 | 506 | |
110 | 506 | // We don't need to check non-wrapping here because forward/backward |
111 | 506 | // dependence wouldn't be valid if these weren't monotonic accesses. |
112 | 506 | auto *Dist = cast<SCEVConstant>( |
113 | 506 | PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV)); |
114 | 506 | const APInt &Val = Dist->getAPInt(); |
115 | 506 | return Val == TypeByteSize; |
116 | 506 | } |
117 | | |
118 | 38 | Value *getLoadPtr() const { return Load->getPointerOperand(); } |
119 | | |
120 | | #ifndef NDEBUG |
121 | | friend raw_ostream &operator<<(raw_ostream &OS, |
122 | | const StoreToLoadForwardingCandidate &Cand) { |
123 | | OS << *Cand.Store << " -->\n"; |
124 | | OS.indent(2) << *Cand.Load << "\n"; |
125 | | return OS; |
126 | | } |
127 | | #endif |
128 | | }; |
129 | | |
130 | | /// \brief Check if the store dominates all latches, so as long as there is no |
131 | | /// intervening store this value will be loaded in the next iteration. |
132 | | bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, |
133 | 540 | DominatorTree *DT) { |
134 | 540 | SmallVector<BasicBlock *, 8> Latches; |
135 | 540 | L->getLoopLatches(Latches); |
136 | 540 | return llvm::all_of(Latches, [&](const BasicBlock *Latch) { |
137 | 540 | return DT->dominates(StoreBlock, Latch); |
138 | 540 | }); |
139 | 540 | } |
140 | | |
141 | | /// \brief Return true if the load is not executed on all paths in the loop. |
142 | 470 | static bool isLoadConditional(LoadInst *Load, Loop *L) { |
143 | 470 | return Load->getParent() != L->getHeader(); |
144 | 470 | } |
145 | | |
146 | | /// \brief The per-loop class that does most of the work. |
147 | | class LoadEliminationForLoop { |
148 | | public: |
149 | | LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI, |
150 | | DominatorTree *DT) |
151 | 276k | : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {} |
152 | | |
153 | | /// \brief Look through the loop-carried and loop-independent dependences in |
154 | | /// this loop and find store->load dependences. |
155 | | /// |
156 | | /// Note that no candidate is returned if LAA has failed to analyze the loop |
157 | | /// (e.g. if it's not bottom-tested, contains volatile memops, etc.) |
158 | | std::forward_list<StoreToLoadForwardingCandidate> |
159 | 276k | findStoreToLoadDependences(const LoopAccessInfo &LAI) { |
160 | 276k | std::forward_list<StoreToLoadForwardingCandidate> Candidates; |
161 | 276k | |
162 | 276k | const auto *Deps = LAI.getDepChecker().getDependences(); |
163 | 276k | if (!Deps) |
164 | 270 | return Candidates; |
165 | 275k | |
166 | 275k | // Find store->load dependences (consequently true dep). Both lexically |
167 | 275k | // forward and backward dependences qualify. Disqualify loads that have |
168 | 275k | // other unknown dependences. |
169 | 275k | |
170 | 275k | SmallSet<Instruction *, 4> LoadsWithUnknownDepedence; |
171 | 275k | |
172 | 12.8k | for (const auto &Dep : *Deps) { |
173 | 12.8k | Instruction *Source = Dep.getSource(LAI); |
174 | 12.8k | Instruction *Destination = Dep.getDestination(LAI); |
175 | 12.8k | |
176 | 12.8k | if (Dep.Type == MemoryDepChecker::Dependence::Unknown12.8k ) { |
177 | 3.69k | if (isa<LoadInst>(Source)) |
178 | 1.87k | LoadsWithUnknownDepedence.insert(Source); |
179 | 3.69k | if (isa<LoadInst>(Destination)) |
180 | 627 | LoadsWithUnknownDepedence.insert(Destination); |
181 | 3.69k | continue; |
182 | 3.69k | } |
183 | 9.20k | |
184 | 9.20k | if (9.20k Dep.isBackward()9.20k ) |
185 | 9.20k | // Note that the designations source and destination follow the program |
186 | 9.20k | // order, i.e. source is always first. (The direction is given by the |
187 | 9.20k | // DepType.) |
188 | 3.51k | std::swap(Source, Destination); |
189 | 9.20k | else |
190 | 9.20k | assert(Dep.isForward() && "Needs to be a forward dependence"); |
191 | 9.20k | |
192 | 9.20k | auto *Store = dyn_cast<StoreInst>(Source); |
193 | 9.20k | if (!Store) |
194 | 4.93k | continue; |
195 | 4.26k | auto *Load = dyn_cast<LoadInst>(Destination); |
196 | 4.26k | if (!Load) |
197 | 2.82k | continue; |
198 | 1.43k | |
199 | 1.43k | // Only progagate the value if they are of the same type. |
200 | 1.43k | if (1.43k Store->getPointerOperandType() != Load->getPointerOperandType()1.43k ) |
201 | 5 | continue; |
202 | 1.43k | |
203 | 1.43k | Candidates.emplace_front(Load, Store); |
204 | 1.43k | } |
205 | 275k | |
206 | 275k | if (!LoadsWithUnknownDepedence.empty()) |
207 | 464 | Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) 464 { |
208 | 77 | return LoadsWithUnknownDepedence.count(C.Load); |
209 | 464 | }); |
210 | 276k | |
211 | 276k | return Candidates; |
212 | 276k | } |
213 | | |
214 | | /// \brief Return the index of the instruction according to program order. |
215 | 86 | unsigned getInstrIndex(Instruction *Inst) { |
216 | 86 | auto I = InstOrder.find(Inst); |
217 | 86 | assert(I != InstOrder.end() && "No index for instruction"); |
218 | 86 | return I->second; |
219 | 86 | } |
220 | | |
221 | | /// \brief If a load has multiple candidates associated (i.e. different |
222 | | /// stores), it means that it could be forwarding from multiple stores |
223 | | /// depending on control flow. Remove these candidates. |
224 | | /// |
225 | | /// Here, we rely on LAA to include the relevant loop-independent dependences. |
226 | | /// LAA is known to omit these in the very simple case when the read and the |
227 | | /// write within an alias set always takes place using the *same* pointer. |
228 | | /// |
229 | | /// However, we know that this is not the case here, i.e. we can rely on LAA |
230 | | /// to provide us with loop-independent dependences for the cases we're |
231 | | /// interested. Consider the case for example where a loop-independent |
232 | | /// dependece S1->S2 invalidates the forwarding S3->S2. |
233 | | /// |
234 | | /// A[i] = ... (S1) |
235 | | /// ... = A[i] (S2) |
236 | | /// A[i+1] = ... (S3) |
237 | | /// |
238 | | /// LAA will perform dependence analysis here because there are two |
239 | | /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]). |
240 | | void removeDependencesFromMultipleStores( |
241 | 409 | std::forward_list<StoreToLoadForwardingCandidate> &Candidates) { |
242 | 409 | // If Store is nullptr it means that we have multiple stores forwarding to |
243 | 409 | // this store. |
244 | 409 | typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *> |
245 | 409 | LoadToSingleCandT; |
246 | 409 | LoadToSingleCandT LoadToSingleCand; |
247 | 409 | |
248 | 1.42k | for (const auto &Cand : Candidates) { |
249 | 1.42k | bool NewElt; |
250 | 1.42k | LoadToSingleCandT::iterator Iter; |
251 | 1.42k | |
252 | 1.42k | std::tie(Iter, NewElt) = |
253 | 1.42k | LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand)); |
254 | 1.42k | if (!NewElt1.42k ) { |
255 | 654 | const StoreToLoadForwardingCandidate *&OtherCand = Iter->second; |
256 | 654 | // Already multiple stores forward to this load. |
257 | 654 | if (OtherCand == nullptr) |
258 | 418 | continue; |
259 | 236 | |
260 | 236 | // Handle the very basic case when the two stores are in the same block |
261 | 236 | // so deciding which one forwards is easy. The later one forwards as |
262 | 236 | // long as they both have a dependence distance of one to the load. |
263 | 236 | if (236 Cand.Store->getParent() == OtherCand->Store->getParent() && |
264 | 188 | Cand.isDependenceDistanceOfOne(PSE, L) && |
265 | 236 | OtherCand->isDependenceDistanceOfOne(PSE, L)2 ) { |
266 | 1 | // They are in the same block, the later one will forward to the load. |
267 | 1 | if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) |
268 | 0 | OtherCand = &Cand; |
269 | 1 | } else |
270 | 235 | OtherCand = nullptr; |
271 | 654 | } |
272 | 1.42k | } |
273 | 409 | |
274 | 1.42k | Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) { |
275 | 1.42k | if (LoadToSingleCand[Cand.Load] != &Cand1.42k ) { |
276 | 889 | DEBUG(dbgs() << "Removing from candidates: \n" << Cand |
277 | 889 | << " The load may have multiple stores forwarding to " |
278 | 889 | << "it\n"); |
279 | 889 | return true; |
280 | 889 | } |
281 | 540 | return false; |
282 | 540 | }); |
283 | 409 | } |
284 | | |
285 | | /// \brief Given two pointers operations by their RuntimePointerChecking |
286 | | /// indices, return true if they require an alias check. |
287 | | /// |
288 | | /// We need a check if one is a pointer for a candidate load and the other is |
289 | | /// a pointer for a possibly intervening store. |
290 | | bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2, |
291 | | const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath, |
292 | 64 | const std::set<Value *> &CandLoadPtrs) { |
293 | 64 | Value *Ptr1 = |
294 | 64 | LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue; |
295 | 64 | Value *Ptr2 = |
296 | 64 | LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue; |
297 | 22 | return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) || |
298 | 64 | (PtrsWrittenOnFwdingPath.count(Ptr2) && 64 CandLoadPtrs.count(Ptr1)13 )); |
299 | 64 | } |
300 | | |
301 | | /// \brief Return pointers that are possibly written to on the path from a |
302 | | /// forwarding store to a load. |
303 | | /// |
304 | | /// These pointers need to be alias-checked against the forwarding candidates. |
305 | | SmallSet<Value *, 4> findPointersWrittenOnForwardingPath( |
306 | 34 | const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) { |
307 | 34 | // From FirstStore to LastLoad neither of the elimination candidate loads |
308 | 34 | // should overlap with any of the stores. |
309 | 34 | // |
310 | 34 | // E.g.: |
311 | 34 | // |
312 | 34 | // st1 C[i] |
313 | 34 | // ld1 B[i] <-------, |
314 | 34 | // ld0 A[i] <----, | * LastLoad |
315 | 34 | // ... | | |
316 | 34 | // st2 E[i] | | |
317 | 34 | // st3 B[i+1] -- | -' * FirstStore |
318 | 34 | // st0 A[i+1] ---' |
319 | 34 | // st4 D[i] |
320 | 34 | // |
321 | 34 | // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with |
322 | 34 | // ld0. |
323 | 34 | |
324 | 34 | LoadInst *LastLoad = |
325 | 34 | std::max_element(Candidates.begin(), Candidates.end(), |
326 | 34 | [&](const StoreToLoadForwardingCandidate &A, |
327 | 4 | const StoreToLoadForwardingCandidate &B) { |
328 | 4 | return getInstrIndex(A.Load) < getInstrIndex(B.Load); |
329 | 4 | }) |
330 | 34 | ->Load; |
331 | 34 | StoreInst *FirstStore = |
332 | 34 | std::min_element(Candidates.begin(), Candidates.end(), |
333 | 34 | [&](const StoreToLoadForwardingCandidate &A, |
334 | 4 | const StoreToLoadForwardingCandidate &B) { |
335 | 4 | return getInstrIndex(A.Store) < |
336 | 4 | getInstrIndex(B.Store); |
337 | 4 | }) |
338 | 34 | ->Store; |
339 | 34 | |
340 | 34 | // We're looking for stores after the first forwarding store until the end |
341 | 34 | // of the loop, then from the beginning of the loop until the last |
342 | 34 | // forwarded-to load. Collect the pointer for the stores. |
343 | 34 | SmallSet<Value *, 4> PtrsWrittenOnFwdingPath; |
344 | 34 | |
345 | 54 | auto InsertStorePtr = [&](Instruction *I) { |
346 | 54 | if (auto *S = dyn_cast<StoreInst>(I)) |
347 | 22 | PtrsWrittenOnFwdingPath.insert(S->getPointerOperand()); |
348 | 54 | }; |
349 | 34 | const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions(); |
350 | 34 | std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1, |
351 | 34 | MemInstrs.end(), InsertStorePtr); |
352 | 34 | std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)], |
353 | 34 | InsertStorePtr); |
354 | 34 | |
355 | 34 | return PtrsWrittenOnFwdingPath; |
356 | 34 | } |
357 | | |
358 | | /// \brief Determine the pointer alias checks to prove that there are no |
359 | | /// intervening stores. |
360 | | SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks( |
361 | 34 | const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) { |
362 | 34 | |
363 | 34 | SmallSet<Value *, 4> PtrsWrittenOnFwdingPath = |
364 | 34 | findPointersWrittenOnForwardingPath(Candidates); |
365 | 34 | |
366 | 34 | // Collect the pointers of the candidate loads. |
367 | 34 | // FIXME: SmallSet does not work with std::inserter. |
368 | 34 | std::set<Value *> CandLoadPtrs; |
369 | 34 | transform(Candidates, |
370 | 34 | std::inserter(CandLoadPtrs, CandLoadPtrs.begin()), |
371 | 34 | std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr)); |
372 | 34 | |
373 | 34 | const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks(); |
374 | 34 | SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; |
375 | 34 | |
376 | 34 | copy_if(AllChecks, std::back_inserter(Checks), |
377 | 38 | [&](const RuntimePointerChecking::PointerCheck &Check) { |
378 | 38 | for (auto PtrIdx1 : Check.first->Members) |
379 | 60 | for (auto PtrIdx2 : Check.second->Members) |
380 | 64 | if (64 needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath, |
381 | 64 | CandLoadPtrs)) |
382 | 11 | return true; |
383 | 27 | return false; |
384 | 27 | }); |
385 | 34 | |
386 | 34 | DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n"); |
387 | 34 | DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); |
388 | 34 | |
389 | 34 | return Checks; |
390 | 34 | } |
391 | | |
392 | | /// \brief Perform the transformation for a candidate. |
393 | | void |
394 | | propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand, |
395 | 35 | SCEVExpander &SEE) { |
396 | 35 | // |
397 | 35 | // loop: |
398 | 35 | // %x = load %gep_i |
399 | 35 | // = ... %x |
400 | 35 | // store %y, %gep_i_plus_1 |
401 | 35 | // |
402 | 35 | // => |
403 | 35 | // |
404 | 35 | // ph: |
405 | 35 | // %x.initial = load %gep_0 |
406 | 35 | // loop: |
407 | 35 | // %x.storeforward = phi [%x.initial, %ph] [%y, %loop] |
408 | 35 | // %x = load %gep_i <---- now dead |
409 | 35 | // = ... %x.storeforward |
410 | 35 | // store %y, %gep_i_plus_1 |
411 | 35 | |
412 | 35 | Value *Ptr = Cand.Load->getPointerOperand(); |
413 | 35 | auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr)); |
414 | 35 | auto *PH = L->getLoopPreheader(); |
415 | 35 | Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(), |
416 | 35 | PH->getTerminator()); |
417 | 35 | Value *Initial = |
418 | 35 | new LoadInst(InitialPtr, "load_initial", /* isVolatile */ false, |
419 | 35 | Cand.Load->getAlignment(), PH->getTerminator()); |
420 | 35 | |
421 | 35 | PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded", |
422 | 35 | &L->getHeader()->front()); |
423 | 35 | PHI->addIncoming(Initial, PH); |
424 | 35 | PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch()); |
425 | 35 | |
426 | 35 | Cand.Load->replaceAllUsesWith(PHI); |
427 | 35 | } |
428 | | |
429 | | /// \brief Top-level driver for each loop: find store->load forwarding |
430 | | /// candidates, add run-time checks and perform transformation. |
431 | 276k | bool processLoop() { |
432 | 276k | DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName() |
433 | 276k | << "\" checking " << *L << "\n"); |
434 | 276k | // Look for store-to-load forwarding cases across the |
435 | 276k | // backedge. E.g.: |
436 | 276k | // |
437 | 276k | // loop: |
438 | 276k | // %x = load %gep_i |
439 | 276k | // = ... %x |
440 | 276k | // store %y, %gep_i_plus_1 |
441 | 276k | // |
442 | 276k | // => |
443 | 276k | // |
444 | 276k | // ph: |
445 | 276k | // %x.initial = load %gep_0 |
446 | 276k | // loop: |
447 | 276k | // %x.storeforward = phi [%x.initial, %ph] [%y, %loop] |
448 | 276k | // %x = load %gep_i <---- now dead |
449 | 276k | // = ... %x.storeforward |
450 | 276k | // store %y, %gep_i_plus_1 |
451 | 276k | |
452 | 276k | // First start with store->load dependences. |
453 | 276k | auto StoreToLoadDependences = findStoreToLoadDependences(LAI); |
454 | 276k | if (StoreToLoadDependences.empty()) |
455 | 275k | return false; |
456 | 409 | |
457 | 409 | // Generate an index for each load and store according to the original |
458 | 409 | // program order. This will be used later. |
459 | 409 | InstOrder = LAI.getDepChecker().generateInstructionOrderMap(); |
460 | 409 | |
461 | 409 | // To keep things simple for now, remove those where the load is potentially |
462 | 409 | // fed by multiple stores. |
463 | 409 | removeDependencesFromMultipleStores(StoreToLoadDependences); |
464 | 409 | if (StoreToLoadDependences.empty()) |
465 | 9 | return false; |
466 | 400 | |
467 | 400 | // Filter the candidates further. |
468 | 400 | SmallVector<StoreToLoadForwardingCandidate, 4> Candidates; |
469 | 400 | unsigned NumForwarding = 0; |
470 | 540 | for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) { |
471 | 540 | DEBUG(dbgs() << "Candidate " << Cand); |
472 | 540 | |
473 | 540 | // Make sure that the stored values is available everywhere in the loop in |
474 | 540 | // the next iteration. |
475 | 540 | if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT)) |
476 | 70 | continue; |
477 | 470 | |
478 | 470 | // If the load is conditional we can't hoist its 0-iteration instance to |
479 | 470 | // the preheader because that would make it unconditional. Thus we would |
480 | 470 | // access a memory location that the original loop did not access. |
481 | 470 | if (470 isLoadConditional(Cand.Load, L)470 ) |
482 | 17 | continue; |
483 | 453 | |
484 | 453 | // Check whether the SCEV difference is the same as the induction step, |
485 | 453 | // thus we load the value in the next iteration. |
486 | 453 | if (453 !Cand.isDependenceDistanceOfOne(PSE, L)453 ) |
487 | 415 | continue; |
488 | 38 | |
489 | 38 | ++NumForwarding; |
490 | 38 | DEBUG(dbgs() |
491 | 540 | << NumForwarding |
492 | 540 | << ". Valid store-to-load forwarding across the loop backedge\n"); |
493 | 540 | Candidates.push_back(Cand); |
494 | 540 | } |
495 | 400 | if (Candidates.empty()) |
496 | 366 | return false; |
497 | 34 | |
498 | 34 | // Check intervening may-alias stores. These need runtime checks for alias |
499 | 34 | // disambiguation. |
500 | 34 | SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks = |
501 | 34 | collectMemchecks(Candidates); |
502 | 34 | |
503 | 34 | // Too many checks are likely to outweigh the benefits of forwarding. |
504 | 34 | if (Checks.size() > Candidates.size() * CheckPerElim34 ) { |
505 | 1 | DEBUG(dbgs() << "Too many run-time checks needed.\n"); |
506 | 1 | return false; |
507 | 1 | } |
508 | 33 | |
509 | 33 | if (33 LAI.getPSE().getUnionPredicate().getComplexity() > |
510 | 33 | LoadElimSCEVCheckThreshold) { |
511 | 1 | DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); |
512 | 1 | return false; |
513 | 1 | } |
514 | 32 | |
515 | 32 | if (32 !Checks.empty() || 32 !LAI.getPSE().getUnionPredicate().isAlwaysTrue()24 ) { |
516 | 11 | if (L->getHeader()->getParent()->optForSize()11 ) { |
517 | 1 | DEBUG(dbgs() << "Versioning is needed but not allowed when optimizing " |
518 | 1 | "for size.\n"); |
519 | 1 | return false; |
520 | 1 | } |
521 | 10 | |
522 | 10 | if (10 !L->isLoopSimplifyForm()10 ) { |
523 | 0 | DEBUG(dbgs() << "Loop is not is loop-simplify form"); |
524 | 0 | return false; |
525 | 0 | } |
526 | 10 | |
527 | 10 | // Point of no-return, start the transformation. First, version the loop |
528 | 10 | // if necessary. |
529 | 10 | |
530 | 10 | LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false); |
531 | 10 | LV.setAliasChecks(std::move(Checks)); |
532 | 10 | LV.setSCEVChecks(LAI.getPSE().getUnionPredicate()); |
533 | 10 | LV.versionLoop(); |
534 | 10 | } |
535 | 32 | |
536 | 32 | // Next, propagate the value stored by the store to the users of the load. |
537 | 32 | // Also for the first iteration, generate the initial value of the load. |
538 | 31 | SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(), |
539 | 31 | "storeforward"); |
540 | 31 | for (const auto &Cand : Candidates) |
541 | 35 | propagateStoredValueToLoadUsers(Cand, SEE); |
542 | 31 | NumLoopLoadEliminted += NumForwarding; |
543 | 31 | |
544 | 31 | return true; |
545 | 276k | } |
546 | | |
547 | | private: |
548 | | Loop *L; |
549 | | |
550 | | /// \brief Maps the load/store instructions to their index according to |
551 | | /// program order. |
552 | | DenseMap<Instruction *, unsigned> InstOrder; |
553 | | |
554 | | // Analyses used. |
555 | | LoopInfo *LI; |
556 | | const LoopAccessInfo &LAI; |
557 | | DominatorTree *DT; |
558 | | PredicatedScalarEvolution PSE; |
559 | | }; |
560 | | |
561 | | static bool |
562 | | eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT, |
563 | 462k | function_ref<const LoopAccessInfo &(Loop &)> GetLAI) { |
564 | 462k | // Build up a worklist of inner-loops to transform to avoid iterator |
565 | 462k | // invalidation. |
566 | 462k | // FIXME: This logic comes from other passes that actually change the loop |
567 | 462k | // nest structure. It isn't clear this is necessary (or useful) for a pass |
568 | 462k | // which merely optimizes the use of loads in a loop. |
569 | 462k | SmallVector<Loop *, 8> Worklist; |
570 | 462k | |
571 | 462k | for (Loop *TopLevelLoop : LI) |
572 | 237k | for (Loop *L : depth_first(TopLevelLoop)) |
573 | 237k | // We only handle inner-most loops. |
574 | 333k | if (333k L->empty()333k ) |
575 | 276k | Worklist.push_back(L); |
576 | 462k | |
577 | 462k | // Now walk the identified inner loops. |
578 | 462k | bool Changed = false; |
579 | 276k | for (Loop *L : Worklist) { |
580 | 276k | // The actual work is performed by LoadEliminationForLoop. |
581 | 276k | LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT); |
582 | 276k | Changed |= LEL.processLoop(); |
583 | 276k | } |
584 | 462k | return Changed; |
585 | 462k | } |
586 | | |
587 | | /// \brief The pass. Most of the work is delegated to the per-loop |
588 | | /// LoadEliminationForLoop class. |
589 | | class LoopLoadElimination : public FunctionPass { |
590 | | public: |
591 | 17.2k | LoopLoadElimination() : FunctionPass(ID) { |
592 | 17.2k | initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry()); |
593 | 17.2k | } |
594 | | |
595 | 462k | bool runOnFunction(Function &F) override { |
596 | 462k | if (skipFunction(F)) |
597 | 75 | return false; |
598 | 462k | |
599 | 462k | auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
600 | 462k | auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>(); |
601 | 462k | auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
602 | 462k | |
603 | 462k | // Process each loop nest in the function. |
604 | 462k | return eliminateLoadsAcrossLoops( |
605 | 462k | F, LI, DT, |
606 | 276k | [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); }); |
607 | 462k | } |
608 | | |
609 | 17.2k | void getAnalysisUsage(AnalysisUsage &AU) const override { |
610 | 17.2k | AU.addRequiredID(LoopSimplifyID); |
611 | 17.2k | AU.addRequired<LoopInfoWrapperPass>(); |
612 | 17.2k | AU.addPreserved<LoopInfoWrapperPass>(); |
613 | 17.2k | AU.addRequired<LoopAccessLegacyAnalysis>(); |
614 | 17.2k | AU.addRequired<ScalarEvolutionWrapperPass>(); |
615 | 17.2k | AU.addRequired<DominatorTreeWrapperPass>(); |
616 | 17.2k | AU.addPreserved<DominatorTreeWrapperPass>(); |
617 | 17.2k | AU.addPreserved<GlobalsAAWrapperPass>(); |
618 | 17.2k | } |
619 | | |
620 | | static char ID; |
621 | | }; |
622 | | |
623 | | } // end anonymous namespace |
624 | | |
char LoopLoadElimination::ID;
static const char LLE_name[] = "Loop Load Elimination";

// Register the legacy pass and declare the analyses it depends on so the
// legacy pass manager schedules them first.
INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
635 | | |
636 | | namespace llvm { |
637 | | |
/// \brief Factory for the legacy pass-manager wrapper; the caller owns the
/// returned pass.
FunctionPass *createLoopLoadEliminationPass() {
  return new LoopLoadElimination();
}
641 | | |
642 | | PreservedAnalyses LoopLoadEliminationPass::run(Function &F, |
643 | 73 | FunctionAnalysisManager &AM) { |
644 | 73 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); |
645 | 73 | auto &LI = AM.getResult<LoopAnalysis>(F); |
646 | 73 | auto &TTI = AM.getResult<TargetIRAnalysis>(F); |
647 | 73 | auto &DT = AM.getResult<DominatorTreeAnalysis>(F); |
648 | 73 | auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); |
649 | 73 | auto &AA = AM.getResult<AAManager>(F); |
650 | 73 | auto &AC = AM.getResult<AssumptionAnalysis>(F); |
651 | 73 | |
652 | 73 | auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); |
653 | 73 | bool Changed = eliminateLoadsAcrossLoops( |
654 | 26 | F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & { |
655 | 26 | LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI}; |
656 | 26 | return LAM.getResult<LoopAccessAnalysis>(L, AR); |
657 | 26 | }); |
658 | 73 | |
659 | 73 | if (!Changed) |
660 | 71 | return PreservedAnalyses::all(); |
661 | 2 | |
662 | 2 | PreservedAnalyses PA; |
663 | 2 | return PA; |
664 | 2 | } |
665 | | |
666 | | } // end namespace llvm |