/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/Transforms/Scalar/JumpThreading.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- JumpThreading.cpp - Thread control through conditional blocks ------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // This file implements the Jump Threading pass. |
11 | | // |
12 | | //===----------------------------------------------------------------------===// |
13 | | |
14 | | #include "llvm/Transforms/Scalar/JumpThreading.h" |
15 | | #include "llvm/ADT/DenseMap.h" |
16 | | #include "llvm/ADT/DenseSet.h" |
17 | | #include "llvm/ADT/Optional.h" |
18 | | #include "llvm/ADT/STLExtras.h" |
19 | | #include "llvm/ADT/SmallPtrSet.h" |
20 | | #include "llvm/ADT/SmallVector.h" |
21 | | #include "llvm/ADT/Statistic.h" |
22 | | #include "llvm/Analysis/AliasAnalysis.h" |
23 | | #include "llvm/Analysis/BlockFrequencyInfo.h" |
24 | | #include "llvm/Analysis/BranchProbabilityInfo.h" |
25 | | #include "llvm/Analysis/CFG.h" |
26 | | #include "llvm/Analysis/ConstantFolding.h" |
27 | | #include "llvm/Analysis/GlobalsModRef.h" |
28 | | #include "llvm/Analysis/InstructionSimplify.h" |
29 | | #include "llvm/Analysis/LazyValueInfo.h" |
30 | | #include "llvm/Analysis/Loads.h" |
31 | | #include "llvm/Analysis/LoopInfo.h" |
32 | | #include "llvm/Analysis/TargetLibraryInfo.h" |
33 | | #include "llvm/Analysis/ValueTracking.h" |
34 | | #include "llvm/IR/BasicBlock.h" |
35 | | #include "llvm/IR/CFG.h" |
36 | | #include "llvm/IR/Constant.h" |
37 | | #include "llvm/IR/ConstantRange.h" |
38 | | #include "llvm/IR/Constants.h" |
39 | | #include "llvm/IR/DataLayout.h" |
40 | | #include "llvm/IR/Dominators.h" |
41 | | #include "llvm/IR/Function.h" |
42 | | #include "llvm/IR/InstrTypes.h" |
43 | | #include "llvm/IR/Instruction.h" |
44 | | #include "llvm/IR/Instructions.h" |
45 | | #include "llvm/IR/IntrinsicInst.h" |
46 | | #include "llvm/IR/Intrinsics.h" |
47 | | #include "llvm/IR/LLVMContext.h" |
48 | | #include "llvm/IR/MDBuilder.h" |
49 | | #include "llvm/IR/Metadata.h" |
50 | | #include "llvm/IR/Module.h" |
51 | | #include "llvm/IR/PassManager.h" |
52 | | #include "llvm/IR/PatternMatch.h" |
53 | | #include "llvm/IR/Type.h" |
54 | | #include "llvm/IR/Use.h" |
55 | | #include "llvm/IR/User.h" |
56 | | #include "llvm/IR/Value.h" |
57 | | #include "llvm/Pass.h" |
58 | | #include "llvm/Support/BlockFrequency.h" |
59 | | #include "llvm/Support/BranchProbability.h" |
60 | | #include "llvm/Support/Casting.h" |
61 | | #include "llvm/Support/CommandLine.h" |
62 | | #include "llvm/Support/Debug.h" |
63 | | #include "llvm/Support/raw_ostream.h" |
64 | | #include "llvm/Transforms/Scalar.h" |
65 | | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
66 | | #include "llvm/Transforms/Utils/Cloning.h" |
67 | | #include "llvm/Transforms/Utils/Local.h" |
68 | | #include "llvm/Transforms/Utils/SSAUpdater.h" |
69 | | #include "llvm/Transforms/Utils/ValueMapper.h" |
70 | | #include <algorithm> |
71 | | #include <cassert> |
72 | | #include <cstddef> |
73 | | #include <cstdint> |
74 | | #include <iterator> |
75 | | #include <memory> |
76 | | #include <utility> |
77 | | |
78 | | using namespace llvm; |
79 | | using namespace jumpthreading; |
80 | | |
81 | | #define DEBUG_TYPE "jump-threading" |
82 | | |
83 | | STATISTIC(NumThreads, "Number of jumps threaded"); |
84 | | STATISTIC(NumFolds, "Number of terminators folded"); |
85 | | STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi"); |
86 | | |
87 | | static cl::opt<unsigned> |
88 | | BBDuplicateThreshold("jump-threading-threshold", |
89 | | cl::desc("Max block size to duplicate for jump threading"), |
90 | | cl::init(6), cl::Hidden); |
91 | | |
92 | | static cl::opt<unsigned> |
93 | | ImplicationSearchThreshold( |
94 | | "jump-threading-implication-search-threshold", |
95 | | cl::desc("The number of predecessors to search for a stronger " |
96 | | "condition to use to thread over a weaker condition"), |
97 | | cl::init(3), cl::Hidden); |
98 | | |
99 | | static cl::opt<bool> PrintLVIAfterJumpThreading( |
100 | | "print-lvi-after-jump-threading", |
101 | | cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false), |
102 | | cl::Hidden); |
103 | | |
104 | | namespace { |
105 | | |
106 | | /// This pass performs 'jump threading', which looks at blocks that have |
107 | | /// multiple predecessors and multiple successors. If one or more of the |
108 | | /// predecessors of the block can be proven to always jump to one of the |
109 | | /// successors, we forward the edge from the predecessor to the successor by |
110 | | /// duplicating the contents of this block. |
111 | | /// |
112 | | /// An example of when this can occur is code like this: |
113 | | /// |
114 | | /// if () { ... |
115 | | /// X = 4; |
116 | | /// } |
117 | | /// if (X < 3) { |
118 | | /// |
119 | | /// In this case, the unconditional branch at the end of the first if can be |
120 | | /// revectored to the false side of the second if. |
121 | | class JumpThreading : public FunctionPass { |
122 | | JumpThreadingPass Impl; |
123 | | |
124 | | public: |
125 | | static char ID; // Pass identification |
126 | | |
127 | 34.9k | JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) { |
128 | 34.9k | initializeJumpThreadingPass(*PassRegistry::getPassRegistry()); |
129 | 34.9k | } |
130 | | |
131 | | bool runOnFunction(Function &F) override; |
132 | | |
133 | 34.9k | void getAnalysisUsage(AnalysisUsage &AU) const override { |
134 | 34.9k | AU.addRequired<DominatorTreeWrapperPass>(); |
135 | 34.9k | AU.addPreserved<DominatorTreeWrapperPass>(); |
136 | 34.9k | AU.addRequired<AAResultsWrapperPass>(); |
137 | 34.9k | AU.addRequired<LazyValueInfoWrapperPass>(); |
138 | 34.9k | AU.addPreserved<LazyValueInfoWrapperPass>(); |
139 | 34.9k | AU.addPreserved<GlobalsAAWrapperPass>(); |
140 | 34.9k | AU.addRequired<TargetLibraryInfoWrapperPass>(); |
141 | 34.9k | } |
142 | | |
143 | 1.03M | void releaseMemory() override { Impl.releaseMemory(); } |
144 | | }; |
145 | | |
146 | | } // end anonymous namespace |
147 | | |
148 | | char JumpThreading::ID = 0; |
149 | | |
150 | 41.8k | INITIALIZE_PASS_BEGIN41.8k (JumpThreading, "jump-threading",
|
151 | 41.8k | "Jump Threading", false, false) |
152 | 41.8k | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) |
153 | 41.8k | INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass) |
154 | 41.8k | INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) |
155 | 41.8k | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) |
156 | 41.8k | INITIALIZE_PASS_END(JumpThreading, "jump-threading", |
157 | | "Jump Threading", false, false) |
158 | | |
159 | | // Public interface to the Jump Threading pass |
160 | 34.9k | FunctionPass *llvm::createJumpThreadingPass(int Threshold) { |
161 | 34.9k | return new JumpThreading(Threshold); |
162 | 34.9k | } |
163 | | |
164 | 35.0k | JumpThreadingPass::JumpThreadingPass(int T) { |
165 | 18.4E | BBDupThreshold = (T == -1) ? BBDuplicateThreshold35.0k : unsigned(T)18.4E ; |
166 | 35.0k | } |
167 | | |
168 | | // Update branch probability information according to conditional |
169 | | // branch probablity. This is usually made possible for cloned branches |
170 | | // in inline instances by the context specific profile in the caller. |
171 | | // For instance, |
172 | | // |
173 | | // [Block PredBB] |
174 | | // [Branch PredBr] |
175 | | // if (t) { |
176 | | // Block A; |
177 | | // } else { |
178 | | // Block B; |
179 | | // } |
180 | | // |
181 | | // [Block BB] |
182 | | // cond = PN([true, %A], [..., %B]); // PHI node |
183 | | // [Branch CondBr] |
184 | | // if (cond) { |
185 | | // ... // P(cond == true) = 1% |
186 | | // } |
187 | | // |
188 | | // Here we know that when block A is taken, cond must be true, which means |
189 | | // P(cond == true | A) = 1 |
190 | | // |
191 | | // Given that P(cond == true) = P(cond == true | A) * P(A) + |
192 | | // P(cond == true | B) * P(B) |
193 | | // we get |
194 | | // P(cond == true ) = P(A) + P(cond == true | B) * P(B) |
195 | | // |
196 | | // which gives us: |
197 | | // P(A) <= P(c == true), i.e. |
198 | | // P(t == true) <= P(cond == true) |
199 | | // |
200 | | // In other words, if we know P(cond == true), we know that P(t == true) |
201 | | // can not be greater than 1%. |
202 | 16.9k | static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { |
203 | 16.9k | BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); |
204 | 16.9k | if (!CondBr) |
205 | 0 | return; |
206 | 16.9k | |
207 | 16.9k | BranchProbability BP; |
208 | 16.9k | uint64_t TrueWeight, FalseWeight; |
209 | 16.9k | if (!CondBr->extractProfMetadata(TrueWeight, FalseWeight)) |
210 | 16.7k | return; |
211 | 169 | |
212 | 169 | // Returns the outgoing edge of the dominating predecessor block |
213 | 169 | // that leads to the PhiNode's incoming block: |
214 | 169 | auto GetPredOutEdge = |
215 | 169 | [](BasicBlock *IncomingBB, |
216 | 177 | BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> { |
217 | 177 | auto *PredBB = IncomingBB; |
218 | 177 | auto *SuccBB = PhiBB; |
219 | 177 | while (true177 ) { |
220 | 177 | BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()); |
221 | 177 | if (PredBr && 177 PredBr->isConditional()177 ) |
222 | 177 | return {PredBB, SuccBB}; |
223 | 0 | auto *SinglePredBB = PredBB->getSinglePredecessor(); |
224 | 0 | if (!SinglePredBB) |
225 | 0 | return {nullptr, nullptr}; |
226 | 0 | SuccBB = PredBB; |
227 | 0 | PredBB = SinglePredBB; |
228 | 0 | } |
229 | 177 | }; |
230 | 169 | |
231 | 515 | for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e515 ; ++i346 ) { |
232 | 346 | Value *PhiOpnd = PN->getIncomingValue(i); |
233 | 346 | ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd); |
234 | 346 | |
235 | 346 | if (!CI || 346 !CI->getType()->isIntegerTy(1)177 ) |
236 | 169 | continue; |
237 | 177 | |
238 | 177 | BP = (CI->isOne() ? 177 BranchProbability::getBranchProbability( |
239 | 45 | TrueWeight, TrueWeight + FalseWeight) |
240 | 132 | : BranchProbability::getBranchProbability( |
241 | 132 | FalseWeight, TrueWeight + FalseWeight)); |
242 | 177 | |
243 | 177 | auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB); |
244 | 177 | if (!PredOutEdge.first) |
245 | 0 | return; |
246 | 177 | |
247 | 177 | BasicBlock *PredBB = PredOutEdge.first; |
248 | 177 | BranchInst *PredBr = cast<BranchInst>(PredBB->getTerminator()); |
249 | 177 | |
250 | 177 | uint64_t PredTrueWeight, PredFalseWeight; |
251 | 177 | // FIXME: We currently only set the profile data when it is missing. |
252 | 177 | // With PGO, this can be used to refine even existing profile data with |
253 | 177 | // context information. This needs to be done after more performance |
254 | 177 | // testing. |
255 | 177 | if (PredBr->extractProfMetadata(PredTrueWeight, PredFalseWeight)) |
256 | 18 | continue; |
257 | 159 | |
258 | 159 | // We can not infer anything useful when BP >= 50%, because BP is the |
259 | 159 | // upper bound probability value. |
260 | 159 | if (159 BP >= BranchProbability(50, 100)159 ) |
261 | 135 | continue; |
262 | 24 | |
263 | 24 | SmallVector<uint32_t, 2> Weights; |
264 | 24 | if (PredBr->getSuccessor(0) == PredOutEdge.second24 ) { |
265 | 24 | Weights.push_back(BP.getNumerator()); |
266 | 24 | Weights.push_back(BP.getCompl().getNumerator()); |
267 | 24 | } else { |
268 | 0 | Weights.push_back(BP.getCompl().getNumerator()); |
269 | 0 | Weights.push_back(BP.getNumerator()); |
270 | 0 | } |
271 | 346 | PredBr->setMetadata(LLVMContext::MD_prof, |
272 | 346 | MDBuilder(PredBr->getParent()->getContext()) |
273 | 346 | .createBranchWeights(Weights)); |
274 | 346 | } |
275 | 16.9k | } |
276 | | |
277 | | /// runOnFunction - Toplevel algorithm. |
278 | 1.03M | bool JumpThreading::runOnFunction(Function &F) { |
279 | 1.03M | if (skipFunction(F)) |
280 | 116 | return false; |
281 | 1.03M | auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); |
282 | 1.03M | // Get DT analysis before LVI. When LVI is initialized it conditionally adds |
283 | 1.03M | // DT if it's available. |
284 | 1.03M | auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
285 | 1.03M | auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI(); |
286 | 1.03M | auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
287 | 1.03M | std::unique_ptr<BlockFrequencyInfo> BFI; |
288 | 1.03M | std::unique_ptr<BranchProbabilityInfo> BPI; |
289 | 1.03M | bool HasProfileData = F.getEntryCount().hasValue(); |
290 | 1.03M | if (HasProfileData1.03M ) { |
291 | 36 | LoopInfo LI{DominatorTree(F)}; |
292 | 36 | BPI.reset(new BranchProbabilityInfo(F, LI, TLI)); |
293 | 36 | BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); |
294 | 36 | } |
295 | 1.03M | |
296 | 1.03M | bool Changed = Impl.runImpl(F, TLI, LVI, AA, DT, HasProfileData, |
297 | 1.03M | std::move(BFI), std::move(BPI)); |
298 | 1.03M | if (PrintLVIAfterJumpThreading1.03M ) { |
299 | 4 | dbgs() << "LVI for function '" << F.getName() << "':\n"; |
300 | 4 | LVI->printLVI(F, *DT, dbgs()); |
301 | 4 | } |
302 | 1.03M | return Changed; |
303 | 1.03M | } |
304 | | |
305 | | PreservedAnalyses JumpThreadingPass::run(Function &F, |
306 | 249 | FunctionAnalysisManager &AM) { |
307 | 249 | auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); |
308 | 249 | // Get DT analysis before LVI. When LVI is initialized it conditionally adds |
309 | 249 | // DT if it's available. |
310 | 249 | auto &DT = AM.getResult<DominatorTreeAnalysis>(F); |
311 | 249 | auto &LVI = AM.getResult<LazyValueAnalysis>(F); |
312 | 249 | auto &AA = AM.getResult<AAManager>(F); |
313 | 249 | |
314 | 249 | std::unique_ptr<BlockFrequencyInfo> BFI; |
315 | 249 | std::unique_ptr<BranchProbabilityInfo> BPI; |
316 | 249 | bool HasProfileData = F.getEntryCount().hasValue(); |
317 | 249 | if (HasProfileData249 ) { |
318 | 10 | LoopInfo LI{DominatorTree(F)}; |
319 | 10 | BPI.reset(new BranchProbabilityInfo(F, LI, &TLI)); |
320 | 10 | BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); |
321 | 10 | } |
322 | 249 | |
323 | 249 | bool Changed = runImpl(F, &TLI, &LVI, &AA, &DT, HasProfileData, |
324 | 249 | std::move(BFI), std::move(BPI)); |
325 | 249 | |
326 | 249 | if (!Changed) |
327 | 229 | return PreservedAnalyses::all(); |
328 | 20 | PreservedAnalyses PA; |
329 | 20 | PA.preserve<GlobalsAA>(); |
330 | 20 | PA.preserve<DominatorTreeAnalysis>(); |
331 | 20 | PA.preserve<LazyValueAnalysis>(); |
332 | 20 | return PA; |
333 | 20 | } |
334 | | |
335 | | bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_, |
336 | | LazyValueInfo *LVI_, AliasAnalysis *AA_, |
337 | | DominatorTree *DT_, bool HasProfileData_, |
338 | | std::unique_ptr<BlockFrequencyInfo> BFI_, |
339 | 1.03M | std::unique_ptr<BranchProbabilityInfo> BPI_) { |
340 | 1.03M | DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); |
341 | 1.03M | TLI = TLI_; |
342 | 1.03M | LVI = LVI_; |
343 | 1.03M | AA = AA_; |
344 | 1.03M | DT = DT_; |
345 | 1.03M | BFI.reset(); |
346 | 1.03M | BPI.reset(); |
347 | 1.03M | // When profile data is available, we need to update edge weights after |
348 | 1.03M | // successful jump threading, which requires both BPI and BFI being available. |
349 | 1.03M | HasProfileData = HasProfileData_; |
350 | 1.03M | auto *GuardDecl = F.getParent()->getFunction( |
351 | 1.03M | Intrinsic::getName(Intrinsic::experimental_guard)); |
352 | 23 | HasGuards = GuardDecl && !GuardDecl->use_empty(); |
353 | 1.03M | if (HasProfileData1.03M ) { |
354 | 46 | BPI = std::move(BPI_); |
355 | 46 | BFI = std::move(BFI_); |
356 | 46 | } |
357 | 1.03M | |
358 | 1.03M | // Remove unreachable blocks from function as they may result in infinite |
359 | 1.03M | // loop. We do threading if we found something profitable. Jump threading a |
360 | 1.03M | // branch can create other opportunities. If these opportunities form a cycle |
361 | 1.03M | // i.e. if any jump threading is undoing previous threading in the path, then |
362 | 1.03M | // we will loop forever. We take care of this issue by not jump threading for |
363 | 1.03M | // back edges. This works for normal cases but not for unreachable blocks as |
364 | 1.03M | // they may have cycle with no back edge. |
365 | 1.03M | bool EverChanged = false; |
366 | 1.03M | EverChanged |= removeUnreachableBlocks(F, LVI, DT); |
367 | 1.03M | FindLoopHeaders(F); |
368 | 1.03M | |
369 | 1.03M | bool Changed; |
370 | 1.18M | do { |
371 | 1.18M | Changed = false; |
372 | 13.7M | for (Function::iterator I = F.begin(), E = F.end(); I != E13.7M ;) { |
373 | 12.5M | BasicBlock *BB = &*I; |
374 | 12.5M | // Thread all of the branches we can over this block. |
375 | 12.6M | while (ProcessBlock(BB)) |
376 | 112k | Changed = true; |
377 | 12.5M | |
378 | 12.5M | ++I; |
379 | 12.5M | |
380 | 12.5M | // If the block is trivially dead, zap it. This eliminates the successor |
381 | 12.5M | // edges which simplifies the CFG. |
382 | 12.5M | if (pred_empty(BB) && |
383 | 12.5M | BB != &BB->getParent()->getEntryBlock()1.18M ) { |
384 | 1.24k | DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName() |
385 | 1.24k | << "' with terminator: " << *BB->getTerminator() << '\n'); |
386 | 1.24k | LoopHeaders.erase(BB); |
387 | 1.24k | LVI->eraseBlock(BB); |
388 | 1.24k | DeleteDeadBlock(BB); |
389 | 1.24k | Changed = true; |
390 | 1.24k | continue; |
391 | 1.24k | } |
392 | 12.5M | |
393 | 12.5M | BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); |
394 | 12.5M | |
395 | 12.5M | // Can't thread an unconditional jump, but if the block is "almost |
396 | 12.5M | // empty", we can replace uses of it with uses of the successor and make |
397 | 12.5M | // this dead. |
398 | 12.5M | // We should not eliminate the loop header or latch either, because |
399 | 12.5M | // eliminating a loop header or latch might later prevent LoopSimplify |
400 | 12.5M | // from transforming nested loops into simplified form. We will rely on |
401 | 12.5M | // later passes in backend to clean up empty blocks. |
402 | 12.5M | if (BI && 12.5M BI->isUnconditional()11.0M && |
403 | 4.95M | BB != &BB->getParent()->getEntryBlock() && |
404 | 12.5M | // If the terminator is the only non-phi instruction, try to nuke it. |
405 | 12.5M | BB->getFirstNonPHIOrDbg()->isTerminator()4.89M && !LoopHeaders.count(BB)1.17M && |
406 | 12.5M | !LoopHeaders.count(BI->getSuccessor(0))1.10M ) { |
407 | 373k | // FIXME: It is always conservatively correct to drop the info |
408 | 373k | // for a block even if it doesn't get erased. This isn't totally |
409 | 373k | // awesome, but it allows us to use AssertingVH to prevent nasty |
410 | 373k | // dangling pointer issues within LazyValueInfo. |
411 | 373k | LVI->eraseBlock(BB); |
412 | 373k | if (TryToSimplifyUncondBranchFromEmptyBlock(BB, DT)) |
413 | 327k | Changed = true; |
414 | 373k | } |
415 | 12.5M | } |
416 | 1.18M | EverChanged |= Changed; |
417 | 1.18M | } while (Changed); |
418 | 1.03M | |
419 | 1.03M | LoopHeaders.clear(); |
420 | 1.03M | return EverChanged; |
421 | 1.03M | } |
422 | | |
423 | | // Replace uses of Cond with ToVal when safe to do so. If all uses are |
424 | | // replaced, we can remove Cond. We cannot blindly replace all uses of Cond |
425 | | // because we may incorrectly replace uses when guards/assumes are uses of |
426 | | // of `Cond` and we used the guards/assume to reason about the `Cond` value |
427 | | // at the end of block. RAUW unconditionally replaces all uses |
428 | | // including the guards/assumes themselves and the uses before the |
429 | | // guard/assume. |
430 | 209 | static void ReplaceFoldableUses(Instruction *Cond, Value *ToVal) { |
431 | 209 | assert(Cond->getType() == ToVal->getType()); |
432 | 209 | auto *BB = Cond->getParent(); |
433 | 209 | // We can unconditionally replace all uses in non-local blocks (i.e. uses |
434 | 209 | // strictly dominated by BB), since LVI information is true from the |
435 | 209 | // terminator of BB. |
436 | 209 | replaceNonLocalUsesWith(Cond, ToVal); |
437 | 533 | for (Instruction &I : reverse(*BB)) { |
438 | 533 | // Reached the Cond whose uses we are trying to replace, so there are no |
439 | 533 | // more uses. |
440 | 533 | if (&I == Cond) |
441 | 198 | break; |
442 | 335 | // We only replace uses in instructions that are guaranteed to reach the end |
443 | 335 | // of BB, where we know Cond is ToVal. |
444 | 335 | if (335 !isGuaranteedToTransferExecutionToSuccessor(&I)335 ) |
445 | 11 | break; |
446 | 324 | I.replaceUsesOfWith(Cond, ToVal); |
447 | 324 | } |
448 | 209 | if (Cond->use_empty() && 209 !Cond->mayHaveSideEffects()201 ) |
449 | 201 | Cond->eraseFromParent(); |
450 | 209 | } |
451 | | |
452 | | /// Return the cost of duplicating a piece of this block from first non-phi |
453 | | /// and before StopAt instruction to thread across it. Stop scanning the block |
454 | | /// when exceeding the threshold. If duplication is impossible, returns ~0U. |
455 | | static unsigned getJumpThreadDuplicationCost(BasicBlock *BB, |
456 | | Instruction *StopAt, |
457 | 60.7k | unsigned Threshold) { |
458 | 60.7k | assert(StopAt->getParent() == BB && "Not an instruction from proper BB?"); |
459 | 60.7k | /// Ignore PHI nodes, these will be flattened when duplication happens. |
460 | 60.7k | BasicBlock::const_iterator I(BB->getFirstNonPHI()); |
461 | 60.7k | |
462 | 60.7k | // FIXME: THREADING will delete values that are just used to compute the |
463 | 60.7k | // branch, so they shouldn't count against the duplication cost. |
464 | 60.7k | |
465 | 60.7k | unsigned Bonus = 0; |
466 | 60.7k | if (BB->getTerminator() == StopAt60.7k ) { |
467 | 60.7k | // Threading through a switch statement is particularly profitable. If this |
468 | 60.7k | // block ends in a switch, decrease its cost to make it more likely to |
469 | 60.7k | // happen. |
470 | 60.7k | if (isa<SwitchInst>(StopAt)) |
471 | 1.39k | Bonus = 6; |
472 | 60.7k | |
473 | 60.7k | // The same holds for indirect branches, but slightly more so. |
474 | 60.7k | if (isa<IndirectBrInst>(StopAt)) |
475 | 1 | Bonus = 8; |
476 | 60.7k | } |
477 | 60.7k | |
478 | 60.7k | // Bump the threshold up so the early exit from the loop doesn't skip the |
479 | 60.7k | // terminator-based Size adjustment at the end. |
480 | 60.7k | Threshold += Bonus; |
481 | 60.7k | |
482 | 60.7k | // Sum up the cost of each instruction until we get to the terminator. Don't |
483 | 60.7k | // include the terminator because the copy won't include it. |
484 | 60.7k | unsigned Size = 0; |
485 | 204k | for (; &*I != StopAt204k ; ++I143k ) { |
486 | 158k | |
487 | 158k | // Stop scanning the block if we've reached the threshold. |
488 | 158k | if (Size > Threshold) |
489 | 15.1k | return Size; |
490 | 143k | |
491 | 143k | // Debugger intrinsics don't incur code size. |
492 | 143k | if (143k isa<DbgInfoIntrinsic>(I)143k ) continue0 ; |
493 | 143k | |
494 | 143k | // If this is a pointer->pointer bitcast, it is free. |
495 | 143k | if (143k isa<BitCastInst>(I) && 143k I->getType()->isPointerTy()1.79k ) |
496 | 1.79k | continue; |
497 | 141k | |
498 | 141k | // Bail out if this instruction gives back a token type, it is not possible |
499 | 141k | // to duplicate it if it is used outside this BB. |
500 | 141k | if (141k I->getType()->isTokenTy() && 141k I->isUsedOutsideOfBlock(BB)0 ) |
501 | 0 | return ~0U; |
502 | 141k | |
503 | 141k | // All other instructions count for at least one unit. |
504 | 141k | ++Size; |
505 | 141k | |
506 | 141k | // Calls are more expensive. If they are non-intrinsic calls, we model them |
507 | 141k | // as having cost of 4. If they are a non-vector intrinsic, we model them |
508 | 141k | // as having cost of 2 total, and if they are a vector intrinsic, we model |
509 | 141k | // them as having cost 1. |
510 | 141k | if (const CallInst *CI141k = dyn_cast<CallInst>(I)) { |
511 | 25.2k | if (CI->cannotDuplicate() || 25.2k CI->isConvergent()25.2k ) |
512 | 25.2k | // Blocks with NoDuplicate are modelled as having infinite cost, so they |
513 | 25.2k | // are never duplicated. |
514 | 6 | return ~0U; |
515 | 25.1k | else if (25.1k !isa<IntrinsicInst>(CI)25.1k ) |
516 | 12.8k | Size += 3; |
517 | 12.3k | else if (12.3k !CI->getType()->isVectorTy()12.3k ) |
518 | 12.3k | Size += 1; |
519 | 25.2k | } |
520 | 158k | } |
521 | 60.7k | |
522 | 45.6k | return Size > Bonus ? 45.6k Size - Bonus36.7k : 08.91k ; |
523 | 60.7k | } |
524 | | |
525 | | /// FindLoopHeaders - We do not want jump threading to turn proper loop |
526 | | /// structures into irreducible loops. Doing this breaks up the loop nesting |
527 | | /// hierarchy and pessimizes later transformations. To prevent this from |
528 | | /// happening, we first have to find the loop headers. Here we approximate this |
529 | | /// by finding targets of backedges in the CFG. |
530 | | /// |
531 | | /// Note that there definitely are cases when we want to allow threading of |
532 | | /// edges across a loop header. For example, threading a jump from outside the |
533 | | /// loop (the preheader) to an exit block of the loop is definitely profitable. |
534 | | /// It is also almost always profitable to thread backedges from within the loop |
535 | | /// to exit blocks, and is often profitable to thread backedges to other blocks |
536 | | /// within the loop (forming a nested loop). This simple analysis is not rich |
537 | | /// enough to track all of these properties and keep it up-to-date as the CFG |
538 | | /// mutates, so we don't allow any of these transformations. |
539 | 1.03M | void JumpThreadingPass::FindLoopHeaders(Function &F) { |
540 | 1.03M | SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges; |
541 | 1.03M | FindFunctionBackedges(F, Edges); |
542 | 1.03M | |
543 | 1.03M | for (const auto &Edge : Edges) |
544 | 678k | LoopHeaders.insert(Edge.second); |
545 | 1.03M | } |
546 | | |
547 | | /// getKnownConstant - Helper method to determine if we can thread over a |
548 | | /// terminator with the given value as its condition, and if so what value to |
549 | | /// use for that. What kind of value this is depends on whether we want an |
550 | | /// integer or a block address, but an undef is always accepted. |
551 | | /// Returns null if Val is null or not an appropriate constant. |
552 | 18.3M | static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) { |
553 | 18.3M | if (!Val) |
554 | 3.64M | return nullptr; |
555 | 14.6M | |
556 | 14.6M | // Undef is "known" enough. |
557 | 14.6M | if (UndefValue *14.6M U14.6M = dyn_cast<UndefValue>(Val)) |
558 | 24 | return U; |
559 | 14.6M | |
560 | 14.6M | if (14.6M Preference == WantBlockAddress14.6M ) |
561 | 30 | return dyn_cast<BlockAddress>(Val->stripPointerCasts()); |
562 | 14.6M | |
563 | 14.6M | return dyn_cast<ConstantInt>(Val); |
564 | 14.6M | } |
565 | | |
566 | | /// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see |
567 | | /// if we can infer that the value is a known ConstantInt/BlockAddress or undef |
568 | | /// in any of our predecessors. If so, return the known list of value and pred |
569 | | /// BB in the result vector. |
570 | | /// |
571 | | /// This returns true if there were any known values. |
572 | | bool JumpThreadingPass::ComputeValueKnownInPredecessors( |
573 | | Value *V, BasicBlock *BB, PredValueInfo &Result, |
574 | 8.15M | ConstantPreference Preference, Instruction *CxtI) { |
575 | 8.15M | // This method walks up use-def chains recursively. Because of this, we could |
576 | 8.15M | // get into an infinite loop going around loops in the use-def chain. To |
577 | 8.15M | // prevent this, keep track of what (value, block) pairs we've already visited |
578 | 8.15M | // and terminate the search if we loop back to them |
579 | 8.15M | if (!RecursionSet.insert(std::make_pair(V, BB)).second) |
580 | 10 | return false; |
581 | 8.15M | |
582 | 8.15M | // An RAII help to remove this pair from the recursion set once the recursion |
583 | 8.15M | // stack pops back out again. |
584 | 8.15M | RecursionSetRemover remover(RecursionSet, std::make_pair(V, BB)); |
585 | 8.15M | |
586 | 8.15M | // If V is a constant, then it is known in all predecessors. |
587 | 8.15M | if (Constant *KC8.15M = getKnownConstant(V, Preference)) { |
588 | 95 | for (BasicBlock *Pred : predecessors(BB)) |
589 | 146 | Result.push_back(std::make_pair(KC, Pred)); |
590 | 95 | |
591 | 95 | return !Result.empty(); |
592 | 95 | } |
593 | 8.15M | |
594 | 8.15M | // If V is a non-instruction value, or an instruction in a different block, |
595 | 8.15M | // then it can't be derived from a PHI. |
596 | 8.15M | Instruction *I = dyn_cast<Instruction>(V); |
597 | 8.15M | if (!I || 8.15M I->getParent() != BB8.14M ) { |
598 | 466k | |
599 | 466k | // Okay, if this is a live-in value, see if it has a known value at the end |
600 | 466k | // of any of our predecessors. |
601 | 466k | // |
602 | 466k | // FIXME: This should be an edge property, not a block end property. |
603 | 466k | /// TODO: Per PR2563, we could infer value range information about a |
604 | 466k | /// predecessor based on its terminator. |
605 | 466k | // |
606 | 466k | // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if |
607 | 466k | // "I" is a non-local compare-with-a-constant instruction. This would be |
608 | 466k | // able to handle value inequalities better, for example if the compare is |
609 | 466k | // "X < 4" and "X < 3" is known true but "X < 4" itself is not available. |
610 | 466k | // Perhaps getConstantOnEdge should be smart enough to do this? |
611 | 466k | |
612 | 605k | for (BasicBlock *P : predecessors(BB)) { |
613 | 605k | // If the value is known by LazyValueInfo to be a constant in a |
614 | 605k | // predecessor, use that information to try to thread this block. |
615 | 605k | Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI); |
616 | 605k | if (Constant *KC = getKnownConstant(PredCst, Preference)) |
617 | 37.5k | Result.push_back(std::make_pair(KC, P)); |
618 | 605k | } |
619 | 466k | |
620 | 466k | return !Result.empty(); |
621 | 466k | } |
622 | 7.69M | |
623 | 7.69M | /// If I is a PHI node, then we know the incoming values for any constants. |
624 | 7.69M | if (PHINode *7.69M PN7.69M = dyn_cast<PHINode>(I)) { |
625 | 68.4k | for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e68.4k ; ++i49.6k ) { |
626 | 49.6k | Value *InVal = PN->getIncomingValue(i); |
627 | 49.6k | if (Constant *KC49.6k = getKnownConstant(InVal, Preference)) { |
628 | 22.6k | Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i))); |
629 | 49.6k | } else { |
630 | 27.0k | Constant *CI = LVI->getConstantOnEdge(InVal, |
631 | 27.0k | PN->getIncomingBlock(i), |
632 | 27.0k | BB, CxtI); |
633 | 27.0k | if (Constant *KC = getKnownConstant(CI, Preference)) |
634 | 1.38k | Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i))); |
635 | 27.0k | } |
636 | 49.6k | } |
637 | 18.8k | |
638 | 18.8k | return !Result.empty(); |
639 | 18.8k | } |
640 | 7.67M | |
641 | 7.67M | // Handle Cast instructions. Only see through Cast when the source operand is |
642 | 7.67M | // PHI or Cmp and the source type is i1 to save the compilation time. |
643 | 7.67M | if (CastInst *7.67M CI7.67M = dyn_cast<CastInst>(I)) { |
644 | 102k | Value *Source = CI->getOperand(0); |
645 | 102k | if (!Source->getType()->isIntegerTy(1)) |
646 | 90.3k | return false; |
647 | 12.4k | if (12.4k !isa<PHINode>(Source) && 12.4k !isa<CmpInst>(Source)12.4k ) |
648 | 560 | return false; |
649 | 11.9k | ComputeValueKnownInPredecessors(Source, BB, Result, Preference, CxtI); |
650 | 11.9k | if (Result.empty()) |
651 | 11.8k | return false; |
652 | 91 | |
653 | 91 | // Convert the known values. |
654 | 91 | for (auto &R : Result) |
655 | 203 | R.first = ConstantExpr::getCast(CI->getOpcode(), R.first, CI->getType()); |
656 | 102k | |
657 | 102k | return true; |
658 | 102k | } |
659 | 7.56M | |
660 | 7.56M | PredValueInfoTy LHSVals, RHSVals; |
661 | 7.56M | |
662 | 7.56M | // Handle some boolean conditions. |
663 | 7.56M | if (I->getType()->getPrimitiveSizeInBits() == 17.56M ) { |
664 | 5.29M | assert(Preference == WantInteger && "One-bit non-integer type?"); |
665 | 5.29M | // X | true -> true |
666 | 5.29M | // X & false -> false |
667 | 5.29M | if (I->getOpcode() == Instruction::Or || |
668 | 5.29M | I->getOpcode() == Instruction::And5.18M ) { |
669 | 310k | ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, |
670 | 310k | WantInteger, CxtI); |
671 | 310k | ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals, |
672 | 310k | WantInteger, CxtI); |
673 | 310k | |
674 | 310k | if (LHSVals.empty() && 310k RHSVals.empty()304k ) |
675 | 281k | return false; |
676 | 28.9k | |
677 | 28.9k | ConstantInt *InterestingVal; |
678 | 28.9k | if (I->getOpcode() == Instruction::Or) |
679 | 24.6k | InterestingVal = ConstantInt::getTrue(I->getContext()); |
680 | 28.9k | else |
681 | 4.28k | InterestingVal = ConstantInt::getFalse(I->getContext()); |
682 | 28.9k | |
683 | 28.9k | SmallPtrSet<BasicBlock*, 4> LHSKnownBBs; |
684 | 28.9k | |
685 | 28.9k | // Scan for the sentinel. If we find an undef, force it to the |
686 | 28.9k | // interesting value: x|undef -> true and x&undef -> false. |
687 | 28.9k | for (const auto &LHSVal : LHSVals) |
688 | 10.8k | if (10.8k LHSVal.first == InterestingVal || 10.8k isa<UndefValue>(LHSVal.first)8.46k ) { |
689 | 2.35k | Result.emplace_back(InterestingVal, LHSVal.second); |
690 | 2.35k | LHSKnownBBs.insert(LHSVal.second); |
691 | 2.35k | } |
692 | 28.9k | for (const auto &RHSVal : RHSVals) |
693 | 32.4k | if (32.4k RHSVal.first == InterestingVal || 32.4k isa<UndefValue>(RHSVal.first)25.9k ) { |
694 | 6.48k | // If we already inferred a value for this block on the LHS, don't |
695 | 6.48k | // re-add it. |
696 | 6.48k | if (!LHSKnownBBs.count(RHSVal.second)) |
697 | 6.21k | Result.emplace_back(InterestingVal, RHSVal.second); |
698 | 32.4k | } |
699 | 310k | |
700 | 310k | return !Result.empty(); |
701 | 310k | } |
702 | 4.98M | |
703 | 4.98M | // Handle the NOT form of XOR. |
704 | 4.98M | if (4.98M I->getOpcode() == Instruction::Xor && |
705 | 5.46k | isa<ConstantInt>(I->getOperand(1)) && |
706 | 4.98M | cast<ConstantInt>(I->getOperand(1))->isOne()4.89k ) { |
707 | 4.89k | ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result, |
708 | 4.89k | WantInteger, CxtI); |
709 | 4.89k | if (Result.empty()) |
710 | 4.57k | return false; |
711 | 319 | |
712 | 319 | // Invert the known values. |
713 | 319 | for (auto &R : Result) |
714 | 440 | R.first = ConstantExpr::getNot(R.first); |
715 | 4.89k | |
716 | 4.89k | return true; |
717 | 4.89k | } |
718 | 5.29M | |
719 | 5.29M | // Try to simplify some other binary operator values. |
720 | 2.27M | } else if (BinaryOperator *2.27M BO2.27M = dyn_cast<BinaryOperator>(I)) { |
721 | 365k | assert(Preference != WantBlockAddress |
722 | 365k | && "A binary operator creating a block address?"); |
723 | 365k | if (ConstantInt *CI365k = dyn_cast<ConstantInt>(BO->getOperand(1))) { |
724 | 173k | PredValueInfoTy LHSVals; |
725 | 173k | ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals, |
726 | 173k | WantInteger, CxtI); |
727 | 173k | |
728 | 173k | // Try to use constant folding to simplify the binary operator. |
729 | 2.03k | for (const auto &LHSVal : LHSVals) { |
730 | 2.03k | Constant *V = LHSVal.first; |
731 | 2.03k | Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI); |
732 | 2.03k | |
733 | 2.03k | if (Constant *KC = getKnownConstant(Folded, WantInteger)) |
734 | 2.03k | Result.push_back(std::make_pair(KC, LHSVal.second)); |
735 | 2.03k | } |
736 | 173k | } |
737 | 2.27M | |
738 | 2.27M | return !Result.empty(); |
739 | 2.27M | } |
740 | 6.88M | |
741 | 6.88M | // Handle compare with phi operand, where the PHI is defined in this block. |
742 | 6.88M | if (CmpInst *6.88M Cmp6.88M = dyn_cast<CmpInst>(I)) { |
743 | 4.87M | assert(Preference == WantInteger && "Compares only produce integers"); |
744 | 4.87M | Type *CmpType = Cmp->getType(); |
745 | 4.87M | Value *CmpLHS = Cmp->getOperand(0); |
746 | 4.87M | Value *CmpRHS = Cmp->getOperand(1); |
747 | 4.87M | CmpInst::Predicate Pred = Cmp->getPredicate(); |
748 | 4.87M | |
749 | 4.87M | PHINode *PN = dyn_cast<PHINode>(CmpLHS); |
750 | 4.87M | if (PN && 4.87M PN->getParent() == BB360k ) { |
751 | 176k | const DataLayout &DL = PN->getModule()->getDataLayout(); |
752 | 176k | // We can do this simplification if any comparisons fold to true or false. |
753 | 176k | // See if any do. |
754 | 572k | for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e572k ; ++i395k ) { |
755 | 395k | BasicBlock *PredBB = PN->getIncomingBlock(i); |
756 | 395k | Value *LHS = PN->getIncomingValue(i); |
757 | 395k | Value *RHS = CmpRHS->DoPHITranslation(BB, PredBB); |
758 | 395k | |
759 | 395k | Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL}); |
760 | 395k | if (!Res395k ) { |
761 | 328k | if (!isa<Constant>(RHS)) |
762 | 87.3k | continue; |
763 | 240k | |
764 | 240k | LazyValueInfo::Tristate |
765 | 240k | ResT = LVI->getPredicateOnEdge(Pred, LHS, |
766 | 240k | cast<Constant>(RHS), PredBB, BB, |
767 | 240k | CxtI ? CxtI240k : Cmp0 ); |
768 | 240k | if (ResT == LazyValueInfo::Unknown) |
769 | 219k | continue; |
770 | 21.9k | Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT); |
771 | 21.9k | } |
772 | 395k | |
773 | 89.4k | if (Constant *89.4k KC89.4k = getKnownConstant(Res, WantInteger)) |
774 | 87.5k | Result.push_back(std::make_pair(KC, PredBB)); |
775 | 395k | } |
776 | 176k | |
777 | 176k | return !Result.empty(); |
778 | 176k | } |
779 | 4.69M | |
780 | 4.69M | // If comparing a live-in value against a constant, see if we know the |
781 | 4.69M | // live-in value on any predecessors. |
782 | 4.69M | if (4.69M isa<Constant>(CmpRHS) && 4.69M !CmpType->isVectorTy()3.66M ) { |
783 | 3.66M | Constant *CmpConst = cast<Constant>(CmpRHS); |
784 | 3.66M | |
785 | 3.66M | if (!isa<Instruction>(CmpLHS) || |
786 | 3.66M | cast<Instruction>(CmpLHS)->getParent() != BB2.94M ) { |
787 | 1.45M | for (BasicBlock *P : predecessors(BB)) { |
788 | 1.45M | // If the value is known by LazyValueInfo to be a constant in a |
789 | 1.45M | // predecessor, use that information to try to thread this block. |
790 | 1.45M | LazyValueInfo::Tristate Res = |
791 | 1.45M | LVI->getPredicateOnEdge(Pred, CmpLHS, |
792 | 1.45M | CmpConst, P, BB, CxtI ? CxtI1.45M : Cmp0 ); |
793 | 1.45M | if (Res == LazyValueInfo::Unknown) |
794 | 1.44M | continue; |
795 | 9.45k | |
796 | 9.45k | Constant *ResC = ConstantInt::get(CmpType, Res); |
797 | 9.45k | Result.push_back(std::make_pair(ResC, P)); |
798 | 9.45k | } |
799 | 1.43M | |
800 | 1.43M | return !Result.empty(); |
801 | 1.43M | } |
802 | 2.22M | |
803 | 2.22M | // InstCombine can fold some forms of constant range checks into |
    // (icmp (add (x, C1)), C2).  See if we have such a thing with
805 | 2.22M | // x as a live-in. |
806 | 2.22M | { |
807 | 2.22M | using namespace PatternMatch; |
808 | 2.22M | |
809 | 2.22M | Value *AddLHS; |
810 | 2.22M | ConstantInt *AddConst; |
811 | 2.22M | if (isa<ConstantInt>(CmpConst) && |
812 | 2.22M | match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))1.65M ) { |
813 | 66.6k | if (!isa<Instruction>(AddLHS) || |
814 | 66.6k | cast<Instruction>(AddLHS)->getParent() != BB65.2k ) { |
815 | 58.3k | for (BasicBlock *P : predecessors(BB)) { |
816 | 58.3k | // If the value is known by LazyValueInfo to be a ConstantRange in |
817 | 58.3k | // a predecessor, use that information to try to thread this |
818 | 58.3k | // block. |
819 | 58.3k | ConstantRange CR = LVI->getConstantRangeOnEdge( |
820 | 58.3k | AddLHS, P, BB, CxtI ? CxtI58.3k : cast<Instruction>(CmpLHS)0 ); |
821 | 58.3k | // Propagate the range through the addition. |
822 | 58.3k | CR = CR.add(AddConst->getValue()); |
823 | 58.3k | |
824 | 58.3k | // Get the range where the compare returns true. |
825 | 58.3k | ConstantRange CmpRange = ConstantRange::makeExactICmpRegion( |
826 | 58.3k | Pred, cast<ConstantInt>(CmpConst)->getValue()); |
827 | 58.3k | |
828 | 58.3k | Constant *ResC; |
829 | 58.3k | if (CmpRange.contains(CR)) |
830 | 442 | ResC = ConstantInt::getTrue(CmpType); |
831 | 57.8k | else if (57.8k CmpRange.inverse().contains(CR)57.8k ) |
832 | 993 | ResC = ConstantInt::getFalse(CmpType); |
833 | 57.8k | else |
834 | 56.8k | continue; |
835 | 1.43k | |
836 | 1.43k | Result.push_back(std::make_pair(ResC, P)); |
837 | 1.43k | } |
838 | 39.1k | |
839 | 39.1k | return !Result.empty(); |
840 | 39.1k | } |
841 | 2.19M | } |
842 | 2.19M | } |
843 | 2.19M | |
844 | 2.19M | // Try to find a constant value for the LHS of a comparison, |
845 | 2.19M | // and evaluate it statically if we can. |
846 | 2.19M | PredValueInfoTy LHSVals; |
847 | 2.19M | ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, |
848 | 2.19M | WantInteger, CxtI); |
849 | 2.19M | |
850 | 2.23k | for (const auto &LHSVal : LHSVals) { |
851 | 2.23k | Constant *V = LHSVal.first; |
852 | 2.23k | Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst); |
853 | 2.23k | if (Constant *KC = getKnownConstant(Folded, WantInteger)) |
854 | 2.23k | Result.push_back(std::make_pair(KC, LHSVal.second)); |
855 | 2.23k | } |
856 | 3.66M | |
857 | 3.66M | return !Result.empty(); |
858 | 3.66M | } |
859 | 4.87M | } |
860 | 3.05M | |
861 | 3.05M | if (SelectInst *3.05M SI3.05M = dyn_cast<SelectInst>(I)) { |
862 | 21.3k | // Handle select instructions where at least one operand is a known constant |
863 | 21.3k | // and we can figure out the condition value for any predecessor block. |
864 | 21.3k | Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference); |
865 | 21.3k | Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference); |
866 | 21.3k | PredValueInfoTy Conds; |
867 | 21.3k | if ((TrueVal || 21.3k FalseVal17.3k ) && |
868 | 16.7k | ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds, |
869 | 21.3k | WantInteger, CxtI)) { |
870 | 560 | for (auto &C : Conds) { |
871 | 560 | Constant *Cond = C.first; |
872 | 560 | |
873 | 560 | // Figure out what value to use for the condition. |
874 | 560 | bool KnownCond; |
875 | 560 | if (ConstantInt *CI560 = dyn_cast<ConstantInt>(Cond)) { |
876 | 560 | // A known boolean. |
877 | 560 | KnownCond = CI->isOne(); |
878 | 560 | } else { |
879 | 0 | assert(isa<UndefValue>(Cond) && "Unexpected condition value"); |
880 | 0 | // Either operand will do, so be sure to pick the one that's a known |
881 | 0 | // constant. |
882 | 0 | // FIXME: Do this more cleverly if both values are known constants? |
883 | 0 | KnownCond = (TrueVal != nullptr); |
884 | 0 | } |
885 | 560 | |
886 | 560 | // See if the select has a known constant value for this predecessor. |
887 | 560 | if (Constant *Val = KnownCond ? TrueVal : FalseVal) |
888 | 448 | Result.push_back(std::make_pair(Val, C.second)); |
889 | 560 | } |
890 | 274 | |
891 | 274 | return !Result.empty(); |
892 | 274 | } |
893 | 3.05M | } |
894 | 3.05M | |
895 | 3.05M | // If all else fails, see if LVI can figure out a constant value for us. |
896 | 3.05M | Constant *CI = LVI->getConstant(V, BB, CxtI); |
897 | 3.05M | if (Constant *KC3.05M = getKnownConstant(CI, Preference)) { |
898 | 0 | for (BasicBlock *Pred : predecessors(BB)) |
899 | 0 | Result.push_back(std::make_pair(KC, Pred)); |
900 | 0 | } |
901 | 8.15M | |
902 | 8.15M | return !Result.empty(); |
903 | 8.15M | } |
904 | | |
/// GetBestDestForJumpOnUndef - If we determine that the specified block ends
906 | | /// in an undefined jump, decide which block is best to revector to. |
907 | | /// |
908 | | /// Since we can pick an arbitrary destination, we pick the successor with the |
909 | | /// fewest predecessors. This should reduce the in-degree of the others. |
910 | 28 | static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) { |
911 | 28 | TerminatorInst *BBTerm = BB->getTerminator(); |
912 | 28 | unsigned MinSucc = 0; |
913 | 28 | BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc); |
914 | 28 | // Compute the successor with the minimum number of predecessors. |
915 | 28 | unsigned MinNumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB)); |
916 | 56 | for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e56 ; ++i28 ) { |
917 | 28 | TestBB = BBTerm->getSuccessor(i); |
918 | 28 | unsigned NumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB)); |
919 | 28 | if (NumPreds < MinNumPreds28 ) { |
920 | 16 | MinSucc = i; |
921 | 16 | MinNumPreds = NumPreds; |
922 | 16 | } |
923 | 28 | } |
924 | 28 | |
925 | 28 | return MinSucc; |
926 | 28 | } |
927 | | |
928 | 48.0k | static bool hasAddressTakenAndUsed(BasicBlock *BB) { |
929 | 48.0k | if (!BB->hasAddressTaken()48.0k ) return false48.0k ; |
930 | 8 | |
931 | 8 | // If the block has its address taken, it may be a tree of dead constants |
932 | 8 | // hanging off of it. These shouldn't keep the block alive. |
933 | 8 | BlockAddress *BA = BlockAddress::get(BB); |
934 | 8 | BA->removeDeadConstantUsers(); |
935 | 8 | return !BA->use_empty(); |
936 | 8 | } |
937 | | |
938 | | /// ProcessBlock - If there are any predecessors whose control can be threaded |
939 | | /// through to a successor, transform them now. |
bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
  // If the block is trivially dead, just return and let the caller nuke it.
  // This simplifies other transformations.
  if (pred_empty(BB) &&
      BB != &BB->getParent()->getEntryBlock())
    return false;

  // If this block has a single predecessor, and if that pred has a single
  // successor, merge the blocks.  This encourages recursive jump threading
  // because now the condition in this block can be threaded through
  // predecessors of our predecessor block.
  if (BasicBlock *SinglePred = BB->getSinglePredecessor()) {
    const TerminatorInst *TI = SinglePred->getTerminator();
    if (!TI->isExceptional() && TI->getNumSuccessors() == 1 &&
        SinglePred != BB && !hasAddressTakenAndUsed(BB)) {
      // If SinglePred was a loop header, BB becomes one.
      if (LoopHeaders.erase(SinglePred))
        LoopHeaders.insert(BB);

      LVI->eraseBlock(SinglePred);
      MergeBasicBlockIntoOnlyPred(BB, DT);

      // Now that BB is merged into SinglePred (i.e. SinglePred Code followed by
      // BB code within one basic block `BB`), we need to invalidate the LVI
      // information associated with BB, because the LVI information need not be
      // true for all of BB after the merge. For example,
      // Before the merge, LVI info and code is as follows:
      // SinglePred: <LVI info1 for %p val>
      // %y = use of %p
      // call @exit() // need not transfer execution to successor.
      // assume(%p) // from this point on %p is true
      // br label %BB
      // BB: <LVI info2 for %p val, i.e. %p is true>
      // %x = use of %p
      // br label exit
      //
      // Note that this LVI info for blocks BB and SinglPred is correct for %p
      // (info2 and info1 respectively). After the merge and the deletion of the
      // LVI info1 for SinglePred. We have the following code:
      // BB: <LVI info2 for %p val>
      // %y = use of %p
      // call @exit()
      // assume(%p)
      // %x = use of %p <-- LVI info2 is correct from here onwards.
      // br label exit
      // LVI info2 for BB is incorrect at the beginning of BB.

      // Invalidate LVI information for BB if the LVI is not provably true for
      // all of BB.
      if (any_of(*BB, [](Instruction &I) {
            return !isGuaranteedToTransferExecutionToSuccessor(&I);
          }))
        LVI->eraseBlock(BB);
      return true;
    }
  }

  if (TryToUnfoldSelectInCurrBB(BB))
    return true;

  // Look if we can propagate guards to predecessors.
  if (HasGuards && ProcessGuards(BB))
    return true;

  // What kind of constant we're looking for.
  ConstantPreference Preference = WantInteger;

  // Look to see if the terminator is a conditional branch, switch or indirect
  // branch, if not we can't thread it.
  Value *Condition;
  Instruction *Terminator = BB->getTerminator();
  if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
    // Can't thread an unconditional jump.
    if (BI->isUnconditional()) return false;
    Condition = BI->getCondition();
  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
    Condition = SI->getCondition();
  } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
    // Can't thread indirect branch with no successors.
    if (IB->getNumSuccessors() == 0) return false;
    Condition = IB->getAddress()->stripPointerCasts();
    // For an indirect branch we thread on block addresses, not integers.
    Preference = WantBlockAddress;
  } else {
    return false; // Must be an invoke.
  }

  // Run constant folding to see if we can reduce the condition to a simple
  // constant.
  if (Instruction *I = dyn_cast<Instruction>(Condition)) {
    Value *SimpleVal =
        ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI);
    if (SimpleVal) {
      I->replaceAllUsesWith(SimpleVal);
      if (isInstructionTriviallyDead(I, TLI))
        I->eraseFromParent();
      Condition = SimpleVal;
    }
  }

  // If the terminator is branching on an undef, we can pick any of the
  // successors to branch to.  Let GetBestDestForJumpOnUndef decide.
  if (isa<UndefValue>(Condition)) {
    unsigned BestSucc = GetBestDestForJumpOnUndef(BB);
    std::vector<DominatorTree::UpdateType> Updates;

    // Fold the branch/switch.
    TerminatorInst *BBTerm = BB->getTerminator();
    for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
      if (i == BestSucc) continue;
      BasicBlock *Succ = BBTerm->getSuccessor(i);
      Succ->removePredecessor(BB, true);
      if (Succ == BB)
        continue;
      DominatorTree::UpdateType UT = {DominatorTree::Delete, BB, Succ};
      // Make sure to remove a DT edge exactly once and not an edge to itself.
      if (std::find(Updates.begin(), Updates.end(), UT) == Updates.end())
        Updates.push_back(UT);
    }

    DEBUG(dbgs() << "  In block '" << BB->getName()
          << "' folding undef terminator: " << *BBTerm << '\n');
    BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
    BBTerm->eraseFromParent();
    DT->applyUpdates(Updates);
    return true;
  }

  // If the terminator of this block is branching on a constant, simplify the
  // terminator to an unconditional branch.  This can occur due to threading in
  // other blocks.
  if (getKnownConstant(Condition, Preference)) {
    DEBUG(dbgs() << "  In block '" << BB->getName()
          << "' folding terminator: " << *BB->getTerminator() << '\n');
    ++NumFolds;
    ConstantFoldTerminator(BB, true, nullptr, DT);
    return true;
  }

  Instruction *CondInst = dyn_cast<Instruction>(Condition);

  // All the rest of our checks depend on the condition being an instruction.
  if (!CondInst) {
    // FIXME: Unify this with code below.
    if (ProcessThreadableEdges(Condition, BB, Preference, Terminator))
      return true;
    return false;
  }

  if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
    // If we're branching on a conditional, LVI might be able to determine
    // it's value at the branch instruction.  We only handle comparisons
    // against a constant at this time.
    // TODO: This should be extended to handle switches as well.
    BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
    Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
    if (CondBr && CondConst) {
      // We should have returned as soon as we turn a conditional branch to
      // unconditional. Because its no longer interesting as far as jump
      // threading is concerned.
      assert(CondBr->isConditional() && "Threading on unconditional terminator");

      LazyValueInfo::Tristate Ret =
        LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
                            CondConst, CondBr);
      if (Ret != LazyValueInfo::Unknown) {
        // If the condition is known true, successor 0 is taken and successor 1
        // is unreachable; if known false, the reverse.
        unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
        unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
        BasicBlock *ToRemoveSucc = CondBr->getSuccessor(ToRemove);
        ToRemoveSucc->removePredecessor(BB, true);
        BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
        CondBr->eraseFromParent();
        if (BB != ToRemoveSucc)
          DT->deleteEdge(BB, ToRemoveSucc);
        if (CondCmp->use_empty())
          CondCmp->eraseFromParent();
        // We can safely replace *some* uses of the CondInst if it has
        // exactly one value as returned by LVI. RAUW is incorrect in the
        // presence of guards and assumes, that have the `Cond` as the use. This
        // is because we use the guards/assume to reason about the `Cond` value
        // at the end of block, but RAUW unconditionally replaces all uses
        // including the guards/assumes themselves and the uses before the
        // guard/assume.
        else if (CondCmp->getParent() == BB) {
          auto *CI = Ret == LazyValueInfo::True ?
            ConstantInt::getTrue(CondCmp->getType()) :
            ConstantInt::getFalse(CondCmp->getType());
          ReplaceFoldableUses(CondCmp, CI);
        }
        return true;
      }

      // We did not manage to simplify this branch, try to see whether
      // CondCmp depends on a known phi-select pattern.
      if (TryToUnfoldSelect(CondCmp, BB))
        return true;
    }
  }

  // Check for some cases that are worth simplifying.  Right now we want to look
  // for loads that are used by a switch or by the condition for the branch.  If
  // we see one, check to see if it's partially redundant.  If so, insert a PHI
  // which can then be used to thread the values.
  Value *SimplifyValue = CondInst;
  if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
    if (isa<Constant>(CondCmp->getOperand(1)))
      SimplifyValue = CondCmp->getOperand(0);

  // TODO: There are other places where load PRE would be profitable, such as
  // more complex comparisons.
  if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
    if (SimplifyPartiallyRedundantLoad(LI))
      return true;

  // Before threading, try to propagate profile data backwards:
  if (PHINode *PN = dyn_cast<PHINode>(CondInst))
    if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
      updatePredecessorProfileMetadata(PN, BB);

  // Handle a variety of cases where we are branching on something derived from
  // a PHI node in the current block.  If we can prove that any predecessors
  // compute a predictable value based on a PHI node, thread those predecessors.
  if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator))
    return true;

  // If this is an otherwise-unfoldable branch on a phi node in the current
  // block, see if we can simplify.
  if (PHINode *PN = dyn_cast<PHINode>(CondInst))
    if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
      return ProcessBranchOnPHI(PN);

  // If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
  if (CondInst->getOpcode() == Instruction::Xor &&
      CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
    return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst));

  // Search for a stronger dominating condition that can be used to simplify a
  // conditional branch leaving BB.
  if (ProcessImpliedCondition(BB))
    return true;

  return false;
}
1182 | | |
1183 | 6.21M | bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) { |
1184 | 6.21M | auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); |
1185 | 6.21M | if (!BI || 6.21M !BI->isConditional()6.10M ) |
1186 | 110k | return false; |
1187 | 6.10M | |
1188 | 6.10M | Value *Cond = BI->getCondition(); |
1189 | 6.10M | BasicBlock *CurrentBB = BB; |
1190 | 6.10M | BasicBlock *CurrentPred = BB->getSinglePredecessor(); |
1191 | 6.10M | unsigned Iter = 0; |
1192 | 6.10M | |
1193 | 6.10M | auto &DL = BB->getModule()->getDataLayout(); |
1194 | 6.10M | |
1195 | 12.2M | while (CurrentPred && 12.2M Iter++ < ImplicationSearchThreshold7.32M ) { |
1196 | 6.31M | auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator()); |
1197 | 6.31M | if (!PBI || 6.31M !PBI->isConditional()6.14M ) |
1198 | 170k | return false; |
1199 | 6.14M | if (6.14M PBI->getSuccessor(0) != CurrentBB && 6.14M PBI->getSuccessor(1) != CurrentBB3.58M ) |
1200 | 0 | return false; |
1201 | 6.14M | |
1202 | 6.14M | bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB; |
1203 | 6.14M | Optional<bool> Implication = |
1204 | 6.14M | isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue); |
1205 | 6.14M | if (Implication6.14M ) { |
1206 | 41 | BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 16 : 035 ); |
1207 | 41 | RemoveSucc->removePredecessor(BB); |
1208 | 41 | BranchInst::Create(BI->getSuccessor(*Implication ? 06 : 135 ), BI); |
1209 | 41 | BI->eraseFromParent(); |
1210 | 41 | if (BB != RemoveSucc) |
1211 | 41 | DT->deleteEdge(BB, RemoveSucc); |
1212 | 41 | return true; |
1213 | 41 | } |
1214 | 6.14M | CurrentBB = CurrentPred; |
1215 | 6.14M | CurrentPred = CurrentBB->getSinglePredecessor(); |
1216 | 6.14M | } |
1217 | 6.10M | |
1218 | 5.93M | return false; |
1219 | 6.21M | } |
1220 | | |
1221 | | /// Return true if Op is an instruction defined in the given block. |
1222 | 935k | static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) { |
1223 | 935k | if (Instruction *OpInst = dyn_cast<Instruction>(Op)) |
1224 | 810k | if (810k OpInst->getParent() == BB810k ) |
1225 | 668k | return true; |
1226 | 266k | return false; |
1227 | 266k | } |
1228 | | |
1229 | | /// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant |
1230 | | /// load instruction, eliminate it by replacing it with a PHI node. This is an |
1231 | | /// important optimization that encourages jump threading, and needs to be run |
1232 | | /// interlaced with other jump threading tasks. |
1233 | 1.78M | bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { |
1234 | 1.78M | // Don't hack volatile and ordered loads. |
1235 | 1.78M | if (!LI->isUnordered()1.78M ) return false4.07k ; |
1236 | 1.78M | |
1237 | 1.78M | // If the load is defined in a block with exactly one predecessor, it can't be |
1238 | 1.78M | // partially redundant. |
1239 | 1.78M | BasicBlock *LoadBB = LI->getParent(); |
1240 | 1.78M | if (LoadBB->getSinglePredecessor()) |
1241 | 848k | return false; |
1242 | 936k | |
1243 | 936k | // If the load is defined in an EH pad, it can't be partially redundant, |
1244 | 936k | // because the edges between the invoke and the EH pad cannot have other |
1245 | 936k | // instructions between them. |
1246 | 936k | if (936k LoadBB->isEHPad()936k ) |
1247 | 740 | return false; |
1248 | 935k | |
1249 | 935k | Value *LoadedPtr = LI->getOperand(0); |
1250 | 935k | |
1251 | 935k | // If the loaded operand is defined in the LoadBB and its not a phi, |
1252 | 935k | // it can't be available in predecessors. |
1253 | 935k | if (isOpDefinedInBlock(LoadedPtr, LoadBB) && 935k !isa<PHINode>(LoadedPtr)668k ) |
1254 | 638k | return false; |
1255 | 296k | |
1256 | 296k | // Scan a few instructions up from the load, to see if it is obviously live at |
1257 | 296k | // the entry to its block. |
1258 | 296k | BasicBlock::iterator BBIt(LI); |
1259 | 296k | bool IsLoadCSE; |
1260 | 296k | if (Value *AvailableVal = FindAvailableLoadedValue( |
1261 | 48 | LI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) { |
1262 | 48 | // If the value of the load is locally available within the block, just use |
1263 | 48 | // it. This frequently occurs for reg2mem'd allocas. |
1264 | 48 | |
1265 | 48 | if (IsLoadCSE48 ) { |
1266 | 2 | LoadInst *NLI = cast<LoadInst>(AvailableVal); |
1267 | 2 | combineMetadataForCSE(NLI, LI); |
1268 | 2 | }; |
1269 | 48 | |
1270 | 48 | // If the returned value is the load itself, replace with an undef. This can |
1271 | 48 | // only happen in dead loops. |
1272 | 48 | if (AvailableVal == LI48 ) AvailableVal = UndefValue::get(LI->getType())0 ; |
1273 | 48 | if (AvailableVal->getType() != LI->getType()) |
1274 | 40 | AvailableVal = |
1275 | 40 | CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI); |
1276 | 48 | LI->replaceAllUsesWith(AvailableVal); |
1277 | 48 | LI->eraseFromParent(); |
1278 | 48 | return true; |
1279 | 48 | } |
1280 | 296k | |
1281 | 296k | // Otherwise, if we scanned the whole block and got to the top of the block, |
1282 | 296k | // we know the block is locally transparent to the load. If not, something |
1283 | 296k | // might clobber its value. |
1284 | 296k | if (296k BBIt != LoadBB->begin()296k ) |
1285 | 97.5k | return false; |
1286 | 199k | |
1287 | 199k | // If all of the loads and stores that feed the value have the same AA tags, |
1288 | 199k | // then we can propagate them onto any newly inserted loads. |
1289 | 199k | AAMDNodes AATags; |
1290 | 199k | LI->getAAMetadata(AATags); |
1291 | 199k | |
1292 | 199k | SmallPtrSet<BasicBlock*, 8> PredsScanned; |
1293 | 199k | |
1294 | 199k | using AvailablePredsTy = SmallVector<std::pair<BasicBlock *, Value *>, 8>; |
1295 | 199k | |
1296 | 199k | AvailablePredsTy AvailablePreds; |
1297 | 199k | BasicBlock *OneUnavailablePred = nullptr; |
1298 | 199k | SmallVector<LoadInst*, 8> CSELoads; |
1299 | 199k | |
1300 | 199k | // If we got here, the loaded value is transparent through to the start of the |
1301 | 199k | // block. Check to see if it is available in any of the predecessor blocks. |
1302 | 502k | for (BasicBlock *PredBB : predecessors(LoadBB)) { |
1303 | 502k | // If we already scanned this predecessor, skip it. |
1304 | 502k | if (!PredsScanned.insert(PredBB).second) |
1305 | 1.62k | continue; |
1306 | 501k | |
1307 | 501k | BBIt = PredBB->end(); |
1308 | 501k | unsigned NumScanedInst = 0; |
1309 | 501k | Value *PredAvailable = nullptr; |
1310 | 501k | // NOTE: We don't CSE load that is volatile or anything stronger than |
1311 | 501k | // unordered, that should have been checked when we entered the function. |
1312 | 501k | assert(LI->isUnordered() && "Attempting to CSE volatile or atomic loads"); |
1313 | 501k | // If this is a load on a phi pointer, phi-translate it and search |
1314 | 501k | // for available load/store to the pointer in predecessors. |
1315 | 501k | Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB); |
1316 | 501k | PredAvailable = FindAvailablePtrLoadStore( |
1317 | 501k | Ptr, LI->getType(), LI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan, |
1318 | 501k | AA, &IsLoadCSE, &NumScanedInst); |
1319 | 501k | |
1320 | 501k | // If PredBB has a single predecessor, continue scanning through the |
1321 | 501k | // single precessor. |
1322 | 501k | BasicBlock *SinglePredBB = PredBB; |
1323 | 691k | while (!PredAvailable && 691k SinglePredBB689k && BBIt == SinglePredBB->begin()594k && |
1324 | 501k | NumScanedInst < DefMaxInstsToScan209k ) { |
1325 | 190k | SinglePredBB = SinglePredBB->getSinglePredecessor(); |
1326 | 190k | if (SinglePredBB190k ) { |
1327 | 96.0k | BBIt = SinglePredBB->end(); |
1328 | 96.0k | PredAvailable = FindAvailablePtrLoadStore( |
1329 | 96.0k | Ptr, LI->getType(), LI->isAtomic(), SinglePredBB, BBIt, |
1330 | 96.0k | (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE, |
1331 | 96.0k | &NumScanedInst); |
1332 | 96.0k | } |
1333 | 190k | } |
1334 | 501k | |
1335 | 501k | if (!PredAvailable501k ) { |
1336 | 498k | OneUnavailablePred = PredBB; |
1337 | 498k | continue; |
1338 | 498k | } |
1339 | 2.66k | |
1340 | 2.66k | if (2.66k IsLoadCSE2.66k ) |
1341 | 1.70k | CSELoads.push_back(cast<LoadInst>(PredAvailable)); |
1342 | 502k | |
1343 | 502k | // If so, this load is partially redundant. Remember this info so that we |
1344 | 502k | // can create a PHI node. |
1345 | 502k | AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable)); |
1346 | 502k | } |
1347 | 199k | |
1348 | 199k | // If the loaded value isn't available in any predecessor, it isn't partially |
1349 | 199k | // redundant. |
1350 | 199k | if (AvailablePreds.empty()199k ) return false197k ; |
1351 | 2.06k | |
1352 | 2.06k | // Okay, the loaded value is available in at least one (and maybe all!) |
1353 | 2.06k | // predecessors. If the value is unavailable in more than one unique |
1354 | 2.06k | // predecessor, we want to insert a merge block for those common predecessors. |
1355 | 2.06k | // This ensures that we only have to insert one reload, thus not increasing |
1356 | 2.06k | // code size. |
1357 | 2.06k | BasicBlock *UnavailablePred = nullptr; |
1358 | 2.06k | |
1359 | 2.06k | // If there is exactly one predecessor where the value is unavailable, the |
1360 | 2.06k | // already computed 'OneUnavailablePred' block is it. If it ends in an |
1361 | 2.06k | // unconditional branch, we know that it isn't a critical edge. |
1362 | 2.06k | if (PredsScanned.size() == AvailablePreds.size()+1 && |
1363 | 2.06k | OneUnavailablePred->getTerminator()->getNumSuccessors() == 11.05k ) { |
1364 | 651 | UnavailablePred = OneUnavailablePred; |
1365 | 2.06k | } else if (1.41k PredsScanned.size() != AvailablePreds.size()1.41k ) { |
1366 | 1.18k | // Otherwise, we had multiple unavailable predecessors or we had a critical |
1367 | 1.18k | // edge from the one. |
1368 | 1.18k | SmallVector<BasicBlock*, 8> PredsToSplit; |
1369 | 1.18k | SmallPtrSet<BasicBlock*, 8> AvailablePredSet; |
1370 | 1.18k | |
1371 | 1.18k | for (const auto &AvailablePred : AvailablePreds) |
1372 | 1.48k | AvailablePredSet.insert(AvailablePred.first); |
1373 | 1.18k | |
1374 | 1.18k | // Add all the unavailable predecessors to the PredsToSplit list. |
1375 | 4.20k | for (BasicBlock *P : predecessors(LoadBB)) { |
1376 | 4.20k | // If the predecessor is an indirect goto, we can't split the edge. |
1377 | 4.20k | if (isa<IndirectBrInst>(P->getTerminator())) |
1378 | 1 | return false; |
1379 | 4.19k | |
1380 | 4.19k | if (4.19k !AvailablePredSet.count(P)4.19k ) |
1381 | 2.68k | PredsToSplit.push_back(P); |
1382 | 4.20k | } |
1383 | 1.18k | |
1384 | 1.18k | // Split them out to their own block. |
1385 | 1.18k | UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split"); |
1386 | 1.18k | } |
1387 | 2.06k | |
1388 | 2.06k | // If the value isn't available in all predecessors, then there will be |
1389 | 2.06k | // exactly one where it isn't available. Insert a load on that edge and add |
1390 | 2.06k | // it to the AvailablePreds list. |
1391 | 2.06k | if (2.06k UnavailablePred2.06k ) { |
1392 | 1.83k | assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 && |
1393 | 1.83k | "Can't handle critical edge here!"); |
1394 | 1.83k | LoadInst *NewVal = new LoadInst( |
1395 | 1.83k | LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred), |
1396 | 1.83k | LI->getName() + ".pr", false, LI->getAlignment(), LI->getOrdering(), |
1397 | 1.83k | LI->getSyncScopeID(), UnavailablePred->getTerminator()); |
1398 | 1.83k | NewVal->setDebugLoc(LI->getDebugLoc()); |
1399 | 1.83k | if (AATags) |
1400 | 1.69k | NewVal->setAAMetadata(AATags); |
1401 | 1.83k | |
1402 | 1.83k | AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal)); |
1403 | 1.83k | } |
1404 | 2.06k | |
1405 | 2.06k | // Now we know that each predecessor of this block has a value in |
1406 | 2.06k | // AvailablePreds, sort them for efficient access as we're walking the preds. |
1407 | 2.06k | array_pod_sort(AvailablePreds.begin(), AvailablePreds.end()); |
1408 | 2.06k | |
1409 | 2.06k | // Create a PHI node at the start of the block for the PRE'd load value. |
1410 | 2.06k | pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB); |
1411 | 2.06k | PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "", |
1412 | 2.06k | &LoadBB->front()); |
1413 | 2.06k | PN->takeName(LI); |
1414 | 2.06k | PN->setDebugLoc(LI->getDebugLoc()); |
1415 | 2.06k | |
1416 | 2.06k | // Insert new entries into the PHI for each predecessor. A single block may |
1417 | 2.06k | // have multiple entries here. |
1418 | 6.61k | for (pred_iterator PI = PB; PI != PE6.61k ; ++PI4.55k ) { |
1419 | 4.55k | BasicBlock *P = *PI; |
1420 | 4.55k | AvailablePredsTy::iterator I = |
1421 | 4.55k | std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(), |
1422 | 4.55k | std::make_pair(P, (Value*)nullptr)); |
1423 | 4.55k | |
1424 | 4.55k | assert(I != AvailablePreds.end() && I->first == P && |
1425 | 4.55k | "Didn't find entry for predecessor!"); |
1426 | 4.55k | |
1427 | 4.55k | // If we have an available predecessor but it requires casting, insert the |
1428 | 4.55k | // cast in the predecessor and use the cast. Note that we have to update the |
1429 | 4.55k | // AvailablePreds vector as we go so that all of the PHI entries for this |
1430 | 4.55k | // predecessor use the same bitcast. |
1431 | 4.55k | Value *&PredV = I->second; |
1432 | 4.55k | if (PredV->getType() != LI->getType()) |
1433 | 66 | PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "", |
1434 | 66 | P->getTerminator()); |
1435 | 4.55k | |
1436 | 4.55k | PN->addIncoming(PredV, I->first); |
1437 | 4.55k | } |
1438 | 2.06k | |
1439 | 1.70k | for (LoadInst *PredLI : CSELoads) { |
1440 | 1.70k | combineMetadataForCSE(PredLI, LI); |
1441 | 1.70k | } |
1442 | 2.06k | |
1443 | 2.06k | LI->replaceAllUsesWith(PN); |
1444 | 2.06k | LI->eraseFromParent(); |
1445 | 2.06k | |
1446 | 2.06k | return true; |
1447 | 1.78M | } |
1448 | | |
1449 | | /// FindMostPopularDest - The specified list contains multiple possible |
1450 | | /// threadable destinations. Pick the one that occurs the most frequently in |
1451 | | /// the list. |
1452 | | static BasicBlock * |
1453 | | FindMostPopularDest(BasicBlock *BB, |
1454 | | const SmallVectorImpl<std::pair<BasicBlock *, |
1455 | 19.5k | BasicBlock *>> &PredToDestList) { |
1456 | 19.5k | assert(!PredToDestList.empty()); |
1457 | 19.5k | |
1458 | 19.5k | // Determine popularity. If there are multiple possible destinations, we |
1459 | 19.5k | // explicitly choose to ignore 'undef' destinations. We prefer to thread |
1460 | 19.5k | // blocks with known and real destinations to threading undef. We'll handle |
1461 | 19.5k | // them later if interesting. |
1462 | 19.5k | DenseMap<BasicBlock*, unsigned> DestPopularity; |
1463 | 19.5k | for (const auto &PredToDest : PredToDestList) |
1464 | 65.2k | if (65.2k PredToDest.second65.2k ) |
1465 | 65.2k | DestPopularity[PredToDest.second]++; |
1466 | 19.5k | |
1467 | 19.5k | // Find the most popular dest. |
1468 | 19.5k | DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin(); |
1469 | 19.5k | BasicBlock *MostPopularDest = DPI->first; |
1470 | 19.5k | unsigned Popularity = DPI->second; |
1471 | 19.5k | SmallVector<BasicBlock*, 4> SamePopularity; |
1472 | 19.5k | |
1473 | 39.5k | for (++DPI; DPI != DestPopularity.end()39.5k ; ++DPI19.9k ) { |
1474 | 19.9k | // If the popularity of this entry isn't higher than the popularity we've |
1475 | 19.9k | // seen so far, ignore it. |
1476 | 19.9k | if (DPI->second < Popularity) |
1477 | 5.89k | ; // ignore. |
1478 | 14.0k | else if (14.0k DPI->second == Popularity14.0k ) { |
1479 | 9.37k | // If it is the same as what we've seen so far, keep track of it. |
1480 | 9.37k | SamePopularity.push_back(DPI->first); |
1481 | 14.0k | } else { |
1482 | 4.70k | // If it is more popular, remember it. |
1483 | 4.70k | SamePopularity.clear(); |
1484 | 4.70k | MostPopularDest = DPI->first; |
1485 | 4.70k | Popularity = DPI->second; |
1486 | 4.70k | } |
1487 | 19.9k | } |
1488 | 19.5k | |
1489 | 19.5k | // Okay, now we know the most popular destination. If there is more than one |
1490 | 19.5k | // destination, we need to determine one. This is arbitrary, but we need |
1491 | 19.5k | // to make a deterministic decision. Pick the first one that appears in the |
1492 | 19.5k | // successor list. |
1493 | 19.5k | if (!SamePopularity.empty()19.5k ) { |
1494 | 9.16k | SamePopularity.push_back(MostPopularDest); |
1495 | 9.16k | TerminatorInst *TI = BB->getTerminator(); |
1496 | 9.16k | for (unsigned i = 0; ; ++i615 ) { |
1497 | 9.77k | assert(i != TI->getNumSuccessors() && "Didn't find any successor!"); |
1498 | 9.77k | |
1499 | 9.77k | if (!is_contained(SamePopularity, TI->getSuccessor(i))) |
1500 | 615 | continue; |
1501 | 9.16k | |
1502 | 9.16k | MostPopularDest = TI->getSuccessor(i); |
1503 | 9.16k | break; |
1504 | 9.16k | } |
1505 | 9.16k | } |
1506 | 19.5k | |
1507 | 19.5k | // Okay, we have finally picked the most popular destination. |
1508 | 19.5k | return MostPopularDest; |
1509 | 19.5k | } |
1510 | | |
/// ProcessThreadableEdges - Try to exploit knowledge of Cond's value in the
/// predecessors of BB.  If every predecessor with a known value agrees on a
/// single successor, fold BB's terminator to an unconditional branch; else
/// thread the predecessors of the most popular known destination across BB.
/// Returns true if the IR was changed.
bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
                                               ConstantPreference Preference,
                                               Instruction *CxtI) {
  // If threading this would thread across a loop header, don't even try to
  // thread the edge.
  if (LoopHeaders.count(BB))
    return false;

  // Ask the analysis which predecessors give Cond a known constant value.
  PredValueInfoTy PredValues;
  if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference, CxtI))
    return false;

  assert(!PredValues.empty() &&
         "ComputeValueKnownInPredecessors returned true with no values");

  DEBUG(dbgs() << "IN BB: " << *BB;
        for (const auto &PredValue : PredValues) {
          dbgs() << "  BB '" << BB->getName() << "': FOUND condition = "
            << *PredValue.first
            << " for pred '" << PredValue.second->getName() << "'.\n";
        });

  // Decide what we want to thread through.  Convert our list of known values to
  // a list of known destinations for each pred.  This also discards duplicate
  // predecessors and keeps track of the undefined inputs (which are represented
  // as a null dest in the PredToDestList).
  SmallPtrSet<BasicBlock*, 16> SeenPreds;
  SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;

  // OnlyDest/OnlyVal track whether a single destination/value was seen across
  // all predecessors; the all-ones sentinels mean "more than one was seen".
  BasicBlock *OnlyDest = nullptr;
  BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
  Constant *OnlyVal = nullptr;
  Constant *MultipleVal = (Constant *)(intptr_t)~0ULL;

  // Counts every unique predecessor with a known destination, including
  // indirectbr predecessors that cannot actually be rewritten below.
  unsigned PredWithKnownDest = 0;
  for (const auto &PredValue : PredValues) {
    BasicBlock *Pred = PredValue.second;
    if (!SeenPreds.insert(Pred).second)
      continue;  // Duplicate predecessor entry.

    Constant *Val = PredValue.first;

    // Map the known constant to the successor BB's terminator would take.
    BasicBlock *DestBB;
    if (isa<UndefValue>(Val))
      DestBB = nullptr;  // Undef: we get to pick the destination.
    else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
      assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
      DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
      assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
      DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
    } else {
      assert(isa<IndirectBrInst>(BB->getTerminator())
              && "Unexpected terminator");
      assert(isa<BlockAddress>(Val) && "Expecting a constant blockaddress");
      DestBB = cast<BlockAddress>(Val)->getBasicBlock();
    }

    // If we have exactly one destination, remember it for efficiency below.
    if (PredToDestList.empty()) {
      OnlyDest = DestBB;
      OnlyVal = Val;
    } else {
      if (OnlyDest != DestBB)
        OnlyDest = MultipleDestSentinel;
      // It's possible we have the same destination but a different value, e.g.
      // the default case in a switchinst.
      if (Val != OnlyVal)
        OnlyVal = MultipleVal;
    }

    // We know where this predecessor is going.
    ++PredWithKnownDest;

    // If the predecessor ends with an indirect goto, we can't change its
    // destination.
    if (isa<IndirectBrInst>(Pred->getTerminator()))
      continue;

    PredToDestList.push_back(std::make_pair(Pred, DestBB));
  }

  // If all edges were unthreadable, we fail.
  if (PredToDestList.empty())
    return false;

  // If all the predecessors go to a single known successor, we want to fold,
  // not thread. By doing so, we do not need to duplicate the current block and
  // also miss potential opportunities in case we dont/cant duplicate.
  if (OnlyDest && OnlyDest != MultipleDestSentinel) {
    if (PredWithKnownDest ==
        (size_t)std::distance(pred_begin(BB), pred_end(BB))) {
      std::vector<DominatorTree::UpdateType> Updates;
      bool SeenFirstBranchToOnlyDest = false;
      for (BasicBlock *SuccBB : successors(BB)) {
        if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) {
          SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
        } else {
          SuccBB->removePredecessor(BB, true); // This is unreachable successor.
          if (SuccBB != OnlyDest && SuccBB != BB) {
            DominatorTree::UpdateType UT = {DominatorTree::Delete, BB, SuccBB};
            // Make sure to remove a DT edge exactly once.
            if (std::find(Updates.begin(), Updates.end(), UT) == Updates.end())
              Updates.push_back(UT);
          }
        }
      }

      // Finally update the terminator.
      TerminatorInst *Term = BB->getTerminator();
      BranchInst::Create(OnlyDest, Term);
      Term->eraseFromParent();
      DT->applyUpdates(Updates);

      // If the condition is now dead due to the removal of the old terminator,
      // erase it.
      if (auto *CondInst = dyn_cast<Instruction>(Cond)) {
        if (CondInst->use_empty() && !CondInst->mayHaveSideEffects())
          CondInst->eraseFromParent();
        // We can safely replace *some* uses of the CondInst if it has
        // exactly one value as returned by LVI. RAUW is incorrect in the
        // presence of guards and assumes, that have the `Cond` as the use. This
        // is because we use the guards/assume to reason about the `Cond` value
        // at the end of block, but RAUW unconditionally replaces all uses
        // including the guards/assumes themselves and the uses before the
        // guard/assume.
        else if (OnlyVal && OnlyVal != MultipleVal &&
                 CondInst->getParent() == BB)
          ReplaceFoldableUses(CondInst, OnlyVal);
      }
      return true;
    }
  }

  // Determine which is the most common successor.  If we have many inputs and
  // this block is a switch, we want to start by threading the batch that goes
  // to the most popular destination first.  If we only know about one
  // threadable destination (the common case) we can avoid this.
  BasicBlock *MostPopularDest = OnlyDest;

  if (MostPopularDest == MultipleDestSentinel)
    MostPopularDest = FindMostPopularDest(BB, PredToDestList);

  // Now that we know what the most popular destination is, factor all
  // predecessors that will jump to it into a single predecessor.
  SmallVector<BasicBlock*, 16> PredsToFactor;
  for (const auto &PredToDest : PredToDestList)
    if (PredToDest.second == MostPopularDest) {
      BasicBlock *Pred = PredToDest.first;

      // This predecessor may be a switch or something else that has multiple
      // edges to the block.  Factor each of these edges by listing them
      // according to # occurrences in PredsToFactor.
      for (BasicBlock *Succ : successors(Pred))
        if (Succ == BB)
          PredsToFactor.push_back(Pred);
    }

  // If the threadable edges are branching on an undefined value, we get to pick
  // the destination that these predecessors should get to.
  if (!MostPopularDest)
    MostPopularDest = BB->getTerminator()->
                        getSuccessor(GetBestDestForJumpOnUndef(BB));

  // Ok, try to thread it!
  return ThreadEdge(BB, PredsToFactor, MostPopularDest);
}
1678 | | |
1679 | | /// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on |
1680 | | /// a PHI node in the current block. See if there are any simplifications we |
1681 | | /// can do based on inputs to the phi node. |
1682 | 12.1k | bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) { |
1683 | 12.1k | BasicBlock *BB = PN->getParent(); |
1684 | 12.1k | |
1685 | 12.1k | // TODO: We could make use of this to do it once for blocks with common PHI |
1686 | 12.1k | // values. |
1687 | 12.1k | SmallVector<BasicBlock*, 1> PredBBs; |
1688 | 12.1k | PredBBs.resize(1); |
1689 | 12.1k | |
1690 | 12.1k | // If any of the predecessor blocks end in an unconditional branch, we can |
1691 | 12.1k | // *duplicate* the conditional branch into that block in order to further |
1692 | 12.1k | // encourage jump threading and to eliminate cases where we have branch on a |
1693 | 12.1k | // phi of an icmp (branch on icmp is much better). |
1694 | 35.4k | for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e35.4k ; ++i23.3k ) { |
1695 | 23.7k | BasicBlock *PredBB = PN->getIncomingBlock(i); |
1696 | 23.7k | if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator())) |
1697 | 23.2k | if (23.2k PredBr->isUnconditional()23.2k ) { |
1698 | 10.4k | PredBBs[0] = PredBB; |
1699 | 10.4k | // Try to duplicate BB into PredBB. |
1700 | 10.4k | if (DuplicateCondBranchOnPHIIntoPred(BB, PredBBs)) |
1701 | 438 | return true; |
1702 | 23.2k | } |
1703 | 23.7k | } |
1704 | 12.1k | |
1705 | 11.7k | return false; |
1706 | 12.1k | } |
1707 | | |
/// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on
/// a xor instruction in the current block.  See if there are any
/// simplifications we can do based on inputs to the xor.  Returns true if the
/// IR was changed.
bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) {
  BasicBlock *BB = BO->getParent();

  // If either the LHS or RHS of the xor is a constant, don't do this
  // optimization.
  if (isa<ConstantInt>(BO->getOperand(0)) ||
      isa<ConstantInt>(BO->getOperand(1)))
    return false;

  // If the first instruction in BB isn't a phi, we won't be able to infer
  // anything special about any particular predecessor.
  if (!isa<PHINode>(BB->front()))
    return false;

  // If this BB is a landing pad, we won't be able to split the edge into it.
  if (BB->isEHPad())
    return false;

  // If we have a xor as the branch input to this block, and we know that the
  // LHS or RHS of the xor in any predecessor is true/false, then we can clone
  // the condition into the predecessor and fix that value to true, saving some
  // logical ops on that path and encouraging other paths to simplify.
  //
  // This copies something like this:
  //
  //  BB:
  //    %X = phi i1 [1],  [%X']
  //    %Y = icmp eq i32 %A, %B
  //    %Z = xor i1 %X, %Y
  //    br i1 %Z, ...
  //
  // Into:
  //  BB':
  //    %Y = icmp ne i32 %A, %B
  //    br i1 %Y, ...

  // Try the LHS of the xor first; if nothing is known about it in any
  // predecessor, fall back to the RHS.
  PredValueInfoTy XorOpValues;
  bool isLHS = true;
  if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
                                       WantInteger, BO)) {
    assert(XorOpValues.empty());
    if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
                                         WantInteger, BO))
      return false;
    isLHS = false;
  }

  assert(!XorOpValues.empty() &&
         "ComputeValueKnownInPredecessors returned true with no values");

  // Scan the information to see which is most popular: true or false.  The
  // predecessors can be of the set true, false, or undef.
  unsigned NumTrue = 0, NumFalse = 0;
  for (const auto &XorOpValue : XorOpValues) {
    if (isa<UndefValue>(XorOpValue.first))
      // Ignore undefs for the count.
      continue;
    if (cast<ConstantInt>(XorOpValue.first)->isZero())
      ++NumFalse;
    else
      ++NumTrue;
  }

  // Determine which value to split on, true, false, or undef if neither.
  // SplitVal stays null only when every known value was undef.
  ConstantInt *SplitVal = nullptr;
  if (NumTrue > NumFalse)
    SplitVal = ConstantInt::getTrue(BB->getContext());
  else if (NumTrue != 0 || NumFalse != 0)
    SplitVal = ConstantInt::getFalse(BB->getContext());

  // Collect all of the blocks that this can be folded into so that we can
  // factor this once and clone it once.  Undef-valued predecessors always
  // qualify since we may pick any value for them.
  SmallVector<BasicBlock*, 8> BlocksToFoldInto;
  for (const auto &XorOpValue : XorOpValues) {
    if (XorOpValue.first != SplitVal && !isa<UndefValue>(XorOpValue.first))
      continue;

    BlocksToFoldInto.push_back(XorOpValue.second);
  }

  // If we inferred a value for all of the predecessors, then duplication won't
  // help us.  However, we can just replace the LHS or RHS with the constant.
  if (BlocksToFoldInto.size() ==
        cast<PHINode>(BB->front()).getNumIncomingValues()) {
    if (!SplitVal) {
      // If all preds provide undef, just nuke the xor, because it is undef too.
      BO->replaceAllUsesWith(UndefValue::get(BO->getType()));
      BO->eraseFromParent();
    } else if (SplitVal->isZero()) {
      // If all preds provide 0, replace the xor with the other input.
      // Note: isLHS indexes the *other* operand here (true -> operand 1).
      BO->replaceAllUsesWith(BO->getOperand(isLHS));
      BO->eraseFromParent();
    } else {
      // If all preds provide 1, set the computed value to 1.
      BO->setOperand(!isLHS, SplitVal);
    }

    return true;
  }

  // Try to duplicate BB into PredBB.
  return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
}
1814 | | |
1815 | | /// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new |
1816 | | /// predecessor to the PHIBB block. If it has PHI nodes, add entries for |
1817 | | /// NewPred using the entries from OldPred (suitably mapped). |
1818 | | static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, |
1819 | | BasicBlock *OldPred, |
1820 | | BasicBlock *NewPred, |
1821 | 43.7k | DenseMap<Instruction*, Value*> &ValueMap) { |
1822 | 43.7k | for (BasicBlock::iterator PNI = PHIBB->begin(); |
1823 | 65.2k | PHINode *PN65.2k = dyn_cast<PHINode>(PNI); ++PNI21.4k ) { |
1824 | 21.4k | // Ok, we have a PHI node. Figure out what the incoming value was for the |
1825 | 21.4k | // DestBlock. |
1826 | 21.4k | Value *IV = PN->getIncomingValueForBlock(OldPred); |
1827 | 21.4k | |
1828 | 21.4k | // Remap the value if necessary. |
1829 | 21.4k | if (Instruction *Inst21.4k = dyn_cast<Instruction>(IV)) { |
1830 | 11.5k | DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst); |
1831 | 11.5k | if (I != ValueMap.end()) |
1832 | 7.72k | IV = I->second; |
1833 | 11.5k | } |
1834 | 21.4k | |
1835 | 21.4k | PN->addIncoming(IV, NewPred); |
1836 | 21.4k | } |
1837 | 43.7k | } |
1838 | | |
1839 | | /// ThreadEdge - We have decided that it is safe and profitable to factor the |
1840 | | /// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB |
1841 | | /// across BB. Transform the IR to reflect this change. |
1842 | | bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, |
1843 | | const SmallVectorImpl<BasicBlock *> &PredBBs, |
1844 | 61.0k | BasicBlock *SuccBB) { |
1845 | 61.0k | // If threading to the same block as we come from, we would infinite loop. |
1846 | 61.0k | if (SuccBB == BB61.0k ) { |
1847 | 0 | DEBUG(dbgs() << " Not threading across BB '" << BB->getName() |
1848 | 0 | << "' - would thread to self!\n"); |
1849 | 0 | return false; |
1850 | 0 | } |
1851 | 61.0k | |
1852 | 61.0k | // If threading this would thread across a loop header, don't thread the edge. |
1853 | 61.0k | // See the comments above FindLoopHeaders for justifications and caveats. |
1854 | 61.0k | if (61.0k LoopHeaders.count(BB) || 61.0k LoopHeaders.count(SuccBB)61.0k ) { |
1855 | 1.82k | DEBUG({ |
1856 | 1.82k | bool BBIsHeader = LoopHeaders.count(BB); |
1857 | 1.82k | bool SuccIsHeader = LoopHeaders.count(SuccBB); |
1858 | 1.82k | dbgs() << " Not threading across " |
1859 | 1.82k | << (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName() |
1860 | 1.82k | << "' to dest " << (SuccIsHeader ? "loop header BB '" : "block BB '") |
1861 | 1.82k | << SuccBB->getName() << "' - it might create an irreducible loop!\n"; |
1862 | 1.82k | }); |
1863 | 1.82k | return false; |
1864 | 1.82k | } |
1865 | 59.2k | |
1866 | 59.2k | unsigned JumpThreadCost = |
1867 | 59.2k | getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold); |
1868 | 59.2k | if (JumpThreadCost > BBDupThreshold59.2k ) { |
1869 | 16.4k | DEBUG(dbgs() << " Not threading BB '" << BB->getName() |
1870 | 16.4k | << "' - Cost is too high: " << JumpThreadCost << "\n"); |
1871 | 16.4k | return false; |
1872 | 16.4k | } |
1873 | 42.8k | |
1874 | 42.8k | // And finally, do it! Start by factoring the predecessors if needed. |
1875 | 42.8k | BasicBlock *PredBB; |
1876 | 42.8k | if (PredBBs.size() == 1) |
1877 | 28.8k | PredBB = PredBBs[0]; |
1878 | 13.9k | else { |
1879 | 13.9k | DEBUG(dbgs() << " Factoring out " << PredBBs.size() |
1880 | 13.9k | << " common predecessors.\n"); |
1881 | 13.9k | PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm"); |
1882 | 13.9k | } |
1883 | 42.8k | |
1884 | 42.8k | // And finally, do it! |
1885 | 42.8k | DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '" |
1886 | 42.8k | << SuccBB->getName() << "' with cost: " << JumpThreadCost |
1887 | 42.8k | << ", across block:\n " |
1888 | 42.8k | << *BB << "\n"); |
1889 | 42.8k | |
1890 | 42.8k | LVI->threadEdge(PredBB, BB, SuccBB); |
1891 | 42.8k | |
1892 | 42.8k | // We are going to have to map operands from the original BB block to the new |
1893 | 42.8k | // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to |
1894 | 42.8k | // account for entry from PredBB. |
1895 | 42.8k | DenseMap<Instruction*, Value*> ValueMapping; |
1896 | 42.8k | |
1897 | 42.8k | BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), |
1898 | 42.8k | BB->getName()+".thread", |
1899 | 42.8k | BB->getParent(), BB); |
1900 | 42.8k | NewBB->moveAfter(PredBB); |
1901 | 42.8k | |
1902 | 42.8k | // Set the block frequency of NewBB. |
1903 | 42.8k | if (HasProfileData42.8k ) { |
1904 | 3 | auto NewBBFreq = |
1905 | 3 | BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB); |
1906 | 3 | BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); |
1907 | 3 | } |
1908 | 42.8k | |
1909 | 42.8k | BasicBlock::iterator BI = BB->begin(); |
1910 | 95.4k | for (; PHINode *PN95.4k = dyn_cast<PHINode>(BI); ++BI52.5k ) |
1911 | 52.5k | ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); |
1912 | 42.8k | |
1913 | 42.8k | // Clone the non-phi instructions of BB into NewBB, keeping track of the |
1914 | 42.8k | // mapping and using it to remap operands in the cloned instructions. |
1915 | 89.6k | for (; !isa<TerminatorInst>(BI)89.6k ; ++BI46.8k ) { |
1916 | 46.8k | Instruction *New = BI->clone(); |
1917 | 46.8k | New->setName(BI->getName()); |
1918 | 46.8k | NewBB->getInstList().push_back(New); |
1919 | 46.8k | ValueMapping[&*BI] = New; |
1920 | 46.8k | |
1921 | 46.8k | // Remap operands to patch up intra-block references. |
1922 | 141k | for (unsigned i = 0, e = New->getNumOperands(); i != e141k ; ++i95.0k ) |
1923 | 95.0k | if (Instruction *95.0k Inst95.0k = dyn_cast<Instruction>(New->getOperand(i))) { |
1924 | 53.9k | DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst); |
1925 | 53.9k | if (I != ValueMapping.end()) |
1926 | 35.8k | New->setOperand(i, I->second); |
1927 | 95.0k | } |
1928 | 46.8k | } |
1929 | 42.8k | |
1930 | 42.8k | // We didn't copy the terminator from BB over to NewBB, because there is now |
1931 | 42.8k | // an unconditional jump to SuccBB. Insert the unconditional jump. |
1932 | 42.8k | BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB); |
1933 | 42.8k | NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc()); |
1934 | 42.8k | |
1935 | 42.8k | // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the |
1936 | 42.8k | // PHI nodes for NewBB now. |
1937 | 42.8k | AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping); |
1938 | 42.8k | |
1939 | 42.8k | // If there were values defined in BB that are used outside the block, then we |
1940 | 42.8k | // now have to update all uses of the value to use either the original value, |
1941 | 42.8k | // the cloned value, or some PHI derived value. This can require arbitrary |
1942 | 42.8k | // PHI insertion, of which we are prepared to do, clean these up now. |
1943 | 42.8k | SSAUpdater SSAUpdate; |
1944 | 42.8k | SmallVector<Use*, 16> UsesToRename; |
1945 | 142k | for (Instruction &I : *BB) { |
1946 | 142k | // Scan all uses of this instruction to see if it is used outside of its |
1947 | 142k | // block, and if so, record them in UsesToRename. |
1948 | 157k | for (Use &U : I.uses()) { |
1949 | 157k | Instruction *User = cast<Instruction>(U.getUser()); |
1950 | 157k | if (PHINode *UserPN157k = dyn_cast<PHINode>(User)) { |
1951 | 35.7k | if (UserPN->getIncomingBlock(U) == BB) |
1952 | 17.2k | continue; |
1953 | 121k | } else if (121k User->getParent() == BB121k ) |
1954 | 74.5k | continue; |
1955 | 65.3k | |
1956 | 65.3k | UsesToRename.push_back(&U); |
1957 | 65.3k | } |
1958 | 142k | // If there are no uses outside the block, we're done with this instruction. |
1959 | 142k | if (UsesToRename.empty()) |
1960 | 113k | continue; |
1961 | 28.5k | |
1962 | 28.5k | DEBUG28.5k (dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); |
1963 | 28.5k | |
1964 | 28.5k | // We found a use of I outside of BB. Rename all uses of I that are outside |
1965 | 28.5k | // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks |
1966 | 28.5k | // with the two values we know. |
1967 | 28.5k | SSAUpdate.Initialize(I.getType(), I.getName()); |
1968 | 28.5k | SSAUpdate.AddAvailableValue(BB, &I); |
1969 | 28.5k | SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]); |
1970 | 28.5k | |
1971 | 93.9k | while (!UsesToRename.empty()) |
1972 | 65.3k | SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); |
1973 | 28.5k | DEBUG(dbgs() << "\n"); |
1974 | 142k | } |
1975 | 42.8k | |
1976 | 42.8k | // Ok, NewBB is good to go. Update the terminator of PredBB to jump to |
1977 | 42.8k | // NewBB instead of BB. This eliminates predecessors from BB, which requires |
1978 | 42.8k | // us to simplify any PHI nodes in BB. |
1979 | 42.8k | TerminatorInst *PredTerm = PredBB->getTerminator(); |
1980 | 104k | for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e104k ; ++i61.5k ) |
1981 | 61.5k | if (61.5k PredTerm->getSuccessor(i) == BB61.5k ) { |
1982 | 42.8k | BB->removePredecessor(PredBB, true); |
1983 | 42.8k | PredTerm->setSuccessor(i, NewBB); |
1984 | 42.8k | } |
1985 | 61.0k | |
1986 | 61.0k | DT->applyUpdates({{DominatorTree::Insert, NewBB, SuccBB}, |
1987 | 61.0k | {DominatorTree::Insert, PredBB, NewBB}, |
1988 | 61.0k | {DominatorTree::Delete, PredBB, BB}}); |
1989 | 61.0k | |
1990 | 61.0k | // At this point, the IR is fully up to date and consistent. Do a quick scan |
1991 | 61.0k | // over the new instructions and zap any that are constants or dead. This |
1992 | 61.0k | // frequently happens because of phi translation. |
1993 | 61.0k | SimplifyInstructionsInBlock(NewBB, TLI); |
1994 | 61.0k | |
1995 | 61.0k | // Update the edge weight from BB to SuccBB, which should be less than before. |
1996 | 61.0k | UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB); |
1997 | 61.0k | |
1998 | 61.0k | // Threaded an edge! |
1999 | 61.0k | ++NumThreads; |
2000 | 61.0k | return true; |
2001 | 61.0k | } |
2002 | | |
2003 | | /// Create a new basic block that will be the predecessor of BB and successor of |
2004 | | /// all blocks in Preds. When profile data is available, update the frequency of |
2005 | | /// this new block. |
2006 | | BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB, |
2007 | | ArrayRef<BasicBlock *> Preds, |
2008 | 15.1k | const char *Suffix) { |
2009 | 15.1k | // Collect the frequencies of all predecessors of BB, which will be used to |
2010 | 15.1k | // update the edge weight on BB->SuccBB. |
2011 | 15.1k | BlockFrequency PredBBFreq(0); |
2012 | 15.1k | if (HasProfileData) |
2013 | 1 | for (auto Pred : Preds) |
2014 | 2 | PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB); |
2015 | 15.1k | |
2016 | 15.1k | BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix, DT); |
2017 | 15.1k | |
2018 | 15.1k | // Set the block frequency of the newly created PredBB, which is the sum of |
2019 | 15.1k | // frequencies of Preds. |
2020 | 15.1k | if (HasProfileData) |
2021 | 1 | BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency()); |
2022 | 15.1k | return PredBB; |
2023 | 15.1k | } |
2024 | | |
2025 | 3 | bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) { |
2026 | 3 | const TerminatorInst *TI = BB->getTerminator(); |
2027 | 3 | assert(TI->getNumSuccessors() > 1 && "not a split"); |
2028 | 3 | |
2029 | 3 | MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof); |
2030 | 3 | if (!WeightsNode) |
2031 | 2 | return false; |
2032 | 1 | |
2033 | 1 | MDString *MDName = cast<MDString>(WeightsNode->getOperand(0)); |
2034 | 1 | if (MDName->getString() != "branch_weights") |
2035 | 0 | return false; |
2036 | 1 | |
2037 | 1 | // Ensure there are weights for all of the successors. Note that the first |
2038 | 1 | // operand to the metadata node is a name, not a weight. |
2039 | 1 | return WeightsNode->getNumOperands() == TI->getNumSuccessors() + 1; |
2040 | 1 | } |
2041 | | |
2042 | | /// Update the block frequency of BB and branch weight and the metadata on the |
2043 | | /// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 - |
2044 | | /// Freq(PredBB->BB) / Freq(BB->SuccBB). |
2045 | | void JumpThreadingPass::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, |
2046 | | BasicBlock *BB, |
2047 | | BasicBlock *NewBB, |
2048 | 42.8k | BasicBlock *SuccBB) { |
2049 | 42.8k | if (!HasProfileData) |
2050 | 42.8k | return; |
2051 | 3 | |
2052 | 42.8k | assert(BFI && BPI && "BFI & BPI should have been created here"); |
2053 | 3 | |
2054 | 3 | // As the edge from PredBB to BB is deleted, we have to update the block |
2055 | 3 | // frequency of BB. |
2056 | 3 | auto BBOrigFreq = BFI->getBlockFreq(BB); |
2057 | 3 | auto NewBBFreq = BFI->getBlockFreq(NewBB); |
2058 | 3 | auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB); |
2059 | 3 | auto BBNewFreq = BBOrigFreq - NewBBFreq; |
2060 | 3 | BFI->setBlockFreq(BB, BBNewFreq.getFrequency()); |
2061 | 3 | |
2062 | 3 | // Collect updated outgoing edges' frequencies from BB and use them to update |
2063 | 3 | // edge probabilities. |
2064 | 3 | SmallVector<uint64_t, 4> BBSuccFreq; |
2065 | 6 | for (BasicBlock *Succ : successors(BB)) { |
2066 | 6 | auto SuccFreq = (Succ == SuccBB) |
2067 | 3 | ? BB2SuccBBFreq - NewBBFreq |
2068 | 3 | : BBOrigFreq * BPI->getEdgeProbability(BB, Succ); |
2069 | 6 | BBSuccFreq.push_back(SuccFreq.getFrequency()); |
2070 | 6 | } |
2071 | 3 | |
2072 | 3 | uint64_t MaxBBSuccFreq = |
2073 | 3 | *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end()); |
2074 | 3 | |
2075 | 3 | SmallVector<BranchProbability, 4> BBSuccProbs; |
2076 | 3 | if (MaxBBSuccFreq == 0) |
2077 | 1 | BBSuccProbs.assign(BBSuccFreq.size(), |
2078 | 1 | {1, static_cast<unsigned>(BBSuccFreq.size())}); |
2079 | 2 | else { |
2080 | 2 | for (uint64_t Freq : BBSuccFreq) |
2081 | 4 | BBSuccProbs.push_back( |
2082 | 4 | BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq)); |
2083 | 2 | // Normalize edge probabilities so that they sum up to one. |
2084 | 2 | BranchProbability::normalizeProbabilities(BBSuccProbs.begin(), |
2085 | 2 | BBSuccProbs.end()); |
2086 | 2 | } |
2087 | 3 | |
2088 | 3 | // Update edge probabilities in BPI. |
2089 | 9 | for (int I = 0, E = BBSuccProbs.size(); I < E9 ; I++6 ) |
2090 | 6 | BPI->setEdgeProbability(BB, I, BBSuccProbs[I]); |
2091 | 3 | |
2092 | 3 | // Update the profile metadata as well. |
2093 | 3 | // |
2094 | 3 | // Don't do this if the profile of the transformed blocks was statically |
2095 | 3 | // estimated. (This could occur despite the function having an entry |
2096 | 3 | // frequency in completely cold parts of the CFG.) |
2097 | 3 | // |
2098 | 3 | // In this case we don't want to suggest to subsequent passes that the |
2099 | 3 | // calculated weights are fully consistent. Consider this graph: |
2100 | 3 | // |
2101 | 3 | // check_1 |
2102 | 3 | // 50% / | |
2103 | 3 | // eq_1 | 50% |
2104 | 3 | // \ | |
2105 | 3 | // check_2 |
2106 | 3 | // 50% / | |
2107 | 3 | // eq_2 | 50% |
2108 | 3 | // \ | |
2109 | 3 | // check_3 |
2110 | 3 | // 50% / | |
2111 | 3 | // eq_3 | 50% |
2112 | 3 | // \ | |
2113 | 3 | // |
2114 | 3 | // Assuming the blocks check_* all compare the same value against 1, 2 and 3, |
2115 | 3 | // the overall probabilities are inconsistent; the total probability that the |
2116 | 3 | // value is either 1, 2 or 3 is 150%. |
2117 | 3 | // |
2118 | 3 | // As a consequence if we thread eq_1 -> check_2 to check_3, check_2->check_3 |
2119 | 3 | // becomes 0%. This is even worse if the edge whose probability becomes 0% is |
2120 | 3 | // the loop exit edge. Then based solely on static estimation we would assume |
2121 | 3 | // the loop was extremely hot. |
2122 | 3 | // |
2123 | 3 | // FIXME this locally as well so that BPI and BFI are consistent as well. We |
2124 | 3 | // shouldn't make edges extremely likely or unlikely based solely on static |
2125 | 3 | // estimation. |
2126 | 3 | if (BBSuccProbs.size() >= 2 && 3 doesBlockHaveProfileData(BB)3 ) { |
2127 | 1 | SmallVector<uint32_t, 4> Weights; |
2128 | 1 | for (auto Prob : BBSuccProbs) |
2129 | 2 | Weights.push_back(Prob.getNumerator()); |
2130 | 1 | |
2131 | 1 | auto TI = BB->getTerminator(); |
2132 | 1 | TI->setMetadata( |
2133 | 1 | LLVMContext::MD_prof, |
2134 | 1 | MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights)); |
2135 | 1 | } |
2136 | 42.8k | } |
2137 | | |
2138 | | /// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch |
2139 | | /// to BB which contains an i1 PHI node and a conditional branch on that PHI. |
2140 | | /// If we can duplicate the contents of BB up into PredBB do so now, this |
2141 | | /// improves the odds that the branch will be on an analyzable instruction like |
2142 | | /// a compare. |
2143 | | bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( |
2144 | 10.4k | BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs) { |
2145 | 10.4k | assert(!PredBBs.empty() && "Can't handle an empty set"); |
2146 | 10.4k | |
2147 | 10.4k | // If BB is a loop header, then duplicating this block outside the loop would |
2148 | 10.4k | // cause us to transform this into an irreducible loop, don't do this. |
2149 | 10.4k | // See the comments above FindLoopHeaders for justifications and caveats. |
2150 | 10.4k | if (LoopHeaders.count(BB)10.4k ) { |
2151 | 8.96k | DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName() |
2152 | 8.96k | << "' into predecessor block '" << PredBBs[0]->getName() |
2153 | 8.96k | << "' - it might create an irreducible loop!\n"); |
2154 | 8.96k | return false; |
2155 | 8.96k | } |
2156 | 1.53k | |
2157 | 1.53k | unsigned DuplicationCost = |
2158 | 1.53k | getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold); |
2159 | 1.53k | if (DuplicationCost > BBDupThreshold1.53k ) { |
2160 | 1.08k | DEBUG(dbgs() << " Not duplicating BB '" << BB->getName() |
2161 | 1.08k | << "' - Cost is too high: " << DuplicationCost << "\n"); |
2162 | 1.08k | return false; |
2163 | 1.08k | } |
2164 | 443 | |
2165 | 443 | // And finally, do it! Start by factoring the predecessors if needed. |
2166 | 443 | BasicBlock *PredBB; |
2167 | 443 | if (PredBBs.size() == 1) |
2168 | 440 | PredBB = PredBBs[0]; |
2169 | 3 | else { |
2170 | 3 | DEBUG(dbgs() << " Factoring out " << PredBBs.size() |
2171 | 3 | << " common predecessors.\n"); |
2172 | 3 | PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm"); |
2173 | 3 | } |
2174 | 443 | |
2175 | 443 | // Okay, we decided to do this! Clone all the instructions in BB onto the end |
2176 | 443 | // of PredBB. |
2177 | 443 | DEBUG(dbgs() << " Duplicating block '" << BB->getName() << "' into end of '" |
2178 | 443 | << PredBB->getName() << "' to eliminate branch on phi. Cost: " |
2179 | 443 | << DuplicationCost << " block is:" << *BB << "\n"); |
2180 | 443 | |
2181 | 443 | // Unless PredBB ends with an unconditional branch, split the edge so that we |
2182 | 443 | // can just clone the bits from BB into the end of the new PredBB. |
2183 | 443 | BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator()); |
2184 | 443 | |
2185 | 443 | if (!OldPredBranch || 443 !OldPredBranch->isUnconditional()442 ) { |
2186 | 2 | PredBB = SplitEdge(PredBB, BB, DT); |
2187 | 2 | OldPredBranch = cast<BranchInst>(PredBB->getTerminator()); |
2188 | 2 | } |
2189 | 443 | |
2190 | 443 | // We are going to have to map operands from the original BB block into the |
2191 | 443 | // PredBB block. Evaluate PHI nodes in BB. |
2192 | 443 | DenseMap<Instruction*, Value*> ValueMapping; |
2193 | 443 | |
2194 | 443 | BasicBlock::iterator BI = BB->begin(); |
2195 | 1.74k | for (; PHINode *PN1.74k = dyn_cast<PHINode>(BI); ++BI1.30k ) |
2196 | 1.30k | ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); |
2197 | 443 | // Clone the non-phi instructions of BB into PredBB, keeping track of the |
2198 | 443 | // mapping and using it to remap operands in the cloned instructions. |
2199 | 999 | for (; BI != BB->end()999 ; ++BI556 ) { |
2200 | 556 | Instruction *New = BI->clone(); |
2201 | 556 | |
2202 | 556 | // Remap operands to patch up intra-block references. |
2203 | 2.08k | for (unsigned i = 0, e = New->getNumOperands(); i != e2.08k ; ++i1.52k ) |
2204 | 1.52k | if (Instruction *1.52k Inst1.52k = dyn_cast<Instruction>(New->getOperand(i))) { |
2205 | 571 | DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst); |
2206 | 571 | if (I != ValueMapping.end()) |
2207 | 539 | New->setOperand(i, I->second); |
2208 | 1.52k | } |
2209 | 556 | |
2210 | 556 | // If this instruction can be simplified after the operands are updated, |
2211 | 556 | // just use the simplified value instead. This frequently happens due to |
2212 | 556 | // phi translation. |
2213 | 556 | if (Value *IV = SimplifyInstruction( |
2214 | 556 | New, |
2215 | 6 | {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) { |
2216 | 6 | ValueMapping[&*BI] = IV; |
2217 | 6 | if (!New->mayHaveSideEffects()6 ) { |
2218 | 6 | New->deleteValue(); |
2219 | 6 | New = nullptr; |
2220 | 6 | } |
2221 | 556 | } else { |
2222 | 550 | ValueMapping[&*BI] = New; |
2223 | 550 | } |
2224 | 556 | if (New556 ) { |
2225 | 550 | // Otherwise, insert the new instruction into the block. |
2226 | 550 | New->setName(BI->getName()); |
2227 | 550 | PredBB->getInstList().insert(OldPredBranch->getIterator(), New); |
2228 | 550 | } |
2229 | 556 | } |
2230 | 443 | |
2231 | 443 | // Check to see if the targets of the branch had PHI nodes. If so, we need to |
2232 | 443 | // add entries to the PHI nodes for branch from PredBB now. |
2233 | 443 | BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator()); |
2234 | 443 | AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB, |
2235 | 443 | ValueMapping); |
2236 | 443 | AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB, |
2237 | 443 | ValueMapping); |
2238 | 443 | |
2239 | 443 | // If there were values defined in BB that are used outside the block, then we |
2240 | 443 | // now have to update all uses of the value to use either the original value, |
2241 | 443 | // the cloned value, or some PHI derived value. This can require arbitrary |
2242 | 443 | // PHI insertion, of which we are prepared to do, clean these up now. |
2243 | 443 | SSAUpdater SSAUpdate; |
2244 | 443 | SmallVector<Use*, 16> UsesToRename; |
2245 | 1.86k | for (Instruction &I : *BB) { |
2246 | 1.86k | // Scan all uses of this instruction to see if it is used outside of its |
2247 | 1.86k | // block, and if so, record them in UsesToRename. |
2248 | 2.23k | for (Use &U : I.uses()) { |
2249 | 2.23k | Instruction *User = cast<Instruction>(U.getUser()); |
2250 | 2.23k | if (PHINode *UserPN2.23k = dyn_cast<PHINode>(User)) { |
2251 | 1.34k | if (UserPN->getIncomingBlock(U) == BB) |
2252 | 1.07k | continue; |
2253 | 887 | } else if (887 User->getParent() == BB887 ) |
2254 | 539 | continue; |
2255 | 620 | |
2256 | 620 | UsesToRename.push_back(&U); |
2257 | 620 | } |
2258 | 1.86k | |
2259 | 1.86k | // If there are no uses outside the block, we're done with this instruction. |
2260 | 1.86k | if (UsesToRename.empty()) |
2261 | 1.48k | continue; |
2262 | 375 | |
2263 | 375 | DEBUG375 (dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); |
2264 | 375 | |
2265 | 375 | // We found a use of I outside of BB. Rename all uses of I that are outside |
2266 | 375 | // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks |
2267 | 375 | // with the two values we know. |
2268 | 375 | SSAUpdate.Initialize(I.getType(), I.getName()); |
2269 | 375 | SSAUpdate.AddAvailableValue(BB, &I); |
2270 | 375 | SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&I]); |
2271 | 375 | |
2272 | 995 | while (!UsesToRename.empty()) |
2273 | 620 | SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); |
2274 | 375 | DEBUG(dbgs() << "\n"); |
2275 | 1.86k | } |
2276 | 443 | |
2277 | 443 | // PredBB no longer jumps to BB, remove entries in the PHI node for the edge |
2278 | 443 | // that we nuked. |
2279 | 443 | BB->removePredecessor(PredBB, true); |
2280 | 443 | |
2281 | 443 | // Remove the unconditional branch at the end of the PredBB block. |
2282 | 443 | OldPredBranch->eraseFromParent(); |
2283 | 443 | if (BB != PredBB) |
2284 | 443 | DT->deleteEdge(PredBB, BB); |
2285 | 10.4k | |
2286 | 10.4k | ++NumDupes; |
2287 | 10.4k | return true; |
2288 | 10.4k | } |
2289 | | |
2290 | | /// TryToUnfoldSelect - Look for blocks of the form |
2291 | | /// bb1: |
2292 | | /// %a = select |
2293 | | /// br bb2 |
2294 | | /// |
2295 | | /// bb2: |
2296 | | /// %p = phi [%a, %bb1] ... |
2297 | | /// %c = icmp %p |
2298 | | /// br i1 %c |
2299 | | /// |
2300 | | /// And expand the select into a branch structure if one of its arms allows %c |
2301 | | /// to be folded. This later enables threading from bb1 over bb2. |
2302 | 4.25M | bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { |
2303 | 4.25M | BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); |
2304 | 4.25M | PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0)); |
2305 | 4.25M | Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1)); |
2306 | 4.25M | |
2307 | 4.25M | if (!CondBr || 4.25M !CondBr->isConditional()4.25M || !CondLHS4.25M || |
2308 | 390k | CondLHS->getParent() != BB) |
2309 | 4.03M | return false; |
2310 | 218k | |
2311 | 685k | for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); 218k I != E685k ; ++I466k ) { |
2312 | 466k | BasicBlock *Pred = CondLHS->getIncomingBlock(I); |
2313 | 466k | SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I)); |
2314 | 466k | |
2315 | 466k | // Look if one of the incoming values is a select in the corresponding |
2316 | 466k | // predecessor. |
2317 | 466k | if (!SI || 466k SI->getParent() != Pred4.33k || !SI->hasOneUse()3.67k ) |
2318 | 464k | continue; |
2319 | 2.59k | |
2320 | 2.59k | BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator()); |
2321 | 2.59k | if (!PredTerm || 2.59k !PredTerm->isUnconditional()2.59k ) |
2322 | 1.73k | continue; |
2323 | 866 | |
2324 | 866 | // Now check if one of the select values would allow us to constant fold the |
2325 | 866 | // terminator in BB. We don't do the transform if both sides fold, those |
2326 | 866 | // cases will be threaded in any case. |
2327 | 866 | LazyValueInfo::Tristate LHSFolds = |
2328 | 866 | LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1), |
2329 | 866 | CondRHS, Pred, BB, CondCmp); |
2330 | 866 | LazyValueInfo::Tristate RHSFolds = |
2331 | 866 | LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2), |
2332 | 866 | CondRHS, Pred, BB, CondCmp); |
2333 | 866 | if ((LHSFolds != LazyValueInfo::Unknown || |
2334 | 311 | RHSFolds != LazyValueInfo::Unknown) && |
2335 | 866 | LHSFolds != RHSFolds612 ) { |
2336 | 496 | // Expand the select. |
2337 | 496 | // |
2338 | 496 | // Pred -- |
2339 | 496 | // | v |
2340 | 496 | // | NewBB |
2341 | 496 | // | | |
2342 | 496 | // |----- |
2343 | 496 | // v |
2344 | 496 | // BB |
2345 | 496 | BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold", |
2346 | 496 | BB->getParent(), BB); |
2347 | 496 | // Move the unconditional branch to NewBB. |
2348 | 496 | PredTerm->removeFromParent(); |
2349 | 496 | NewBB->getInstList().insert(NewBB->end(), PredTerm); |
2350 | 496 | DT->insertEdge(NewBB, BB); |
2351 | 496 | // Create a conditional branch and update PHI nodes. |
2352 | 496 | BranchInst::Create(NewBB, BB, SI->getCondition(), Pred); |
2353 | 496 | CondLHS->setIncomingValue(I, SI->getFalseValue()); |
2354 | 496 | CondLHS->addIncoming(SI->getTrueValue(), NewBB); |
2355 | 496 | // The select is now dead. |
2356 | 496 | SI->eraseFromParent(); |
2357 | 496 | |
2358 | 496 | DT->insertEdge(Pred, NewBB); |
2359 | 496 | // Update any other PHI nodes in BB. |
2360 | 496 | for (BasicBlock::iterator BI = BB->begin(); |
2361 | 1.33k | PHINode *Phi1.33k = dyn_cast<PHINode>(BI); ++BI840 ) |
2362 | 840 | if (840 Phi != CondLHS840 ) |
2363 | 344 | Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB); |
2364 | 496 | return true; |
2365 | 496 | } |
2366 | 466k | } |
2367 | 218k | return false; |
2368 | 4.25M | } |
2369 | | |
2370 | | /// TryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the |
2371 | | /// same BB in the form |
2372 | | /// bb: |
2373 | | /// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ... |
2374 | | /// %s = select %p, trueval, falseval |
2375 | | /// |
2376 | | /// or |
2377 | | /// |
2378 | | /// bb: |
2379 | | /// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ... |
2380 | | /// %c = cmp %p, 0 |
2381 | | /// %s = select %c, trueval, falseval |
2382 | | /// |
2383 | | /// And expand the select into a branch structure. This later enables |
2384 | | /// jump-threading over bb in this pass. |
2385 | | /// |
2386 | | /// Using the similar approach of SimplifyCFG::FoldCondBranchOnPHI(), unfold |
2387 | | /// select if the associated PHI has at least one constant. If the unfolded |
2388 | | /// select is not jump-threaded, it will be folded again in the later |
2389 | | /// optimizations. |
2390 | 12.6M | bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { |
2391 | 12.6M | // If threading this would thread across a loop header, don't thread the edge. |
2392 | 12.6M | // See the comments above FindLoopHeaders for justifications and caveats. |
2393 | 12.6M | if (LoopHeaders.count(BB)) |
2394 | 1.24M | return false; |
2395 | 11.3M | |
2396 | 11.3M | for (BasicBlock::iterator BI = BB->begin(); |
2397 | 13.1M | PHINode *PN13.1M = dyn_cast<PHINode>(BI); ++BI1.78M ) { |
2398 | 1.78M | // Look for a Phi having at least one constant incoming value. |
2399 | 1.78M | if (llvm::all_of(PN->incoming_values(), |
2400 | 4.80M | [](Value *V) { return !isa<ConstantInt>(V); })) |
2401 | 1.23M | continue; |
2402 | 546k | |
2403 | 546k | auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) 546k { |
2404 | 6.15k | // Check if SI is in BB and use V as condition. |
2405 | 6.15k | if (SI->getParent() != BB) |
2406 | 2.79k | return false; |
2407 | 3.35k | Value *Cond = SI->getCondition(); |
2408 | 3.35k | return (Cond && Cond == V3.35k && Cond->getType()->isIntegerTy(1)1.63k ); |
2409 | 6.15k | }; |
2410 | 546k | |
2411 | 546k | SelectInst *SI = nullptr; |
2412 | 798k | for (Use &U : PN->uses()) { |
2413 | 798k | if (ICmpInst *Cmp798k = dyn_cast<ICmpInst>(U.getUser())) { |
2414 | 124k | // Look for a ICmp in BB that compares PN with a constant and is the |
2415 | 124k | // condition of a Select. |
2416 | 124k | if (Cmp->getParent() == BB && 124k Cmp->hasOneUse()47.1k && |
2417 | 45.7k | isa<ConstantInt>(Cmp->getOperand(1 - U.getOperandNo()))) |
2418 | 30.8k | if (SelectInst *30.8k SelectI30.8k = dyn_cast<SelectInst>(Cmp->user_back())) |
2419 | 775 | if (775 isUnfoldCandidate(SelectI, Cmp->use_begin()->get())775 ) { |
2420 | 757 | SI = SelectI; |
2421 | 757 | break; |
2422 | 757 | } |
2423 | 674k | } else if (SelectInst *674k SelectI674k = dyn_cast<SelectInst>(U.getUser())) { |
2424 | 5.38k | // Look for a Select in BB that uses PN as condtion. |
2425 | 5.38k | if (isUnfoldCandidate(SelectI, U.get())5.38k ) { |
2426 | 876 | SI = SelectI; |
2427 | 876 | break; |
2428 | 876 | } |
2429 | 546k | } |
2430 | 798k | } |
2431 | 546k | |
2432 | 546k | if (!SI) |
2433 | 545k | continue; |
2434 | 1.63k | // Expand the select. |
2435 | 1.63k | TerminatorInst *Term = |
2436 | 1.63k | SplitBlockAndInsertIfThen(SI->getCondition(), SI, false, nullptr, DT); |
2437 | 1.63k | PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI); |
2438 | 1.63k | NewPN->addIncoming(SI->getTrueValue(), Term->getParent()); |
2439 | 1.63k | NewPN->addIncoming(SI->getFalseValue(), BB); |
2440 | 1.63k | SI->replaceAllUsesWith(NewPN); |
2441 | 1.63k | SI->eraseFromParent(); |
2442 | 1.63k | return true; |
2443 | 1.63k | } |
2444 | 11.3M | return false; |
2445 | 12.6M | } |
2446 | | |
2447 | | /// Try to propagate a guard from the current BB into one of its predecessors |
2448 | | /// in case if another branch of execution implies that the condition of this |
2449 | | /// guard is always true. Currently we only process the simplest case that |
2450 | | /// looks like: |
2451 | | /// |
2452 | | /// Start: |
2453 | | /// %cond = ... |
2454 | | /// br i1 %cond, label %T1, label %F1 |
2455 | | /// T1: |
2456 | | /// br label %Merge |
2457 | | /// F1: |
2458 | | /// br label %Merge |
2459 | | /// Merge: |
2460 | | /// %condGuard = ... |
2461 | | /// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ] |
2462 | | /// |
2463 | | /// And cond either implies condGuard or !condGuard. In this case all the |
2464 | | /// instructions before the guard can be duplicated in both branches, and the |
2465 | | /// guard is then threaded to one of them. |
2466 | 143 | bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) { |
2467 | 143 | using namespace PatternMatch; |
2468 | 143 | |
2469 | 143 | // We only want to deal with two predecessors. |
2470 | 143 | BasicBlock *Pred1, *Pred2; |
2471 | 143 | auto PI = pred_begin(BB), PE = pred_end(BB); |
2472 | 143 | if (PI == PE) |
2473 | 65 | return false; |
2474 | 78 | Pred1 = *PI++; |
2475 | 78 | if (PI == PE) |
2476 | 47 | return false; |
2477 | 31 | Pred2 = *PI++; |
2478 | 31 | if (PI != PE) |
2479 | 5 | return false; |
2480 | 26 | if (26 Pred1 == Pred226 ) |
2481 | 4 | return false; |
2482 | 22 | |
2483 | 22 | // Try to thread one of the guards of the block. |
2484 | 22 | // TODO: Look up deeper than to immediate predecessor? |
2485 | 22 | auto *Parent = Pred1->getSinglePredecessor(); |
2486 | 22 | if (!Parent || 22 Parent != Pred2->getSinglePredecessor()20 ) |
2487 | 12 | return false; |
2488 | 10 | |
2489 | 10 | if (auto *10 BI10 = dyn_cast<BranchInst>(Parent->getTerminator())) |
2490 | 10 | for (auto &I : *BB) |
2491 | 34 | if (34 match(&I, m_Intrinsic<Intrinsic::experimental_guard>())34 ) |
2492 | 4 | if (4 ThreadGuard(BB, cast<IntrinsicInst>(&I), BI)4 ) |
2493 | 2 | return true; |
2494 | 8 | |
2495 | 8 | return false; |
2496 | 8 | } |
2497 | | |
2498 | | /// Try to propagate the guard from BB which is the lower block of a diamond |
2499 | | /// to one of its branches, in case if diamond's condition implies guard's |
2500 | | /// condition. |
2501 | | bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard, |
2502 | 4 | BranchInst *BI) { |
2503 | 4 | assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?"); |
2504 | 4 | assert(BI->isConditional() && "Unconditional branch has 2 successors?"); |
2505 | 4 | Value *GuardCond = Guard->getArgOperand(0); |
2506 | 4 | Value *BranchCond = BI->getCondition(); |
2507 | 4 | BasicBlock *TrueDest = BI->getSuccessor(0); |
2508 | 4 | BasicBlock *FalseDest = BI->getSuccessor(1); |
2509 | 4 | |
2510 | 4 | auto &DL = BB->getModule()->getDataLayout(); |
2511 | 4 | bool TrueDestIsSafe = false; |
2512 | 4 | bool FalseDestIsSafe = false; |
2513 | 4 | |
2514 | 4 | // True dest is safe if BranchCond => GuardCond. |
2515 | 4 | auto Impl = isImpliedCondition(BranchCond, GuardCond, DL); |
2516 | 4 | if (Impl && 4 *Impl2 ) |
2517 | 1 | TrueDestIsSafe = true; |
2518 | 3 | else { |
2519 | 3 | // False dest is safe if !BranchCond => GuardCond. |
2520 | 3 | Impl = isImpliedCondition(BranchCond, GuardCond, DL, /* LHSIsTrue */ false); |
2521 | 3 | if (Impl && 3 *Impl2 ) |
2522 | 1 | FalseDestIsSafe = true; |
2523 | 3 | } |
2524 | 4 | |
2525 | 4 | if (!TrueDestIsSafe && 4 !FalseDestIsSafe3 ) |
2526 | 2 | return false; |
2527 | 2 | |
2528 | 2 | BasicBlock *PredUnguardedBlock = TrueDestIsSafe ? 2 TrueDest1 : FalseDest1 ; |
2529 | 2 | BasicBlock *PredGuardedBlock = FalseDestIsSafe ? TrueDest1 : FalseDest1 ; |
2530 | 2 | |
2531 | 2 | ValueToValueMapTy UnguardedMapping, GuardedMapping; |
2532 | 2 | Instruction *AfterGuard = Guard->getNextNode(); |
2533 | 2 | unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold); |
2534 | 2 | if (Cost > BBDupThreshold) |
2535 | 0 | return false; |
2536 | 2 | // Duplicate all instructions before the guard and the guard itself to the |
2537 | 2 | // branch where implication is not proved. |
2538 | 2 | BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween( |
2539 | 2 | BB, PredGuardedBlock, AfterGuard, GuardedMapping); |
2540 | 2 | assert(GuardedBlock && "Could not create the guarded block?"); |
2541 | 2 | // Duplicate all instructions before the guard in the unguarded branch. |
2542 | 2 | // Since we have successfully duplicated the guarded block and this block |
2543 | 2 | // has fewer instructions, we expect it to succeed. |
2544 | 2 | BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween( |
2545 | 2 | BB, PredUnguardedBlock, Guard, UnguardedMapping); |
2546 | 2 | assert(UnguardedBlock && "Could not create the unguarded block?"); |
2547 | 2 | DEBUG(dbgs() << "Moved guard " << *Guard << " to block " |
2548 | 2 | << GuardedBlock->getName() << "\n"); |
2549 | 2 | // DuplicateInstructionsInSplitBetween inserts a new block, BB.split, between |
2550 | 2 | // PredBB and BB. We need to perform two inserts and one delete in DT for each |
2551 | 2 | // of the above calls. |
2552 | 2 | DT->applyUpdates({// Guarded block split. |
2553 | 2 | {DominatorTree::Delete, PredGuardedBlock, BB}, |
2554 | 2 | {DominatorTree::Insert, PredGuardedBlock, GuardedBlock}, |
2555 | 2 | {DominatorTree::Insert, GuardedBlock, BB}, |
2556 | 2 | // Unguarded block split. |
2557 | 2 | {DominatorTree::Delete, PredUnguardedBlock, BB}, |
2558 | 2 | {DominatorTree::Insert, PredUnguardedBlock, UnguardedBlock}, |
2559 | 2 | {DominatorTree::Insert, UnguardedBlock, BB}}); |
2560 | 2 | |
2561 | 2 | // Some instructions before the guard may still have uses. For them, we need |
2562 | 2 | // to create Phi nodes merging their copies in both guarded and unguarded |
2563 | 2 | // branches. Those instructions that have no uses can be just removed. |
2564 | 2 | SmallVector<Instruction *, 4> ToRemove; |
2565 | 10 | for (auto BI = BB->begin(); &*BI != AfterGuard10 ; ++BI8 ) |
2566 | 8 | if (8 !isa<PHINode>(&*BI)8 ) |
2567 | 6 | ToRemove.push_back(&*BI); |
2568 | 2 | |
2569 | 2 | Instruction *InsertionPoint = &*BB->getFirstInsertionPt(); |
2570 | 2 | assert(InsertionPoint && "Empty block?"); |
2571 | 2 | // Substitute with Phis & remove. |
2572 | 6 | for (auto *Inst : reverse(ToRemove)) { |
2573 | 6 | if (!Inst->use_empty()6 ) { |
2574 | 2 | PHINode *NewPN = PHINode::Create(Inst->getType(), 2); |
2575 | 2 | NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock); |
2576 | 2 | NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock); |
2577 | 2 | NewPN->insertBefore(InsertionPoint); |
2578 | 2 | Inst->replaceAllUsesWith(NewPN); |
2579 | 2 | } |
2580 | 6 | Inst->eraseFromParent(); |
2581 | 6 | } |
2582 | 4 | return true; |
2583 | 4 | } |