/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/CodeGen/CodeGenPrepare.cpp
Line | Count | Source |
1 | | //===- CodeGenPrepare.cpp - Prepare a function for code generation --------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This pass munges the code in the input function to better prepare it for |
10 | | // SelectionDAG-based code generation. This works around limitations in its
11 | | // basic-block-at-a-time approach. It should eventually be removed. |
12 | | // |
13 | | //===----------------------------------------------------------------------===// |
14 | | |
15 | | #include "llvm/ADT/APInt.h" |
16 | | #include "llvm/ADT/ArrayRef.h" |
17 | | #include "llvm/ADT/DenseMap.h" |
18 | | #include "llvm/ADT/MapVector.h" |
19 | | #include "llvm/ADT/PointerIntPair.h" |
20 | | #include "llvm/ADT/STLExtras.h" |
21 | | #include "llvm/ADT/SmallPtrSet.h" |
22 | | #include "llvm/ADT/SmallVector.h" |
23 | | #include "llvm/ADT/Statistic.h" |
24 | | #include "llvm/Analysis/BlockFrequencyInfo.h" |
25 | | #include "llvm/Analysis/BranchProbabilityInfo.h" |
26 | | #include "llvm/Analysis/ConstantFolding.h" |
27 | | #include "llvm/Analysis/InstructionSimplify.h" |
28 | | #include "llvm/Analysis/LoopInfo.h" |
29 | | #include "llvm/Analysis/MemoryBuiltins.h" |
30 | | #include "llvm/Analysis/ProfileSummaryInfo.h" |
31 | | #include "llvm/Analysis/TargetLibraryInfo.h" |
32 | | #include "llvm/Analysis/TargetTransformInfo.h" |
33 | | #include "llvm/Transforms/Utils/Local.h" |
34 | | #include "llvm/Analysis/ValueTracking.h" |
35 | | #include "llvm/Analysis/VectorUtils.h" |
36 | | #include "llvm/CodeGen/Analysis.h" |
37 | | #include "llvm/CodeGen/ISDOpcodes.h" |
38 | | #include "llvm/CodeGen/SelectionDAGNodes.h" |
39 | | #include "llvm/CodeGen/TargetLowering.h" |
40 | | #include "llvm/CodeGen/TargetPassConfig.h" |
41 | | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
42 | | #include "llvm/CodeGen/ValueTypes.h" |
43 | | #include "llvm/Config/llvm-config.h" |
44 | | #include "llvm/IR/Argument.h" |
45 | | #include "llvm/IR/Attributes.h" |
46 | | #include "llvm/IR/BasicBlock.h" |
47 | | #include "llvm/IR/CallSite.h" |
48 | | #include "llvm/IR/Constant.h" |
49 | | #include "llvm/IR/Constants.h" |
50 | | #include "llvm/IR/DataLayout.h" |
51 | | #include "llvm/IR/DerivedTypes.h" |
52 | | #include "llvm/IR/Dominators.h" |
53 | | #include "llvm/IR/Function.h" |
54 | | #include "llvm/IR/GetElementPtrTypeIterator.h" |
55 | | #include "llvm/IR/GlobalValue.h" |
56 | | #include "llvm/IR/GlobalVariable.h" |
57 | | #include "llvm/IR/IRBuilder.h" |
58 | | #include "llvm/IR/InlineAsm.h" |
59 | | #include "llvm/IR/InstrTypes.h" |
60 | | #include "llvm/IR/Instruction.h" |
61 | | #include "llvm/IR/Instructions.h" |
62 | | #include "llvm/IR/IntrinsicInst.h" |
63 | | #include "llvm/IR/Intrinsics.h" |
64 | | #include "llvm/IR/LLVMContext.h" |
65 | | #include "llvm/IR/MDBuilder.h" |
66 | | #include "llvm/IR/Module.h" |
67 | | #include "llvm/IR/Operator.h" |
68 | | #include "llvm/IR/PatternMatch.h" |
69 | | #include "llvm/IR/Statepoint.h" |
70 | | #include "llvm/IR/Type.h" |
71 | | #include "llvm/IR/Use.h" |
72 | | #include "llvm/IR/User.h" |
73 | | #include "llvm/IR/Value.h" |
74 | | #include "llvm/IR/ValueHandle.h" |
75 | | #include "llvm/IR/ValueMap.h" |
76 | | #include "llvm/Pass.h" |
77 | | #include "llvm/Support/BlockFrequency.h" |
78 | | #include "llvm/Support/BranchProbability.h" |
79 | | #include "llvm/Support/Casting.h" |
80 | | #include "llvm/Support/CommandLine.h" |
81 | | #include "llvm/Support/Compiler.h" |
82 | | #include "llvm/Support/Debug.h" |
83 | | #include "llvm/Support/ErrorHandling.h" |
84 | | #include "llvm/Support/MachineValueType.h" |
85 | | #include "llvm/Support/MathExtras.h" |
86 | | #include "llvm/Support/raw_ostream.h" |
87 | | #include "llvm/Target/TargetMachine.h" |
88 | | #include "llvm/Target/TargetOptions.h" |
89 | | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
90 | | #include "llvm/Transforms/Utils/BypassSlowDivision.h" |
91 | | #include "llvm/Transforms/Utils/SimplifyLibCalls.h" |
92 | | #include <algorithm> |
93 | | #include <cassert> |
94 | | #include <cstdint> |
95 | | #include <iterator> |
96 | | #include <limits> |
97 | | #include <memory> |
98 | | #include <utility> |
99 | | #include <vector> |
100 | | |
101 | | using namespace llvm; |
102 | | using namespace llvm::PatternMatch; |
103 | | |
104 | | #define DEBUG_TYPE "codegenprepare" |
105 | | |
106 | | STATISTIC(NumBlocksElim, "Number of blocks eliminated"); |
107 | | STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated"); |
108 | | STATISTIC(NumGEPsElim, "Number of GEPs converted to casts"); |
109 | | STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of " |
110 | | "sunken Cmps"); |
111 | | STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses " |
112 | | "of sunken Casts"); |
113 | | STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " |
114 | | "computations were sunk"); |
115 | | STATISTIC(NumMemoryInstsPhiCreated, |
116 | | "Number of phis created when address " |
117 | | "computations were sunk to memory instructions"); |
118 | | STATISTIC(NumMemoryInstsSelectCreated, |
119 | | "Number of select created when address " |
120 | | "computations were sunk to memory instructions"); |
121 | | STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); |
122 | | STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); |
123 | | STATISTIC(NumAndsAdded, |
124 | | "Number of and mask instructions added to form ext loads"); |
125 | | STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized"); |
126 | | STATISTIC(NumRetsDup, "Number of return instructions duplicated"); |
127 | | STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); |
128 | | STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); |
129 | | STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); |
130 | | |
131 | | static cl::opt<bool> DisableBranchOpts( |
132 | | "disable-cgp-branch-opts", cl::Hidden, cl::init(false), |
133 | | cl::desc("Disable branch optimizations in CodeGenPrepare")); |
134 | | |
135 | | static cl::opt<bool> |
136 | | DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false), |
137 | | cl::desc("Disable GC optimizations in CodeGenPrepare")); |
138 | | |
139 | | static cl::opt<bool> DisableSelectToBranch( |
140 | | "disable-cgp-select2branch", cl::Hidden, cl::init(false), |
141 | | cl::desc("Disable select to branch conversion.")); |
142 | | |
143 | | static cl::opt<bool> AddrSinkUsingGEPs( |
144 | | "addr-sink-using-gep", cl::Hidden, cl::init(true), |
145 | | cl::desc("Address sinking in CGP using GEPs.")); |
146 | | |
147 | | static cl::opt<bool> EnableAndCmpSinking( |
148 | | "enable-andcmp-sinking", cl::Hidden, cl::init(true), |
149 | | cl::desc("Enable sinkinig and/cmp into branches.")); |
150 | | |
151 | | static cl::opt<bool> DisableStoreExtract( |
152 | | "disable-cgp-store-extract", cl::Hidden, cl::init(false), |
153 | | cl::desc("Disable store(extract) optimizations in CodeGenPrepare")); |
154 | | |
155 | | static cl::opt<bool> StressStoreExtract( |
156 | | "stress-cgp-store-extract", cl::Hidden, cl::init(false), |
157 | | cl::desc("Stress test store(extract) optimizations in CodeGenPrepare")); |
158 | | |
159 | | static cl::opt<bool> DisableExtLdPromotion( |
160 | | "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false), |
161 | | cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in " |
162 | | "CodeGenPrepare")); |
163 | | |
164 | | static cl::opt<bool> StressExtLdPromotion( |
165 | | "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false), |
166 | | cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) " |
167 | | "optimization in CodeGenPrepare")); |
168 | | |
169 | | static cl::opt<bool> DisablePreheaderProtect( |
170 | | "disable-preheader-prot", cl::Hidden, cl::init(false), |
171 | | cl::desc("Disable protection against removing loop preheaders")); |
172 | | |
173 | | static cl::opt<bool> ProfileGuidedSectionPrefix( |
174 | | "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore, |
175 | | cl::desc("Use profile info to add section prefix for hot/cold functions")); |
176 | | |
177 | | static cl::opt<unsigned> FreqRatioToSkipMerge( |
178 | | "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2), |
179 | | cl::desc("Skip merging empty blocks if (frequency of empty block) / " |
180 | | "(frequency of destination block) is greater than this ratio")); |
181 | | |
182 | | static cl::opt<bool> ForceSplitStore( |
183 | | "force-split-store", cl::Hidden, cl::init(false), |
184 | | cl::desc("Force store splitting no matter what the target query says.")); |
185 | | |
186 | | static cl::opt<bool> |
187 | | EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden, |
188 | | cl::desc("Enable merging of redundant sexts when one is dominating" |
189 | | " the other."), cl::init(true)); |
190 | | |
191 | | static cl::opt<bool> DisableComplexAddrModes( |
192 | | "disable-complex-addr-modes", cl::Hidden, cl::init(false), |
193 | | cl::desc("Disables combining addressing modes with different parts " |
194 | | "in optimizeMemoryInst.")); |
195 | | |
196 | | static cl::opt<bool> |
197 | | AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(false), |
198 | | cl::desc("Allow creation of Phis in Address sinking.")); |
199 | | |
200 | | static cl::opt<bool> |
201 | | AddrSinkNewSelects("addr-sink-new-select", cl::Hidden, cl::init(true), |
202 | | cl::desc("Allow creation of selects in Address sinking.")); |
203 | | |
204 | | static cl::opt<bool> AddrSinkCombineBaseReg( |
205 | | "addr-sink-combine-base-reg", cl::Hidden, cl::init(true), |
206 | | cl::desc("Allow combining of BaseReg field in Address sinking.")); |
207 | | |
208 | | static cl::opt<bool> AddrSinkCombineBaseGV( |
209 | | "addr-sink-combine-base-gv", cl::Hidden, cl::init(true), |
210 | | cl::desc("Allow combining of BaseGV field in Address sinking.")); |
211 | | |
212 | | static cl::opt<bool> AddrSinkCombineBaseOffs( |
213 | | "addr-sink-combine-base-offs", cl::Hidden, cl::init(true), |
214 | | cl::desc("Allow combining of BaseOffs field in Address sinking.")); |
215 | | |
216 | | static cl::opt<bool> AddrSinkCombineScaledReg( |
217 | | "addr-sink-combine-scaled-reg", cl::Hidden, cl::init(true), |
218 | | cl::desc("Allow combining of ScaledReg field in Address sinking.")); |
219 | | |
220 | | static cl::opt<bool> |
221 | | EnableGEPOffsetSplit("cgp-split-large-offset-gep", cl::Hidden, |
222 | | cl::init(true), |
223 | | cl::desc("Enable splitting large offset of GEP.")); |
224 | | |
225 | | namespace { |
226 | | |
227 | | enum ExtType { |
228 | | ZeroExtension, // Zero extension has been seen. |
229 | | SignExtension, // Sign extension has been seen. |
230 | | BothExtension // This extension type is used if we saw sext after |
231 | | // ZeroExtension had been set, or if we saw zext after |
232 | | // SignExtension had been set. It makes the type |
233 | | // information of a promoted instruction invalid. |
234 | | }; |
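| | // Illustrative note (hypothetical sequence, not from this file): after
| | // recording ZeroExtension for a promoted instruction (say it fed a zext),
| | // later encountering a sext of the same value flips the entry to
| | // BothExtension, so the recorded original type can no longer be trusted
| | // for either kind of extension.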
235 | | |
236 | | using SetOfInstrs = SmallPtrSet<Instruction *, 16>; |
237 | | using TypeIsSExt = PointerIntPair<Type *, 2, ExtType>; |
238 | | using InstrToOrigTy = DenseMap<Instruction *, TypeIsSExt>; |
239 | | using SExts = SmallVector<Instruction *, 16>; |
240 | | using ValueToSExts = DenseMap<Value *, SExts>; |
241 | | |
242 | | class TypePromotionTransaction; |
243 | | |
244 | | class CodeGenPrepare : public FunctionPass { |
245 | | const TargetMachine *TM = nullptr; |
246 | | const TargetSubtargetInfo *SubtargetInfo; |
247 | | const TargetLowering *TLI = nullptr; |
248 | | const TargetRegisterInfo *TRI; |
249 | | const TargetTransformInfo *TTI = nullptr; |
250 | | const TargetLibraryInfo *TLInfo; |
251 | | const LoopInfo *LI; |
252 | | std::unique_ptr<BlockFrequencyInfo> BFI; |
253 | | std::unique_ptr<BranchProbabilityInfo> BPI; |
254 | | |
255 | | /// As we scan instructions optimizing them, this is the next instruction |
256 | | /// to optimize. Transforms that can invalidate this should update it. |
257 | | BasicBlock::iterator CurInstIterator; |
258 | | |
259 | | /// Keeps track of non-local addresses that have been sunk into a block. |
260 | | /// This allows us to avoid inserting duplicate code for blocks with |
261 | | /// multiple load/stores of the same address. The usage of WeakTrackingVH |
262 | | /// enables SunkAddrs to be treated as a cache whose entries can be |
263 | | /// invalidated if a sunken address computation has been erased. |
264 | | ValueMap<Value*, WeakTrackingVH> SunkAddrs; |
265 | | |
266 | | /// Keeps track of all instructions inserted for the current function. |
267 | | SetOfInstrs InsertedInsts; |
268 | | |
269 | | /// Keeps track of the original type of each related instruction before
270 | | /// its promotion, for the current function.
271 | | InstrToOrigTy PromotedInsts; |
272 | | |
273 | | /// Keep track of instructions removed during promotion. |
274 | | SetOfInstrs RemovedInsts; |
275 | | |
276 | | /// Keep track of sext chains based on their initial value. |
277 | | DenseMap<Value *, Instruction *> SeenChainsForSExt; |
278 | | |
279 | | /// Keep track of GEPs accessing the same data structures such as structs or |
280 | | /// arrays that are candidates to be split later because of their large |
281 | | /// size. |
282 | | MapVector< |
283 | | AssertingVH<Value>, |
284 | | SmallVector<std::pair<AssertingVH<GetElementPtrInst>, int64_t>, 32>> |
285 | | LargeOffsetGEPMap; |
286 | | |
287 | | /// Keep track of new GEP bases after splitting GEPs with large offsets.
288 | | SmallSet<AssertingVH<Value>, 2> NewGEPBases; |
289 | | |
290 | | /// Map large-offset GEPs to their serial numbers.
291 | | DenseMap<AssertingVH<GetElementPtrInst>, int> LargeOffsetGEPID; |
292 | | |
293 | | /// Keep track of promoted SExt instructions.
294 | | ValueToSExts ValToSExtendedUses; |
295 | | |
296 | | /// True if optimizing for size. |
297 | | bool OptSize; |
298 | | |
299 | | /// DataLayout for the Function being processed. |
300 | | const DataLayout *DL = nullptr; |
301 | | |
302 | | /// Building the dominator tree can be expensive, so we only build it |
303 | | /// lazily and update it when required. |
304 | | std::unique_ptr<DominatorTree> DT; |
305 | | |
306 | | public: |
307 | | static char ID; // Pass identification, replacement for typeid |
308 | | |
309 | 34.6k | CodeGenPrepare() : FunctionPass(ID) { |
310 | 34.6k | initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); |
311 | 34.6k | } |
312 | | |
313 | | bool runOnFunction(Function &F) override; |
314 | | |
315 | 490k | StringRef getPassName() const override { return "CodeGen Prepare"; } |
316 | | |
317 | 34.4k | void getAnalysisUsage(AnalysisUsage &AU) const override { |
318 | 34.4k | // FIXME: When we can selectively preserve passes, preserve the domtree. |
319 | 34.4k | AU.addRequired<ProfileSummaryInfoWrapperPass>(); |
320 | 34.4k | AU.addRequired<TargetLibraryInfoWrapperPass>(); |
321 | 34.4k | AU.addRequired<TargetTransformInfoWrapperPass>(); |
322 | 34.4k | AU.addRequired<LoopInfoWrapperPass>(); |
323 | 34.4k | } |
324 | | |
325 | | private: |
326 | | template <typename F> |
327 | 3.42k | void resetIteratorIfInvalidatedWhileCalling(BasicBlock *BB, F f) { |
328 | 3.42k | // Substituting can cause recursive simplifications, which can invalidate |
329 | 3.42k | // our iterator. Use a WeakTrackingVH to hold onto it in case this |
330 | 3.42k | // happens. |
331 | 3.42k | Value *CurValue = &*CurInstIterator; |
332 | 3.42k | WeakTrackingVH IterHandle(CurValue); |
333 | 3.42k | |
334 | 3.42k | f(); |
335 | 3.42k | |
336 | 3.42k | // If the iterator instruction was recursively deleted, start over at the |
337 | 3.42k | // start of the block. |
338 | 3.42k | if (IterHandle != CurValue) { |
339 | 9 | CurInstIterator = BB->begin(); |
340 | 9 | SunkAddrs.clear(); |
341 | 9 | } |
342 | 3.42k | }
343 | | |
344 | | // Get the DominatorTree, building if necessary. |
345 | 26.4k | DominatorTree &getDT(Function &F) { |
346 | 26.4k | if (!DT) |
347 | 1.17k | DT = llvm::make_unique<DominatorTree>(F); |
348 | 26.4k | return *DT; |
349 | 26.4k | } |
350 | | |
351 | | bool eliminateFallThrough(Function &F); |
352 | | bool eliminateMostlyEmptyBlocks(Function &F); |
353 | | BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB); |
354 | | bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; |
355 | | void eliminateMostlyEmptyBlock(BasicBlock *BB); |
356 | | bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB, |
357 | | bool isPreheader); |
358 | | bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT); |
359 | | bool optimizeInst(Instruction *I, bool &ModifiedDT); |
360 | | bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, |
361 | | Type *AccessTy, unsigned AddrSpace); |
362 | | bool optimizeInlineAsmInst(CallInst *CS); |
363 | | bool optimizeCallInst(CallInst *CI, bool &ModifiedDT); |
364 | | bool optimizeExt(Instruction *&I); |
365 | | bool optimizeExtUses(Instruction *I); |
366 | | bool optimizeLoadExt(LoadInst *Load); |
367 | | bool optimizeShiftInst(BinaryOperator *BO); |
368 | | bool optimizeSelectInst(SelectInst *SI); |
369 | | bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI); |
370 | | bool optimizeSwitchInst(SwitchInst *SI); |
371 | | bool optimizeExtractElementInst(Instruction *Inst); |
372 | | bool dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT); |
373 | | bool placeDbgValues(Function &F); |
374 | | bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts, |
375 | | LoadInst *&LI, Instruction *&Inst, bool HasPromoted); |
376 | | bool tryToPromoteExts(TypePromotionTransaction &TPT, |
377 | | const SmallVectorImpl<Instruction *> &Exts, |
378 | | SmallVectorImpl<Instruction *> &ProfitablyMovedExts, |
379 | | unsigned CreatedInstsCost = 0); |
380 | | bool mergeSExts(Function &F); |
381 | | bool splitLargeGEPOffsets(); |
382 | | bool performAddressTypePromotion( |
383 | | Instruction *&Inst, |
384 | | bool AllowPromotionWithoutCommonHeader, |
385 | | bool HasPromoted, TypePromotionTransaction &TPT, |
386 | | SmallVectorImpl<Instruction *> &SpeculativelyMovedExts); |
387 | | bool splitBranchCondition(Function &F, bool &ModifiedDT); |
388 | | bool simplifyOffsetableRelocate(Instruction &I); |
389 | | |
390 | | bool tryToSinkFreeOperands(Instruction *I); |
391 | | bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp, |
392 | | Intrinsic::ID IID); |
393 | | bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT); |
394 | | bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT); |
395 | | bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT); |
396 | | }; |
397 | | |
398 | | } // end anonymous namespace |
399 | | |
400 | | char CodeGenPrepare::ID = 0; |
401 | | |
402 | 49.1k | INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE, |
403 | 49.1k | "Optimize for code generation", false, false) |
404 | 49.1k | INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) |
405 | 49.1k | INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE, |
406 | | "Optimize for code generation", false, false) |
407 | | |
408 | 34.5k | FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); } |
409 | | |
410 | 490k | bool CodeGenPrepare::runOnFunction(Function &F) { |
411 | 490k | if (skipFunction(F)) |
412 | 272 | return false; |
413 | 490k | |
414 | 490k | DL = &F.getParent()->getDataLayout(); |
415 | 490k | |
416 | 490k | bool EverMadeChange = false; |
417 | 490k | // Clear per function information. |
418 | 490k | InsertedInsts.clear(); |
419 | 490k | PromotedInsts.clear(); |
420 | 490k | |
421 | 490k | if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { |
422 | 490k | TM = &TPC->getTM<TargetMachine>(); |
423 | 490k | SubtargetInfo = TM->getSubtargetImpl(F); |
424 | 490k | TLI = SubtargetInfo->getTargetLowering(); |
425 | 490k | TRI = SubtargetInfo->getRegisterInfo(); |
426 | 490k | } |
427 | 490k | TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); |
428 | 490k | TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
429 | 490k | LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
430 | 490k | BPI.reset(new BranchProbabilityInfo(F, *LI)); |
431 | 490k | BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI)); |
432 | 490k | OptSize = F.hasOptSize(); |
433 | 490k | |
434 | 490k | ProfileSummaryInfo *PSI = |
435 | 490k | &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); |
436 | 490k | if (ProfileGuidedSectionPrefix) { |
437 | 490k | if (PSI->isFunctionHotInCallGraph(&F, *BFI)) |
438 | 20 | F.setSectionPrefix(".hot"); |
439 | 490k | else if (PSI->isFunctionColdInCallGraph(&F, *BFI)) |
440 | 12 | F.setSectionPrefix(".unlikely"); |
441 | 490k | } |
442 | 490k | |
443 | 490k | /// This optimization identifies DIV instructions that can be |
444 | 490k | /// profitably bypassed and carried out with a shorter, faster divide. |
445 | 490k | if (!OptSize && !PSI->hasHugeWorkingSetSize()486k && TLI486k && |
446 | 490k | TLI->isSlowDivBypassed()486k ) { |
447 | 11.6k | const DenseMap<unsigned int, unsigned int> &BypassWidths = |
448 | 11.6k | TLI->getBypassSlowDivWidths(); |
449 | 11.6k | BasicBlock* BB = &*F.begin(); |
450 | 89.7k | while (BB != nullptr) { |
451 | 78.1k | // bypassSlowDivision may create new BBs, but we don't want to reapply the |
452 | 78.1k | // optimization to those blocks. |
453 | 78.1k | BasicBlock* Next = BB->getNextNode(); |
454 | 78.1k | EverMadeChange |= bypassSlowDivision(BB, BypassWidths); |
455 | 78.1k | BB = Next; |
456 | 78.1k | } |
457 | 11.6k | } |
458 | 490k | |
459 | 490k | // Eliminate blocks that contain only PHI nodes and an |
460 | 490k | // unconditional branch. |
461 | 490k | EverMadeChange |= eliminateMostlyEmptyBlocks(F); |
462 | 490k | |
463 | 490k | bool ModifiedDT = false; |
464 | 490k | if (!DisableBranchOpts) |
465 | 490k | EverMadeChange |= splitBranchCondition(F, ModifiedDT); |
466 | 490k | |
467 | 490k | // Split some critical edges where one of the sources is an indirect branch, |
468 | 490k | // to help generate sane code for PHIs involving such edges. |
469 | 490k | EverMadeChange |= SplitIndirectBrCriticalEdges(F); |
470 | 490k | |
471 | 490k | bool MadeChange = true; |
472 | 1.13M | while (MadeChange) { |
473 | 644k | MadeChange = false; |
474 | 644k | DT.reset(); |
475 | 6.95M | for (Function::iterator I = F.begin(); I != F.end(); ) { |
476 | 6.33M | BasicBlock *BB = &*I++; |
477 | 6.33M | bool ModifiedDTOnIteration = false; |
478 | 6.33M | MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration); |
479 | 6.33M | |
480 | 6.33M | // Restart BB iteration if the dominator tree of the Function was changed |
481 | 6.33M | if (ModifiedDTOnIteration) |
482 | 24.5k | break; |
483 | 6.33M | } |
484 | 644k | if (EnableTypePromotionMerge && !ValToSExtendedUses.empty()644k ) |
485 | 26.5k | MadeChange |= mergeSExts(F); |
486 | 644k | if (!LargeOffsetGEPMap.empty()) |
487 | 4.26k | MadeChange |= splitLargeGEPOffsets(); |
488 | 644k | |
489 | 644k | // Really free removed instructions during promotion. |
490 | 644k | for (Instruction *I : RemovedInsts) |
491 | 3.54k | I->deleteValue(); |
492 | 644k | |
493 | 644k | EverMadeChange |= MadeChange; |
494 | 644k | SeenChainsForSExt.clear(); |
495 | 644k | ValToSExtendedUses.clear(); |
496 | 644k | RemovedInsts.clear(); |
497 | 644k | LargeOffsetGEPMap.clear(); |
498 | 644k | LargeOffsetGEPID.clear(); |
499 | 644k | } |
500 | 490k | |
501 | 490k | SunkAddrs.clear(); |
502 | 490k | |
503 | 490k | if (!DisableBranchOpts) { |
504 | 490k | MadeChange = false; |
505 | 490k | // Use a set vector to get deterministic iteration order. The order the |
506 | 490k | // blocks are removed may affect whether or not PHI nodes in successors |
507 | 490k | // are removed. |
508 | 490k | SmallSetVector<BasicBlock*, 8> WorkList; |
509 | 2.56M | for (BasicBlock &BB : F) { |
510 | 2.56M | SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB)); |
511 | 2.56M | MadeChange |= ConstantFoldTerminator(&BB, true); |
512 | 2.56M | if (!MadeChange) continue2.55M ; |
513 | 5.17k | |
514 | 5.17k | for (SmallVectorImpl<BasicBlock*>::iterator |
515 | 13.3k | II = Successors.begin(), IE = Successors.end(); II != IE; ++II8.14k ) |
516 | 8.14k | if (pred_begin(*II) == pred_end(*II)) |
517 | 1.11k | WorkList.insert(*II); |
518 | 5.17k | } |
519 | 490k | |
520 | 490k | // Delete the dead blocks and any of their dead successors. |
521 | 490k | MadeChange |= !WorkList.empty(); |
522 | 492k | while (!WorkList.empty()) { |
523 | 1.71k | BasicBlock *BB = WorkList.pop_back_val(); |
524 | 1.71k | SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB)); |
525 | 1.71k | |
526 | 1.71k | DeleteDeadBlock(BB); |
527 | 1.71k | |
528 | 1.71k | for (SmallVectorImpl<BasicBlock*>::iterator |
529 | 3.12k | II = Successors.begin(), IE = Successors.end(); II != IE; ++II1.40k ) |
530 | 1.40k | if (pred_begin(*II) == pred_end(*II)) |
531 | 773 | WorkList.insert(*II); |
532 | 1.71k | } |
533 | 490k | |
534 | 490k | // Merge pairs of basic blocks with unconditional branches, connected by |
535 | 490k | // a single edge. |
536 | 490k | if (EverMadeChange || MadeChange362k ) |
537 | 128k | MadeChange |= eliminateFallThrough(F); |
538 | 490k | |
539 | 490k | EverMadeChange |= MadeChange; |
540 | 490k | } |
541 | 490k | |
542 | 490k | if (!DisableGCOpts490k ) { |
543 | 490k | SmallVector<Instruction *, 2> Statepoints; |
544 | 490k | for (BasicBlock &BB : F) |
545 | 2.55M | for (Instruction &I : BB) |
546 | 15.3M | if (isStatepoint(I)) |
547 | 92 | Statepoints.push_back(&I); |
548 | 490k | for (auto &I : Statepoints) |
549 | 92 | EverMadeChange |= simplifyOffsetableRelocate(*I); |
550 | 490k | } |
551 | 490k | |
552 | 490k | // Do this last to clean up use-before-def scenarios introduced by other |
553 | 490k | // preparatory transforms. |
554 | 490k | EverMadeChange |= placeDbgValues(F); |
555 | 490k | |
556 | 490k | return EverMadeChange; |
557 | 490k | } |
558 | | |
559 | | /// Merge basic blocks which are connected by a single edge, where one of the |
560 | | /// basic blocks has a single successor pointing to the other basic block, |
561 | | /// which has a single predecessor. |
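| | /// A hypothetical sketch (illustrative IR, not taken from a test case):
| | ///   bb1:
| | ///     br label %bb2
| | ///   bb2:                                   ; preds = %bb1
| | ///     ret void
| | /// Here bb1 ends in an unconditional branch and is bb2's only
| | /// predecessor, so the pair collapses into one block.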
562 | 128k | bool CodeGenPrepare::eliminateFallThrough(Function &F) { |
563 | 128k | bool Changed = false; |
564 | 128k | // Scan all of the blocks in the function, except for the entry block. |
565 | 128k | // Use a temporary array to avoid iterator being invalidated when |
566 | 128k | // deleting blocks. |
567 | 128k | SmallVector<WeakTrackingVH, 16> Blocks; |
568 | 128k | for (auto &Block : llvm::make_range(std::next(F.begin()), F.end())) |
569 | 1.93M | Blocks.push_back(&Block); |
570 | 128k | |
571 | 1.93M | for (auto &Block : Blocks) { |
572 | 1.93M | auto *BB = cast_or_null<BasicBlock>(Block); |
573 | 1.93M | if (!BB) |
574 | 0 | continue; |
575 | 1.93M | // If the destination block has a single pred, then this is a trivial |
576 | 1.93M | // edge, just collapse it. |
577 | 1.93M | BasicBlock *SinglePred = BB->getSinglePredecessor(); |
578 | 1.93M | |
579 | 1.93M | // Don't merge if BB's address is taken. |
580 | 1.93M | if (!SinglePred || SinglePred == BB1.25M || BB->hasAddressTaken()1.25M ) continue679k ; |
581 | 1.25M | |
582 | 1.25M | BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator()); |
583 | 1.25M | if (Term && !Term->isConditional()1.18M ) { |
584 | 2.89k | Changed = true; |
585 | 2.89k | LLVM_DEBUG(dbgs() << "To merge:\n" << *BB << "\n\n\n"); |
586 | 2.89k | |
587 | 2.89k | // Merge BB into SinglePred and delete it. |
588 | 2.89k | MergeBlockIntoPredecessor(BB); |
589 | 2.89k | } |
590 | 1.25M | } |
591 | 128k | return Changed; |
592 | 128k | } |
593 | | |
594 | | /// Find a destination block from BB if BB is mergeable empty block. |
595 | 2.21M | BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) { |
596 | 2.21M | // If this block doesn't end with an uncond branch, ignore it. |
597 | 2.21M | BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); |
598 | 2.21M | if (!BI || !BI->isUnconditional()1.98M ) |
599 | 1.26M | return nullptr; |
600 | 949k | |
601 | 949k | // If the instruction before the branch (skipping debug info) isn't a phi |
602 | 949k | // node, then this block contains more than PHIs and a branch; ignore it.
603 | 949k | BasicBlock::iterator BBI = BI->getIterator(); |
604 | 949k | if (BBI != BB->begin()) { |
605 | 728k | --BBI; |
606 | 728k | while (isa<DbgInfoIntrinsic>(BBI)) { |
607 | 25 | if (BBI == BB->begin()) |
608 | 4 | break; |
609 | 21 | --BBI; |
610 | 21 | } |
611 | 728k | if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI)728k ) |
612 | 711k | return nullptr; |
613 | 238k | } |
614 | 238k | |
615 | 238k | // Do not break infinite loops. |
616 | 238k | BasicBlock *DestBB = BI->getSuccessor(0); |
617 | 238k | if (DestBB == BB) |
618 | 123 | return nullptr; |
619 | 238k | |
620 | 238k | if (!canMergeBlocks(BB, DestBB)) |
621 | 17.6k | DestBB = nullptr; |
622 | 238k | |
623 | 238k | return DestBB; |
624 | 238k | } |
625 | | |
626 | | /// Eliminate blocks that contain only PHI nodes, debug info directives, and an |
627 | | /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split |
628 | | /// edges in ways that are non-optimal for isel. Start by eliminating these |
629 | | /// blocks so we can split them the way we want them. |
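| | /// For illustration (hypothetical IR), a mergeable mostly-empty block:
| | ///   split:
| | ///     %p = phi i32 [ %a, %left ], [ %b, %right ]
| | ///     br label %merge
| | /// Nothing but PHIs (and possibly debug intrinsics) precede the
| | /// unconditional branch, so split can be folded into merge.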
630 | 490k | bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) { |
631 | 490k | SmallPtrSet<BasicBlock *, 16> Preheaders; |
632 | 490k | SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end()); |
633 | 698k | while (!LoopList.empty()) { |
634 | 208k | Loop *L = LoopList.pop_back_val(); |
635 | 208k | LoopList.insert(LoopList.end(), L->begin(), L->end()); |
636 | 208k | if (BasicBlock *Preheader = L->getLoopPreheader()) |
637 | 208k | Preheaders.insert(Preheader); |
638 | 208k | } |
639 | 490k | |
640 | 490k | bool MadeChange = false; |
641 | 490k | // Copy blocks into a temporary array to avoid iterator invalidation issues |
642 | 490k | // as we remove them. |
643 | 490k | // Note that this intentionally skips the entry block. |
644 | 490k | SmallVector<WeakTrackingVH, 16> Blocks; |
645 | 490k | for (auto &Block : llvm::make_range(std::next(F.begin()), F.end())) |
646 | 2.21M | Blocks.push_back(&Block); |
647 | 490k | |
648 | 2.21M | for (auto &Block : Blocks) { |
649 | 2.21M | BasicBlock *BB = cast_or_null<BasicBlock>(Block); |
650 | 2.21M | if (!BB) |
651 | 0 | continue; |
652 | 2.21M | BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB); |
653 | 2.21M | if (!DestBB || |
654 | 2.21M | !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB))220k ) |
655 | 2.07M | continue; |
656 | 136k | |
657 | 136k | eliminateMostlyEmptyBlock(BB); |
658 | 136k | MadeChange = true; |
659 | 136k | } |
660 | 490k | return MadeChange; |
661 | 490k | } |
662 | | |
663 | | bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB, |
664 | | BasicBlock *DestBB, |
665 | 220k | bool isPreheader) { |
666 | 220k | // Do not delete loop preheaders if doing so would create a critical edge. |
667 | 220k | // Loop preheaders can be good locations to spill registers. If the |
668 | 220k | // preheader is deleted and we create a critical edge, registers may be |
669 | 220k | // spilled in the loop body instead. |
670 | 220k | if (!DisablePreheaderProtect && isPreheader220k && |
671 | 220k | !(83.0k BB->getSinglePredecessor()83.0k && |
672 | 83.0k | BB->getSinglePredecessor()->getSingleSuccessor()77.3k )) |
673 | 83.0k | return false; |
674 | 137k | |
675 | 137k | // Skip merging if the block's successor is also a successor to any callbr |
676 | 137k | // that leads to this block. |
677 | 137k | // FIXME: Is this really needed? Is this a correctness issue? |
678 | 292k | for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); 137k PI != E; ++PI155k ) { |
679 | 155k | if (auto *CBI = dyn_cast<CallBrInst>((*PI)->getTerminator())) |
680 | 0 | for (unsigned i = 0, e = CBI->getNumSuccessors(); i != e; ++i) |
681 | 0 | if (DestBB == CBI->getSuccessor(i)) |
682 | 0 | return false; |
683 | 155k | } |
684 | 137k | |
685 | 137k | // Try to skip merging if the unique predecessor of BB is terminated by a |
686 | 137k | // switch or indirect branch instruction, and BB is used as an incoming block |
687 | 137k | // of PHIs in DestBB. In such case, merging BB and DestBB would cause ISel to |
688 | 137k | // add COPY instructions in the predecessor of BB instead of BB (if it is not |
690 | 137k | // merged). Note that the critical edge created by merging such blocks won't be
690 | 137k | // split in MachineSink because the jump table is not analyzable. By keeping |
691 | 137k | // such empty block (BB), ISel will place COPY instructions in BB, not in the |
692 | 137k | // predecessor of BB. |
693 | 137k | BasicBlock *Pred = BB->getUniquePredecessor(); |
694 | 137k | if (!Pred || |
695 | 137k | !(126k isa<SwitchInst>(Pred->getTerminator())126k || |
696 | 126k | isa<IndirectBrInst>(Pred->getTerminator())125k )) |
697 | 135k | return true; |
698 | 1.44k | |
699 | 1.44k | if (BB->getTerminator() != BB->getFirstNonPHIOrDbg()) |
700 | 0 | return true; |
701 | 1.44k | |
702 | 1.44k | // We use a simple cost heuristic that deems skipping the merge
703 | 1.44k | // profitable if the cost of skipping merging is less than the cost of
704 | 1.44k | // merging: Cost(skipping merging) < Cost(merging BB), where the
705 | 1.44k | // Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)), and |
706 | 1.44k | // the Cost(merging BB) is Freq(Pred) * Cost(Copy). |
707 | 1.44k | // Assuming Cost(Copy) == Cost(Branch), we could simplify it to : |
708 | 1.44k | // Freq(Pred) / Freq(BB) > 2. |
709 | 1.44k | // Note that if there are multiple empty blocks sharing the same incoming |
710 | 1.44k | // value for the PHIs in the DestBB, we consider them together. In such
711 | 1.44k | // a case, Cost(merging BB) will be the sum of their frequencies.
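| | // Worked example (made-up frequencies, with FreqRatioToSkipMerge == 2):
| | // Freq(Pred) == 12 and Freq(BB) == 4 gives 12 > 4 * 2, so merging is
| | // judged unprofitable and BB is kept; with Freq(Pred) == 6, the test
| | // 6 <= 4 * 2 below holds and the blocks are merged.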
712 | 1.44k | |
713 | 1.44k | if (!isa<PHINode>(DestBB->begin())) |
714 | 646 | return true; |
715 | 803 | |
716 | 803 | SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs; |
717 | 803 | |
718 | 803 | // Find all other incoming blocks from which incoming values of all PHIs in |
719 | 803 | // DestBB are the same as the ones from BB. |
720 | 11.4k | for (pred_iterator PI = pred_begin(DestBB), E = pred_end(DestBB); PI != E; |
721 | 10.6k | ++PI) { |
722 | 10.6k | BasicBlock *DestBBPred = *PI; |
723 | 10.6k | if (DestBBPred == BB) |
724 | 803 | continue; |
725 | 9.84k | |
726 | 10.4k | if (9.84k llvm::all_of(DestBB->phis(), [&](const PHINode &DestPN) 9.84k { |
727 | 10.4k | return DestPN.getIncomingValueForBlock(BB) == |
728 | 10.4k | DestPN.getIncomingValueForBlock(DestBBPred); |
729 | 10.4k | })) |
730 | 386 | SameIncomingValueBBs.insert(DestBBPred); |
731 | 9.84k | } |
732 | 803 | |
733 | 803 | // See if all BB's incoming values are same as the value from Pred. In this |
734 | 803 | // case, no reason to skip merging because COPYs are expected to be place in |
735 | 803 | // Pred already. |
736 | 803 | if (SameIncomingValueBBs.count(Pred)) |
737 | 6 | return true; |
738 | 797 | |
739 | 797 | BlockFrequency PredFreq = BFI->getBlockFreq(Pred); |
740 | 797 | BlockFrequency BBFreq = BFI->getBlockFreq(BB); |
741 | 797 | |
742 | 797 | for (auto SameValueBB : SameIncomingValueBBs) |
743 | 363 | if (SameValueBB->getUniquePredecessor() == Pred && |
744 | 363 | DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB)49 ) |
745 | 15 | BBFreq += BFI->getBlockFreq(SameValueBB); |
746 | 797 | |
747 | 797 | return PredFreq.getFrequency() <= |
748 | 797 | BBFreq.getFrequency() * FreqRatioToSkipMerge; |
749 | 797 | } |
750 | | |
751 | | /// Return true if we can merge BB into DestBB if there is a single |
752 | | /// unconditional branch between them, and BB contains no other non-phi |
753 | | /// instructions. |
754 | | bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB, |
755 | 238k | const BasicBlock *DestBB) const { |
756 | 238k | // We only want to eliminate blocks whose phi nodes are used by phi nodes in |
757 | 238k | // the successor. If there are more complex condition (e.g. preheaders), |
758 | 238k | // don't mess around with them. |
759 | 238k | for (const PHINode &PN : BB->phis()) { |
760 | 20.0k | for (const User *U : PN.users()) { |
761 | 20.0k | const Instruction *UI = cast<Instruction>(U); |
762 | 20.0k | if (UI->getParent() != DestBB || !isa<PHINode>(UI)15.4k ) |
763 | 7.98k | return false; |
764 | 12.0k | // If User is inside DestBB block and it is a PHINode then check |
765 | 12.0k | // incoming value. If incoming value is not from BB then this is |
766 | 12.0k | // a complex condition (e.g. preheaders) we want to avoid here. |
767 | 12.0k | if (UI->getParent() == DestBB) { |
768 | 12.0k | if (const PHINode *UPN = dyn_cast<PHINode>(UI)) |
769 | 42.1k | for (unsigned I = 0, E = UPN->getNumIncomingValues(); 12.0k I != E; ++I30.0k ) { |
770 | 30.0k | Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I)); |
771 | 30.0k | if (Insn && Insn->getParent() == BB23.4k && |
772 | 30.0k | Insn->getParent() != UPN->getIncomingBlock(I)12.0k ) |
773 | 2 | return false; |
774 | 30.0k | } |
775 | 12.0k | } |
776 | 12.0k | } |
777 | 19.8k | } |
778 | 238k | |
779 | 238k | // If BB and DestBB contain any common predecessors, then the phi nodes in BB |
780 | 238k | // and DestBB may have conflicting incoming values for the block. If so, we |
781 | 238k | // can't merge the block. |
782 | 238k | const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin()); |
783 | 230k | if (!DestBBPN) return true88.7k ; // no conflict. |
784 | 141k | |
785 | 141k | // Collect the preds of BB. |
786 | 141k | SmallPtrSet<const BasicBlock*, 16> BBPreds; |
787 | 141k | if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) { |
788 | 9.16k | // It is faster to get preds from a PHI than with pred_iterator. |
789 | 29.8k | for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i20.7k ) |
790 | 20.7k | BBPreds.insert(BBPN->getIncomingBlock(i)); |
791 | 132k | } else { |
792 | 132k | BBPreds.insert(pred_begin(BB), pred_end(BB)); |
793 | 132k | } |
794 | 141k | |
795 | 141k | // Walk the preds of DestBB. |
796 | 598k | for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i457k ) { |
797 | 467k | BasicBlock *Pred = DestBBPN->getIncomingBlock(i); |
798 | 467k | if (BBPreds.count(Pred)) { // Common predecessor? |
799 | 10.0k | for (const PHINode &PN : DestBB->phis()) { |
800 | 10.0k | const Value *V1 = PN.getIncomingValueForBlock(Pred); |
801 | 10.0k | const Value *V2 = PN.getIncomingValueForBlock(BB); |
802 | 10.0k | |
803 | 10.0k | // If V2 is a phi node in BB, look up what the mapped value will be. |
804 | 10.0k | if (const PHINode *V2PN = dyn_cast<PHINode>(V2)) |
805 | 597 | if (V2PN->getParent() == BB) |
806 | 206 | V2 = V2PN->getIncomingValueForBlock(Pred); |
807 | 10.0k | |
808 | 10.0k | // If there is a conflict, bail out. |
809 | 10.0k | if (V1 != V2) return false9.62k ; |
810 | 10.0k | } |
811 | 9.65k | } |
812 | 467k | } |
813 | 141k | |
814 | 141k | return true131k ; |
815 | 141k | } |
816 | | |
817 | | /// Eliminate a basic block that has only phi's and an unconditional branch in |
818 | | /// it. |
819 | 136k | void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) { |
820 | 136k | BranchInst *BI = cast<BranchInst>(BB->getTerminator()); |
821 | 136k | BasicBlock *DestBB = BI->getSuccessor(0); |
822 | 136k | |
823 | 136k | LLVM_DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" |
824 | 136k | << *BB << *DestBB); |
825 | 136k | |
826 | 136k | // If the destination block has a single pred, then this is a trivial edge, |
827 | 136k | // just collapse it. |
828 | 136k | if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) { |
829 | 108 | if (SinglePred != DestBB) { |
830 | 108 | assert(SinglePred == BB && |
831 | 108 | "Single predecessor not the same as predecessor"); |
832 | 108 | // Merge DestBB into SinglePred/BB and delete it. |
833 | 108 | MergeBlockIntoPredecessor(DestBB); |
834 | 108 | // Note: BB(=SinglePred) will not be deleted on this path. |
835 | 108 | // DestBB(=its single successor) is the one that was deleted. |
836 | 108 | LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n"); |
837 | 108 | return; |
838 | 108 | } |
839 | 136k | } |
840 | 136k | |
841 | 136k | // Otherwise, we have multiple predecessors of BB. Update the PHIs in DestBB |
842 | 136k | // to handle the new incoming edges it is about to have. |
843 | 136k | for (PHINode &PN : DestBB->phis()) { |
844 | 70.4k | // Remove the incoming value for BB, and remember it. |
845 | 70.4k | Value *InVal = PN.removeIncomingValue(BB, false); |
846 | 70.4k | |
847 | 70.4k | // Two options: either the InVal is a phi node defined in BB or it is some |
848 | 70.4k | // value that dominates BB. |
849 | 70.4k | PHINode *InValPhi = dyn_cast<PHINode>(InVal); |
850 | 70.4k | if (InValPhi && InValPhi->getParent() == BB25.2k ) { |
851 | 6.27k | // Add all of the input values of the input PHI as inputs of this phi. |
852 | 21.3k | for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i15.0k ) |
853 | 15.0k | PN.addIncoming(InValPhi->getIncomingValue(i), |
854 | 15.0k | InValPhi->getIncomingBlock(i)); |
855 | 64.1k | } else { |
856 | 64.1k | // Otherwise, add one instance of the dominating value for each edge that |
857 | 64.1k | // we will be adding. |
858 | 64.1k | if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) { |
859 | 1.40k | for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i998 ) |
860 | 998 | PN.addIncoming(InVal, BBPN->getIncomingBlock(i)); |
861 | 63.7k | } else { |
862 | 129k | for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI66.1k ) |
863 | 66.1k | PN.addIncoming(InVal, *PI); |
864 | 63.7k | } |
865 | 64.1k | } |
866 | 70.4k | } |
867 | 136k | |
868 | 136k | // The PHIs are now updated, change everything that refers to BB to use |
869 | 136k | // DestBB and remove BB. |
870 | 136k | BB->replaceAllUsesWith(DestBB); |
871 | 136k | BB->eraseFromParent(); |
872 | 136k | ++NumBlocksElim; |
873 | 136k | |
874 | 136k | LLVM_DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); |
875 | 136k | } |
876 | | |
877 | | // Computes a map of base pointer relocation instructions to corresponding |
878 | | // derived pointer relocation instructions given a vector of all relocate calls |
879 | | static void computeBaseDerivedRelocateMap( |
880 | | const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls, |
881 | | DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> |
882 | 26 | &RelocateInstMap) { |
883 | 26 | // Collect information in two maps: one primarily for locating the base object |
884 | 26 | // while filling the second map; the second map is the final structure holding |
885 | 26 | // a mapping between Base and corresponding Derived relocate calls |
886 | 26 | DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap; |
887 | 64 | for (auto *ThisRelocate : AllRelocateCalls) { |
888 | 64 | auto K = std::make_pair(ThisRelocate->getBasePtrIndex(), |
889 | 64 | ThisRelocate->getDerivedPtrIndex()); |
890 | 64 | RelocateIdxMap.insert(std::make_pair(K, ThisRelocate)); |
891 | 64 | } |
892 | 64 | for (auto &Item : RelocateIdxMap) { |
893 | 64 | std::pair<unsigned, unsigned> Key = Item.first; |
894 | 64 | if (Key.first == Key.second) |
895 | 36 | // Base relocation: nothing to insert |
896 | 36 | continue; |
897 | 28 | |
898 | 28 | GCRelocateInst *I = Item.second; |
899 | 28 | auto BaseKey = std::make_pair(Key.first, Key.first); |
900 | 28 | |
901 | 28 | // We're iterating over RelocateIdxMap so we cannot modify it. |
902 | 28 | auto MaybeBase = RelocateIdxMap.find(BaseKey); |
903 | 28 | if (MaybeBase == RelocateIdxMap.end()) |
904 | 2 | // TODO: We might want to insert a new base object relocate and gep off |
905 | 2 | // that, if there are enough derived object relocates. |
906 | 2 | continue; |
907 | 26 | |
908 | 26 | RelocateInstMap[MaybeBase->second].push_back(I); |
909 | 26 | } |
910 | 26 | } |
911 | | |
912 | | // Accepts a GEP and extracts the operands into a vector provided they're all |
913 | | // small integer constants |
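| | // (e.g., for a hypothetical gep %base, i32 0, i32 3 this collects {0, 3};
| | // any non-constant index, or a constant above 20, makes it bail out)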
914 | | static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP, |
915 | 11 | SmallVectorImpl<Value *> &OffsetV) { |
916 | 21 | for (unsigned i = 1; i < GEP->getNumOperands(); i++10 ) { |
917 | 13 | // Only accept small constant integer operands |
918 | 13 | auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i)); |
919 | 13 | if (!Op || Op->getZExtValue() > 2011 ) |
920 | 3 | return false; |
921 | 13 | } |
922 | 11 | |
923 | 17 | for (unsigned i = 1; 8 i < GEP->getNumOperands(); i++9 ) |
924 | 9 | OffsetV.push_back(GEP->getOperand(i)); |
925 | 8 | return true; |
926 | 11 | } |
927 | | |
928 | | // Takes a RelocatedBase (base pointer relocation instruction) and Targets to |
929 | | // replace, computes a replacement, and applies it.
930 | | static bool |
931 | | simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase, |
932 | 18 | const SmallVectorImpl<GCRelocateInst *> &Targets) { |
933 | 18 | bool MadeChange = false; |
934 | 18 | // We must ensure the relocation of derived pointer is defined after |
935 | 18 | // relocation of base pointer. If we find a relocation corresponding to base |
936 | 18 | // defined earlier than relocation of base then we move relocation of base |
937 | 18 | // right before found relocation. We consider only relocation in the same |
938 | 18 | // basic block as relocation of base. Relocations from other basic block will |
939 | 18 | // be skipped by optimization and we do not care about them. |
940 | 18 | for (auto R = RelocatedBase->getParent()->getFirstInsertionPt(); |
941 | 64 | &*R != RelocatedBase; ++R46 ) |
942 | 52 | if (auto RI = dyn_cast<GCRelocateInst>(R)) |
943 | 12 | if (RI->getStatepoint() == RelocatedBase->getStatepoint()) |
944 | 6 | if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) { |
945 | 6 | RelocatedBase->moveBefore(RI); |
946 | 6 | break; |
947 | 6 | } |
948 | 18 | |
949 | 26 | for (GCRelocateInst *ToReplace : Targets) { |
950 | 26 | assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() && |
951 | 26 | "Not relocating a derived object of the original base object"); |
952 | 26 | if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) { |
953 | 0 | // A duplicate relocate call. TODO: coalesce duplicates. |
954 | 0 | continue; |
955 | 0 | } |
956 | 26 | |
957 | 26 | if (RelocatedBase->getParent() != ToReplace->getParent()) { |
958 | 1 | // Base and derived relocates are in different basic blocks. |
959 | 1 | // In this case transform is only valid when base dominates derived |
960 | 1 | // relocate. However it would be too expensive to check dominance |
961 | 1 | // for each such relocate, so we skip the whole transformation. |
962 | 1 | continue; |
963 | 1 | } |
964 | 25 | |
965 | 25 | Value *Base = ToReplace->getBasePtr(); |
966 | 25 | auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr()); |
967 | 25 | if (!Derived || Derived->getPointerOperand() != Base11 ) |
968 | 14 | continue; |
969 | 11 | |
970 | 11 | SmallVector<Value *, 2> OffsetV; |
971 | 11 | if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV)) |
972 | 3 | continue; |
973 | 8 | |
974 | 8 | // Create a Builder and replace the target callsite with a gep |
975 | 8 | assert(RelocatedBase->getNextNode() && |
976 | 8 | "Should always have one since it's not a terminator"); |
977 | 8 | |
978 | 8 | // Insert after RelocatedBase |
979 | 8 | IRBuilder<> Builder(RelocatedBase->getNextNode()); |
980 | 8 | Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc()); |
981 | 8 | |
982 | 8 | // If gc_relocate does not match the actual type, cast it to the right type. |
983 | 8 | // In theory, there must be a bitcast after gc_relocate if the type does not |
984 | 8 | // match, and we should reuse it to get the derived pointer. But it could be |
985 | 8 | // cases like this: |
986 | 8 | // bb1: |
987 | 8 | // ... |
988 | 8 | // %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...) |
989 | 8 | // br label %merge |
990 | 8 | // |
991 | 8 | // bb2: |
992 | 8 | // ... |
993 | 8 | // %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...) |
994 | 8 | // br label %merge |
995 | 8 | // |
996 | 8 | // merge: |
997 | 8 | // %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ] |
998 | 8 | // %cast = bitcast i8 addrspace(1)* %p1 to i32 addrspace(1)*
999 | 8 | // |
1000 | 8 | // In this case, we cannot find the bitcast anymore, so we insert a new bitcast
1001 | 8 | // whether one already exists or not. This way we handle all cases, and
1002 | 8 | // the extra bitcast should be optimized away in later passes. |
1003 | 8 | Value *ActualRelocatedBase = RelocatedBase; |
1004 | 8 | if (RelocatedBase->getType() != Base->getType()) { |
1005 | 0 | ActualRelocatedBase = |
1006 | 0 | Builder.CreateBitCast(RelocatedBase, Base->getType()); |
1007 | 0 | } |
1008 | 8 | Value *Replacement = Builder.CreateGEP( |
1009 | 8 | Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV)); |
1010 | 8 | Replacement->takeName(ToReplace); |
1011 | 8 | // If the newly generated derived pointer's type does not match the original derived |
1012 | 8 | // pointer's type, cast the new derived pointer to match it. Same reasoning as above. |
1013 | 8 | Value *ActualReplacement = Replacement; |
1014 | 8 | if (Replacement->getType() != ToReplace->getType()) { |
1015 | 0 | ActualReplacement = |
1016 | 0 | Builder.CreateBitCast(Replacement, ToReplace->getType()); |
1017 | 0 | } |
1018 | 8 | ToReplace->replaceAllUsesWith(ActualReplacement); |
1019 | 8 | ToReplace->eraseFromParent(); |
1020 | 8 | |
1021 | 8 | MadeChange = true; |
1022 | 8 | } |
1023 | 18 | return MadeChange; |
1024 | 18 | } |
1025 | | |
1026 | | // Turns this: |
1027 | | // |
1028 | | // %base = ... |
1029 | | // %ptr = gep %base + 15 |
1030 | | // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr) |
1031 | | // %base' = relocate(%tok, i32 4, i32 4) |
1032 | | // %ptr' = relocate(%tok, i32 4, i32 5) |
1033 | | // %val = load %ptr' |
1034 | | // |
1035 | | // into this: |
1036 | | // |
1037 | | // %base = ... |
1038 | | // %ptr = gep %base + 15 |
1039 | | // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr) |
1040 | | // %base' = gc.relocate(%tok, i32 4, i32 4) |
1041 | | // %ptr' = gep %base' + 15 |
1042 | | // %val = load %ptr' |
1043 | 92 | bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) { |
1044 | 92 | bool MadeChange = false; |
1045 | 92 | SmallVector<GCRelocateInst *, 2> AllRelocateCalls; |
1046 | 92 | |
1047 | 92 | for (auto *U : I.users()) |
1048 | 106 | if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U)) |
1049 | 82 | // Collect all the relocate calls associated with a statepoint |
1050 | 82 | AllRelocateCalls.push_back(Relocate); |
1051 | 92 | |
1052 | 92 | // We need at least one base pointer relocation + one derived pointer
1053 | 92 | // relocation to mangle |
1054 | 92 | if (AllRelocateCalls.size() < 2) |
1055 | 66 | return false; |
1056 | 26 | |
1057 | 26 | // RelocateInstMap is a mapping from the base relocate instruction to the |
1058 | 26 | // corresponding derived relocate instructions |
1059 | 26 | DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap; |
1060 | 26 | computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap); |
1061 | 26 | if (RelocateInstMap.empty()) |
1062 | 8 | return false; |
1063 | 18 | |
1064 | 18 | for (auto &Item : RelocateInstMap) |
1065 | 18 | // Item.first is the RelocatedBase to offset against |
1066 | 18 | // Item.second is the vector of Targets to replace |
1067 | 18 | MadeChange = simplifyRelocatesOffABase(Item.first, Item.second); |
1068 | 18 | return MadeChange; |
1069 | 18 | } |
1070 | | |
1071 | | /// Sink the specified cast instruction into its user blocks. |
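| | /// Hypothetical illustration: if %c = trunc i64 %v to i32 is defined in
| | /// %entry but only used in %then and %else, one copy of the trunc is
| | /// materialized in each user block (at its first insertion point), the
| | /// uses are rewritten, and the original cast is erased once it is dead.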
1072 | 4.15M | static bool SinkCast(CastInst *CI) { |
1073 | 4.15M | BasicBlock *DefBB = CI->getParent(); |
1074 | 4.15M | |
1075 | 4.15M | /// InsertedCasts - Only insert a cast in each block once. |
1076 | 4.15M | DenseMap<BasicBlock*, CastInst*> InsertedCasts; |
1077 | 4.15M | |
1078 | 4.15M | bool MadeChange = false; |
1079 | 4.15M | for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end(); |
1080 | 10.2M | UI != E; ) { |
1081 | 6.06M | Use &TheUse = UI.getUse(); |
1082 | 6.06M | Instruction *User = cast<Instruction>(*UI); |
1083 | 6.06M | |
1084 | 6.06M | // Figure out which BB this cast is used in. For PHI's this is the |
1085 | 6.06M | // appropriate predecessor block. |
1086 | 6.06M | BasicBlock *UserBB = User->getParent(); |
1087 | 6.06M | if (PHINode *PN = dyn_cast<PHINode>(User)) { |
1088 | 250k | UserBB = PN->getIncomingBlock(TheUse); |
1089 | 250k | } |
1090 | 6.06M | |
1091 | 6.06M | // Preincrement use iterator so we don't invalidate it. |
1092 | 6.06M | ++UI; |
1093 | 6.06M | |
1094 | 6.06M | // The first insertion point of a block containing an EH pad is after the |
1095 | 6.06M | // pad. If the pad is the user, we cannot sink the cast past the pad. |
1096 | 6.06M | if (User->isEHPad()) |
1097 | 1 | continue; |
1098 | 6.06M | |
1099 | 6.06M | // If the block selected to receive the cast is an EH pad that does not |
1100 | 6.06M | // allow non-PHI instructions before the terminator, we can't sink the |
1101 | 6.06M | // cast. |
1102 | 6.06M | if (UserBB->getTerminator()->isEHPad()) |
1103 | 0 | continue; |
1104 | 6.06M | |
1105 | 6.06M | // If this user is in the same block as the cast, don't change the cast. |
1106 | 6.06M | if (UserBB == DefBB) continue5.70M ; |
1107 | 353k | |
1108 | 353k | // If we have already inserted a cast into this block, use it. |
1109 | 353k | CastInst *&InsertedCast = InsertedCasts[UserBB]; |
1110 | 353k | |
1111 | 353k | if (!InsertedCast) { |
1112 | 308k | BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); |
1113 | 308k | assert(InsertPt != UserBB->end()); |
1114 | 308k | InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0), |
1115 | 308k | CI->getType(), "", &*InsertPt); |
1116 | 308k | InsertedCast->setDebugLoc(CI->getDebugLoc()); |
1117 | 308k | } |
1118 | 353k | |
1119 | 353k | // Replace a use of the cast with a use of the new cast. |
1120 | 353k | TheUse = InsertedCast; |
1121 | 353k | MadeChange = true; |
1122 | 353k | ++NumCastUses; |
1123 | 353k | } |
1124 | 4.15M | |
1125 | 4.15M | // If we removed all uses, nuke the cast. |
1126 | 4.15M | if (CI->use_empty()) { |
1127 | 81.4k | salvageDebugInfo(*CI); |
1128 | 81.4k | CI->eraseFromParent(); |
1129 | 81.4k | MadeChange = true; |
1130 | 81.4k | } |
1131 | 4.15M | |
1132 | 4.15M | return MadeChange; |
1133 | 4.15M | } |
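
SinkCast relies on a small DenseMap idiom: indexing with operator[] returns a
reference to a slot that value-initializes to null, so each user block gets at
most one clone of the cast. The same idiom restated as a standalone helper;
the name getOrCreateSunkCast is illustrative.

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Return the block-local copy of CI for UserBB, creating it on the first
    // request and reusing it afterwards.
    static CastInst *getOrCreateSunkCast(CastInst *CI, BasicBlock *UserBB,
                                         DenseMap<BasicBlock *, CastInst *> &Cache) {
      CastInst *&Slot = Cache[UserBB]; // default-constructed to nullptr
      if (!Slot) {
        Slot = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
                                CI->getType(), "",
                                &*UserBB->getFirstInsertionPt());
        Slot->setDebugLoc(CI->getDebugLoc());
      }
      return Slot;
    }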
1134 | | |
1135 | | /// If the specified cast instruction is a noop copy (e.g. it's casting from |
1136 | | /// one pointer type to another, i32->i8 on PPC), sink it into user blocks to |
1137 | | /// reduce the number of virtual registers that must be created and coalesced. |
1138 | | /// |
1139 | | /// Return true if any changes are made. |
1140 | | static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI, |
1141 | 5.64M | const DataLayout &DL) { |
1142 | 5.64M | // Sink only "cheap" (or nop) address-space casts. This is a weaker condition |
1143 | 5.64M | // than sinking only nop casts, but is helpful on some platforms. |
1144 | 5.64M | if (auto *ASC = dyn_cast<AddrSpaceCastInst>(CI)) { |
1145 | 299 | if (!TLI.isFreeAddrSpaceCast(ASC->getSrcAddressSpace(), |
1146 | 299 | ASC->getDestAddressSpace())) |
1147 | 156 | return false; |
1148 | 5.64M | } |
1149 | 5.64M | |
1150 | 5.64M |   // Determine whether this is a noop copy by comparing the value types.
1151 | 5.64M | EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType()); |
1152 | 5.64M | EVT DstVT = TLI.getValueType(DL, CI->getType()); |
1153 | 5.64M | |
1154 | 5.64M |   // Is this an fp<->int conversion? If so, it is not a noop copy.
1155 | 5.64M | if (SrcVT.isInteger() != DstVT.isInteger()) |
1156 | 269k | return false; |
1157 | 5.37M | |
1158 | 5.37M | // If this is an extension, it will be a zero or sign extension, which |
1159 | 5.37M | // isn't a noop. |
1160 | 5.37M | if (SrcVT.bitsLT(DstVT)) return false856k ; |
1161 | 4.52M | |
1162 | 4.52M | // If these values will be promoted, find out what they will be promoted |
1163 | 4.52M | // to. This helps us consider truncates on PPC as noop copies when they |
1164 | 4.52M | // are. |
1165 | 4.52M | if (TLI.getTypeAction(CI->getContext(), SrcVT) == |
1166 | 4.52M | TargetLowering::TypePromoteInteger) |
1167 | 11.1k | SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT); |
1168 | 4.52M | if (TLI.getTypeAction(CI->getContext(), DstVT) == |
1169 | 4.52M | TargetLowering::TypePromoteInteger) |
1170 | 90.3k | DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT); |
1171 | 4.52M | |
1172 | 4.52M | // If, after promotion, these are the same types, this is a noop copy. |
1173 | 4.52M | if (SrcVT != DstVT) |
1174 | 379k | return false; |
1175 | 4.14M | |
1176 | 4.14M | return SinkCast(CI); |
1177 | 4.14M | } |
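
The promotion handling is the subtle part of the function above: a cast whose
source and destination types are both promoted to the same legal type is a
noop even though the IR types differ (the PPC truncate case mentioned in the
comment). The test factored into a standalone predicate for clarity; this
helper is illustrative and not part of the file.

    #include "llvm/CodeGen/TargetLowering.h"
    #include "llvm/CodeGen/ValueTypes.h"
    using namespace llvm;

    // True if SrcVT and DstVT become the same type once integer promotion is
    // applied, meaning a cast between them will generate no code.
    static bool isNoopAfterPromotion(const TargetLowering &TLI,
                                     LLVMContext &Ctx, EVT SrcVT, EVT DstVT) {
      if (TLI.getTypeAction(Ctx, SrcVT) == TargetLowering::TypePromoteInteger)
        SrcVT = TLI.getTypeToTransformTo(Ctx, SrcVT);
      if (TLI.getTypeAction(Ctx, DstVT) == TargetLowering::TypePromoteInteger)
        DstVT = TLI.getTypeToTransformTo(Ctx, DstVT);
      return SrcVT == DstVT;
    }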
1178 | | |
1179 | | bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO, |
1180 | | CmpInst *Cmp, |
1181 | 8.22k | Intrinsic::ID IID) { |
1182 | 8.22k | if (BO->getParent() != Cmp->getParent()) { |
1183 | 5.30k | // We used to use a dominator tree here to allow multi-block optimization. |
1184 | 5.30k | // But that was problematic because: |
1185 | 5.30k | // 1. It could cause a perf regression by hoisting the math op into the |
1186 | 5.30k | // critical path. |
1187 | 5.30k | // 2. It could cause a perf regression by creating a value that was live |
1188 | 5.30k | // across multiple blocks and increasing register pressure. |
1189 | 5.30k | // 3. Use of a dominator tree could cause large compile-time regression. |
1190 | 5.30k | // This is because we recompute the DT on every change in the main CGP |
1191 | 5.30k | // run-loop. The recomputing is probably unnecessary in many cases, so if |
1192 | 5.30k | // that was fixed, using a DT here would be ok. |
1193 | 5.30k | return false; |
1194 | 5.30k | } |
1195 | 2.92k | |
1196 | 2.92k | // We allow matching the canonical IR (add X, C) back to (usubo X, -C). |
1197 | 2.92k | Value *Arg0 = BO->getOperand(0); |
1198 | 2.92k | Value *Arg1 = BO->getOperand(1); |
1199 | 2.92k | if (BO->getOpcode() == Instruction::Add && |
1200 | 2.92k | IID == Intrinsic::usub_with_overflow2.52k ) { |
1201 | 129 | assert(isa<Constant>(Arg1) && "Unexpected input for usubo"); |
1202 | 129 | Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1)); |
1203 | 129 | } |
1204 | 2.92k | |
1205 | 2.92k | // Insert at the first instruction of the pair. |
1206 | 2.92k | Instruction *InsertPt = nullptr; |
1207 | 210k | for (Instruction &Iter : *Cmp->getParent()) { |
1208 | 210k | if (&Iter == BO || &Iter == Cmp207k ) { |
1209 | 2.92k | InsertPt = &Iter; |
1210 | 2.92k | break; |
1211 | 2.92k | } |
1212 | 210k | } |
1213 | 2.92k | assert(InsertPt != nullptr && "Parent block did not contain cmp or binop"); |
1214 | 2.92k | |
1215 | 2.92k | IRBuilder<> Builder(InsertPt); |
1216 | 2.92k | Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1); |
1217 | 2.92k | Value *Math = Builder.CreateExtractValue(MathOV, 0, "math"); |
1218 | 2.92k | Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov"); |
1219 | 2.92k | BO->replaceAllUsesWith(Math); |
1220 | 2.92k | Cmp->replaceAllUsesWith(OV); |
1221 | 2.92k | BO->eraseFromParent(); |
1222 | 2.92k | Cmp->eraseFromParent(); |
1223 | 2.92k | return true; |
1224 | 2.92k | } |
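
The core of the replacement above is building one intrinsic call whose
aggregate result carries both the arithmetic value and the overflow bit. That
construction in isolation, with an illustrative helper name:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    #include <utility>
    using namespace llvm;

    // Emit 'IID(LHS, RHS)' and unpack the {result, overflow} aggregate.
    static std::pair<Value *, Value *>
    emitOverflowOp(Instruction *InsertPt, Intrinsic::ID IID, Value *LHS,
                   Value *RHS) {
      IRBuilder<> Builder(InsertPt);
      Value *MathOV = Builder.CreateBinaryIntrinsic(IID, LHS, RHS);
      return {Builder.CreateExtractValue(MathOV, 0, "math"),
              Builder.CreateExtractValue(MathOV, 1, "ov")};
    }

The callers then replace all uses of the original binop with the "math" value
and all uses of the compare with the "ov" bit, exactly as done above.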
1225 | | |
1226 | | /// Match special-case patterns that check for unsigned add overflow. |
1227 | | static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp, |
1228 | 3.60M | BinaryOperator *&Add) { |
1229 | 3.60M | // Add = add A, 1; Cmp = icmp eq A,-1 (overflow if A is max val) |
1230 | 3.60M | // Add = add A,-1; Cmp = icmp ne A, 0 (overflow if A is non-zero) |
1231 | 3.60M | Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1); |
1232 | 3.60M | |
1233 | 3.60M | // We are not expecting non-canonical/degenerate code. Just bail out. |
1234 | 3.60M | if (isa<Constant>(A)) |
1235 | 2.07k | return false; |
1236 | 3.60M | |
1237 | 3.60M | ICmpInst::Predicate Pred = Cmp->getPredicate(); |
1238 | 3.60M | if (Pred == ICmpInst::ICMP_EQ && match(B, m_AllOnes())2.29M ) |
1239 | 11.8k | B = ConstantInt::get(B->getType(), 1); |
1240 | 3.59M | else if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt())137k ) |
1241 | 112k | B = ConstantInt::get(B->getType(), -1); |
1242 | 3.47M | else |
1243 | 3.47M | return false; |
1244 | 124k | |
1245 | 124k | // Check the users of the variable operand of the compare looking for an add |
1246 | 124k | // with the adjusted constant. |
1247 | 353k | for (User *U : A->users())124k { |
1248 | 353k | if (match(U, m_Add(m_Specific(A), m_Specific(B)))) { |
1249 | 882 | Add = cast<BinaryOperator>(U); |
1250 | 882 | return true; |
1251 | 882 | } |
1252 | 353k | } |
1253 | 124k | return false123k ; |
1254 | 124k | } |
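
Why those two patterns are overflow checks can be verified on plain unsigned
integers; the demo below uses uint32_t purely as a stand-in for the IR types.

    #include <cassert>
    #include <cstdint>

    int main() {
      // 'add A, 1' carries out exactly when A == -1 (the unsigned maximum):
      uint32_t A = UINT32_MAX;
      assert((uint64_t)A + 1 > UINT32_MAX);
      // 'add A, -1' (i.e. A + 0xFFFFFFFF) carries out exactly when A != 0:
      uint32_t NonZero = 1, Zero = 0;
      assert((uint64_t)NonZero + UINT32_MAX > UINT32_MAX);
      assert((uint64_t)Zero + UINT32_MAX <= UINT32_MAX);
      return 0;
    }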
1255 | | |
1256 | | /// Try to combine the compare into a call to the llvm.uadd.with.overflow |
1257 | | /// intrinsic. Return true if any changes were made. |
1258 | | bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp, |
1259 | 3.60M | bool &ModifiedDT) { |
1260 | 3.60M | Value *A, *B; |
1261 | 3.60M | BinaryOperator *Add; |
1262 | 3.60M | if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) |
1263 | 3.60M | if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add)) |
1264 | 3.60M | return false; |
1265 | 3.56k | |
1266 | 3.56k | if (!TLI->shouldFormOverflowOp(ISD::UADDO, |
1267 | 3.56k | TLI->getValueType(*DL, Add->getType()))) |
1268 | 128 | return false; |
1269 | 3.43k | |
1270 | 3.43k | // We don't want to move around uses of condition values this late, so we |
1271 | 3.43k | // check if it is legal to create the call to the intrinsic in the basic |
1272 | 3.43k | // block containing the icmp. |
1273 | 3.43k | if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse()1.04k ) |
1274 | 336 | return false; |
1275 | 3.09k | |
1276 | 3.09k | if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow)) |
1277 | 704 | return false; |
1278 | 2.39k | |
1279 | 2.39k | // Reset callers - do not crash by iterating over a dead instruction. |
1280 | 2.39k | ModifiedDT = true; |
1281 | 2.39k | return true; |
1282 | 2.39k | } |
1283 | | |
1284 | | bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp, |
1285 | 3.60M | bool &ModifiedDT) { |
1286 | 3.60M | // We are not expecting non-canonical/degenerate code. Just bail out. |
1287 | 3.60M | Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1); |
1288 | 3.60M | if (isa<Constant>(A) && isa<Constant>(B)2.07k ) |
1289 | 890 | return false; |
1290 | 3.60M | |
1291 | 3.60M | // Convert (A u> B) to (A u< B) to simplify pattern matching. |
1292 | 3.60M | ICmpInst::Predicate Pred = Cmp->getPredicate(); |
1293 | 3.60M | if (Pred == ICmpInst::ICMP_UGT) { |
1294 | 127k | std::swap(A, B); |
1295 | 127k | Pred = ICmpInst::ICMP_ULT; |
1296 | 127k | } |
1297 | 3.60M | // Convert special-case: (A == 0) is the same as (A u< 1). |
1298 | 3.60M | if (Pred == ICmpInst::ICMP_EQ && match(B, m_ZeroInt())2.29M ) { |
1299 | 1.04M | B = ConstantInt::get(B->getType(), 1); |
1300 | 1.04M | Pred = ICmpInst::ICMP_ULT; |
1301 | 1.04M | } |
1302 | 3.60M | // Convert special-case: (A != 0) is the same as (0 u< A). |
1303 | 3.60M | if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt())137k ) { |
1304 | 112k | std::swap(A, B); |
1305 | 112k | Pred = ICmpInst::ICMP_ULT; |
1306 | 112k | } |
1307 | 3.60M | if (Pred != ICmpInst::ICMP_ULT) |
1308 | 2.03M | return false; |
1309 | 1.56M | |
1310 | 1.56M | // Walk the users of a variable operand of a compare looking for a subtract or |
1311 | 1.56M | // add with that same operand. Also match the 2nd operand of the compare to |
1312 | 1.56M | // the add/sub, but that may be a negated constant operand of an add. |
1313 | 1.56M | Value *CmpVariableOperand = isa<Constant>(A) ? B185k : A1.38M ; |
1314 | 1.56M | BinaryOperator *Sub = nullptr; |
1315 | 3.91M | for (User *U : CmpVariableOperand->users()) { |
1316 | 3.91M | // A - B, A u< B --> usubo(A, B) |
1317 | 3.91M | if (match(U, m_Sub(m_Specific(A), m_Specific(B)))) { |
1318 | 4.26k | Sub = cast<BinaryOperator>(U); |
1319 | 4.26k | break; |
1320 | 4.26k | } |
1321 | 3.91M | |
1322 | 3.91M | // A + (-C), A u< C (canonicalized form of (sub A, C)) |
1323 | 3.91M | const APInt *CmpC, *AddC; |
1324 | 3.91M | if (match(U, m_Add(m_Specific(A), m_APInt(AddC))) && |
1325 | 3.91M | match(B, m_APInt(CmpC))68.3k && *AddC == -(*CmpC)62.3k ) { |
1326 | 15.6k | Sub = cast<BinaryOperator>(U); |
1327 | 15.6k | break; |
1328 | 15.6k | } |
1329 | 3.91M | } |
1330 | 1.56M | if (!Sub) |
1331 | 1.54M | return false; |
1332 | 19.8k | |
1333 | 19.8k | if (!TLI->shouldFormOverflowOp(ISD::USUBO, |
1334 | 19.8k | TLI->getValueType(*DL, Sub->getType()))) |
1335 | 14.7k | return false; |
1336 | 5.12k | |
1337 | 5.12k | if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow)) |
1338 | 4.59k | return false; |
1339 | 531 | |
1340 | 531 | // Reset callers - do not crash by iterating over a dead instruction. |
1341 | 531 | ModifiedDT = true; |
1342 | 531 | return true; |
1343 | 531 | } |
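
The second match in the user loop above exists because instcombine
canonicalizes (sub A, C) into (add A, -C), so a usubo candidate can appear as
an add of the negated compare constant. That match restated in isolation; the
helper name is illustrative.

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;

    // True if U is 'add A, AddC' where AddC is the negation of the constant
    // the compare uses, i.e. the canonical spelling of 'sub A, CmpC'.
    static bool isCanonicalizedSub(User *U, Value *A, Value *B) {
      const APInt *AddC, *CmpC;
      return match(U, m_Add(m_Specific(A), m_APInt(AddC))) &&
             match(B, m_APInt(CmpC)) && *AddC == -*CmpC;
    }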
1344 | | |
1345 | | /// Sink the given CmpInst into user blocks to reduce the number of virtual |
1346 | | /// registers that must be created and coalesced. This is a clear win except on |
1347 | | /// targets with multiple condition code registers (PowerPC), where it might |
1348 | | /// lose; some adjustment may be wanted there. |
1349 | | /// |
1350 | | /// Return true if any changes are made. |
1351 | 3.64M | static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) { |
1352 | 3.64M | if (TLI.hasMultipleConditionRegisters()) |
1353 | 10.4k | return false; |
1354 | 3.63M | |
1355 | 3.63M | // Avoid sinking soft-FP comparisons, since this can move them into a loop. |
1356 | 3.63M | if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp)5.93k ) |
1357 | 1.25k | return false; |
1358 | 3.62M | |
1359 | 3.62M | // Only insert a cmp in each block once. |
1360 | 3.62M | DenseMap<BasicBlock*, CmpInst*> InsertedCmps; |
1361 | 3.62M | |
1362 | 3.62M | bool MadeChange = false; |
1363 | 3.62M | for (Value::user_iterator UI = Cmp->user_begin(), E = Cmp->user_end(); |
1364 | 7.32M | UI != E; ) { |
1365 | 3.69M | Use &TheUse = UI.getUse(); |
1366 | 3.69M | Instruction *User = cast<Instruction>(*UI); |
1367 | 3.69M | |
1368 | 3.69M | // Preincrement use iterator so we don't invalidate it. |
1369 | 3.69M | ++UI; |
1370 | 3.69M | |
1371 | 3.69M | // Don't bother for PHI nodes. |
1372 | 3.69M | if (isa<PHINode>(User)) |
1373 | 9.97k | continue; |
1374 | 3.68M | |
1375 | 3.68M | // Figure out which BB this cmp is used in. |
1376 | 3.68M | BasicBlock *UserBB = User->getParent(); |
1377 | 3.68M | BasicBlock *DefBB = Cmp->getParent(); |
1378 | 3.68M | |
1379 | 3.68M | // If this user is in the same block as the cmp, don't change the cmp. |
1380 | 3.68M | if (UserBB == DefBB) continue3.63M ; |
1381 | 51.6k | |
1382 | 51.6k | // If we have already inserted a cmp into this block, use it. |
1383 | 51.6k | CmpInst *&InsertedCmp = InsertedCmps[UserBB]; |
1384 | 51.6k | |
1385 | 51.6k | if (!InsertedCmp) { |
1386 | 51.1k | BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); |
1387 | 51.1k | assert(InsertPt != UserBB->end()); |
1388 | 51.1k | InsertedCmp = |
1389 | 51.1k | CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), |
1390 | 51.1k | Cmp->getOperand(0), Cmp->getOperand(1), "", |
1391 | 51.1k | &*InsertPt); |
1392 | 51.1k | // Propagate the debug info. |
1393 | 51.1k | InsertedCmp->setDebugLoc(Cmp->getDebugLoc()); |
1394 | 51.1k | } |
1395 | 51.6k | |
1396 | 51.6k | // Replace a use of the cmp with a use of the new cmp. |
1397 | 51.6k | TheUse = InsertedCmp; |
1398 | 51.6k | MadeChange = true; |
1399 | 51.6k | ++NumCmpUses; |
1400 | 51.6k | } |
1401 | 3.62M | |
1402 | 3.62M | // If we removed all uses, nuke the cmp. |
1403 | 3.62M | if (Cmp->use_empty()) { |
1404 | 15.4k | Cmp->eraseFromParent(); |
1405 | 15.4k | MadeChange = true; |
1406 | 15.4k | } |
1407 | 3.62M | |
1408 | 3.62M | return MadeChange; |
1409 | 3.62M | } |
1410 | | |
1411 | 3.64M | bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) { |
1412 | 3.64M | if (sinkCmpExpression(Cmp, *TLI)) |
1413 | 34.0k | return true; |
1414 | 3.60M | |
1415 | 3.60M | if (combineToUAddWithOverflow(Cmp, ModifiedDT)) |
1416 | 2.39k | return true; |
1417 | 3.60M | |
1418 | 3.60M | if (combineToUSubWithOverflow(Cmp, ModifiedDT)) |
1419 | 531 | return true; |
1420 | 3.60M | |
1421 | 3.60M | return false; |
1422 | 3.60M | } |
1423 | | |
1424 | | /// Duplicate and sink the given 'and' instruction into user blocks where it is |
1425 | | /// used in a compare to allow isel to generate better code for targets where |
1426 | | /// this operation can be combined. |
1427 | | /// |
1428 | | /// Return true if any changes are made. |
1429 | | static bool sinkAndCmp0Expression(Instruction *AndI, |
1430 | | const TargetLowering &TLI, |
1431 | 484k | SetOfInstrs &InsertedInsts) { |
1432 | 484k | // Double-check that we're not trying to optimize an instruction that was |
1433 | 484k | // already optimized by some other part of this pass. |
1434 | 484k | assert(!InsertedInsts.count(AndI) && |
1435 | 484k | "Attempting to optimize already optimized and instruction"); |
1436 | 484k | (void) InsertedInsts; |
1437 | 484k | |
1438 | 484k | // Nothing to do for single use in same basic block. |
1439 | 484k | if (AndI->hasOneUse() && |
1440 | 484k | AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent()385k ) |
1441 | 353k | return false; |
1442 | 131k | |
1443 | 131k | // Try to avoid cases where sinking/duplicating is likely to increase register |
1444 | 131k | // pressure. |
1445 | 131k | if (!isa<ConstantInt>(AndI->getOperand(0)) && |
1446 | 131k | !isa<ConstantInt>(AndI->getOperand(1))131k && |
1447 | 131k | AndI->getOperand(0)->hasOneUse()65.2k && AndI->getOperand(1)->hasOneUse()60.6k ) |
1448 | 58.9k | return false; |
1449 | 72.7k | |
1450 | 76.4k | for (auto *U : AndI->users())72.7k { |
1451 | 76.4k | Instruction *User = cast<Instruction>(U); |
1452 | 76.4k | |
1453 | 76.4k | // Only sink 'and' feeding icmp with 0. |
1454 | 76.4k | if (!isa<ICmpInst>(User)) |
1455 | 66.8k | return false; |
1456 | 9.60k | |
1457 | 9.60k | auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1)); |
1458 | 9.60k | if (!CmpC || !CmpC->isZero()5.62k ) |
1459 | 5.11k | return false; |
1460 | 9.60k | } |
1461 | 72.7k | |
1462 | 72.7k | if (853 !TLI.isMaskAndCmp0FoldingBeneficial(*AndI)853 ) |
1463 | 206 | return false; |
1464 | 647 | |
1465 | 647 | LLVM_DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n"); |
1466 | 647 | LLVM_DEBUG(AndI->getParent()->dump()); |
1467 | 647 | |
1468 | 647 | // Push the 'and' into the same block as the icmp 0. There should only be |
1469 | 647 | // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any |
1470 | 647 | // others, so we don't need to keep track of which BBs we insert into. |
1471 | 647 | for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end(); |
1472 | 2.00k | UI != E; ) { |
1473 | 1.36k | Use &TheUse = UI.getUse(); |
1474 | 1.36k | Instruction *User = cast<Instruction>(*UI); |
1475 | 1.36k | |
1476 | 1.36k | // Preincrement use iterator so we don't invalidate it. |
1477 | 1.36k | ++UI; |
1478 | 1.36k | |
1479 | 1.36k | LLVM_DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n"); |
1480 | 1.36k | |
1481 | 1.36k | // Keep the 'and' in the same place if the use is already in the same block. |
1482 | 1.36k | Instruction *InsertPt = |
1483 | 1.36k | User->getParent() == AndI->getParent() ? AndI309 : User1.05k ; |
1484 | 1.36k | Instruction *InsertedAnd = |
1485 | 1.36k | BinaryOperator::Create(Instruction::And, AndI->getOperand(0), |
1486 | 1.36k | AndI->getOperand(1), "", InsertPt); |
1487 | 1.36k | // Propagate the debug info. |
1488 | 1.36k | InsertedAnd->setDebugLoc(AndI->getDebugLoc()); |
1489 | 1.36k | |
1490 | 1.36k | // Replace a use of the 'and' with a use of the new 'and'. |
1491 | 1.36k | TheUse = InsertedAnd; |
1492 | 1.36k | ++NumAndUses; |
1493 | 1.36k | LLVM_DEBUG(User->getParent()->dump()); |
1494 | 1.36k | } |
1495 | 647 | |
1496 | 647 | // We removed all uses, nuke the and. |
1497 | 647 | AndI->eraseFromParent(); |
1498 | 647 | return true; |
1499 | 647 | } |
1500 | | |
1501 | | /// Check if the candidates could be combined with a shift instruction, which |
1502 | | /// includes: |
1503 | | /// 1. Truncate instruction |
1504 |  | /// 2. 'And' instruction whose immediate is a mask of the low bits:
1505 | | /// imm & (imm+1) == 0 |
1506 | 155k | static bool isExtractBitsCandidateUse(Instruction *User) { |
1507 | 155k | if (!isa<TruncInst>(User)) { |
1508 | 100k | if (User->getOpcode() != Instruction::And || |
1509 | 100k | !isa<ConstantInt>(User->getOperand(1))13.1k ) |
1510 | 87.6k | return false; |
1511 | 12.5k | |
1512 | 12.5k | const APInt &Cimm = cast<ConstantInt>(User->getOperand(1))->getValue(); |
1513 | 12.5k | |
1514 | 12.5k | if ((Cimm & (Cimm + 1)).getBoolValue()) |
1515 | 561 | return false; |
1516 | 67.2k | } |
1517 | 67.2k | return true; |
1518 | 67.2k | } |
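
The Cimm & (Cimm + 1) test deserves a note: adding one to a value whose set
bits form a contiguous run starting at bit 0 carries through the whole run, so
the AND is zero exactly for low-bit masks. On plain integers, as a stand-in
for the APInt logic above:

    #include <cassert>
    #include <cstdint>

    // M has the form 0...01...1 (possibly all zeros) iff M & (M + 1) == 0:
    // the +1 carries through the trailing ones onto the first zero bit.
    static bool isLowBitMask(uint64_t M) { return (M & (M + 1)) == 0; }

    int main() {
      assert(isLowBitMask(0x00) && isLowBitMask(0xFF) && isLowBitMask(0x7FFF));
      assert(!isLowBitMask(0xF0)); // ones not anchored at bit 0
      assert(!isLowBitMask(0x0B)); // 0b1011: a hole in the run of ones
      return 0;
    }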
1519 | | |
1520 | | /// Sink both shift and truncate instruction to the use of truncate's BB. |
1521 | | static bool |
1522 | | SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, |
1523 | | DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts, |
1524 | 6.56k | const TargetLowering &TLI, const DataLayout &DL) { |
1525 | 6.56k | BasicBlock *UserBB = User->getParent(); |
1526 | 6.56k | DenseMap<BasicBlock *, CastInst *> InsertedTruncs; |
1527 | 6.56k | TruncInst *TruncI = dyn_cast<TruncInst>(User); |
1528 | 6.56k | bool MadeChange = false; |
1529 | 6.56k | |
1530 | 6.56k | for (Value::user_iterator TruncUI = TruncI->user_begin(), |
1531 | 6.56k | TruncE = TruncI->user_end(); |
1532 | 14.8k | TruncUI != TruncE;) { |
1533 | 8.25k | |
1534 | 8.25k | Use &TruncTheUse = TruncUI.getUse(); |
1535 | 8.25k | Instruction *TruncUser = cast<Instruction>(*TruncUI); |
1536 | 8.25k | // Preincrement use iterator so we don't invalidate it. |
1537 | 8.25k | |
1538 | 8.25k | ++TruncUI; |
1539 | 8.25k | |
1540 | 8.25k | int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode()); |
1541 | 8.25k | if (!ISDOpcode) |
1542 | 309 | continue; |
1543 | 7.95k | |
1544 | 7.95k | // If the use is actually a legal node, there will not be an |
1545 | 7.95k | // implicit truncate. |
1546 | 7.95k | // FIXME: always querying the result type is just an |
1547 | 7.95k | // approximation; some nodes' legality is determined by the |
1548 | 7.95k | // operand or other means. There's no good way to find out though. |
1549 | 7.95k | if (TLI.isOperationLegalOrCustom( |
1550 | 7.95k | ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true))) |
1551 | 145 | continue; |
1552 | 7.80k | |
1553 | 7.80k | // Don't bother for PHI nodes. |
1554 | 7.80k | if (isa<PHINode>(TruncUser)) |
1555 | 0 | continue; |
1556 | 7.80k | |
1557 | 7.80k | BasicBlock *TruncUserBB = TruncUser->getParent(); |
1558 | 7.80k | |
1559 | 7.80k | if (UserBB == TruncUserBB) |
1560 | 7.74k | continue; |
1561 | 59 | |
1562 | 59 | BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB]; |
1563 | 59 | CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB]; |
1564 | 59 | |
1565 | 59 | if (!InsertedShift && !InsertedTrunc58 ) { |
1566 | 58 | BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt(); |
1567 | 58 | assert(InsertPt != TruncUserBB->end()); |
1568 | 58 | // Sink the shift |
1569 | 58 | if (ShiftI->getOpcode() == Instruction::AShr) |
1570 | 0 | InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, |
1571 | 0 | "", &*InsertPt); |
1572 | 58 | else |
1573 | 58 | InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, |
1574 | 58 | "", &*InsertPt); |
1575 | 58 | InsertedShift->setDebugLoc(ShiftI->getDebugLoc()); |
1576 | 58 | |
1577 | 58 | // Sink the trunc |
1578 | 58 | BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt(); |
1579 | 58 | TruncInsertPt++; |
1580 | 58 | assert(TruncInsertPt != TruncUserBB->end()); |
1581 | 58 | |
1582 | 58 | InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift, |
1583 | 58 | TruncI->getType(), "", &*TruncInsertPt); |
1584 | 58 | InsertedTrunc->setDebugLoc(TruncI->getDebugLoc()); |
1585 | 58 | |
1586 | 58 | MadeChange = true; |
1587 | 58 | |
1588 | 58 | TruncTheUse = InsertedTrunc; |
1589 | 58 | } |
1590 | 59 | } |
1591 | 6.56k | return MadeChange; |
1592 | 6.56k | } |
1593 | | |
1594 | | /// Sink the shift *right* instruction into user blocks if the uses could |
1595 |  | /// potentially be combined with this shift instruction to generate a
1596 |  | /// BitExtract instruction. It is only applied if the target architecture
1597 |  | /// supports a BitExtract instruction. Here is an example:
1598 | | /// BB1: |
1599 | | /// %x.extract.shift = lshr i64 %arg1, 32 |
1600 | | /// BB2: |
1601 | | /// %x.extract.trunc = trunc i64 %x.extract.shift to i16 |
1602 | | /// ==> |
1603 | | /// |
1604 | | /// BB2: |
1605 | | /// %x.extract.shift.1 = lshr i64 %arg1, 32 |
1606 | | /// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16 |
1607 | | /// |
1608 | | /// CodeGen will recognize the pattern in BB2 and generate BitExtract |
1609 | | /// instruction. |
1610 | | /// Return true if any changes are made. |
1611 | | static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, |
1612 | | const TargetLowering &TLI, |
1613 | 138k | const DataLayout &DL) { |
1614 | 138k | BasicBlock *DefBB = ShiftI->getParent(); |
1615 | 138k | |
1616 | 138k | /// Only insert instructions in each block once. |
1617 | 138k | DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts; |
1618 | 138k | |
1619 | 138k | bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType())); |
1620 | 138k | |
1621 | 138k | bool MadeChange = false; |
1622 | 138k | for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end(); |
1623 | 303k | UI != E;) { |
1624 | 165k | Use &TheUse = UI.getUse(); |
1625 | 165k | Instruction *User = cast<Instruction>(*UI); |
1626 | 165k | // Preincrement use iterator so we don't invalidate it. |
1627 | 165k | ++UI; |
1628 | 165k | |
1629 | 165k | // Don't bother for PHI nodes. |
1630 | 165k | if (isa<PHINode>(User)) |
1631 | 9.68k | continue; |
1632 | 155k | |
1633 | 155k | if (!isExtractBitsCandidateUse(User)) |
1634 | 88.1k | continue; |
1635 | 67.2k | |
1636 | 67.2k | BasicBlock *UserBB = User->getParent(); |
1637 | 67.2k | |
1638 | 67.2k | if (UserBB == DefBB) { |
1639 | 65.5k |       // The shift and truncate instructions are in the same BB. The use of
1640 | 65.5k |       // the truncate (TruncUse) may still introduce another truncate if it is
1641 | 65.5k |       // not legal. In this case, we would like to sink both the shift and the
1642 | 65.5k |       // truncate instruction to the BB of TruncUse.
1643 | 65.5k | // for example: |
1644 | 65.5k | // BB1: |
1645 | 65.5k | // i64 shift.result = lshr i64 opnd, imm |
1646 | 65.5k | // trunc.result = trunc shift.result to i16 |
1647 | 65.5k | // |
1648 | 65.5k | // BB2: |
1649 | 65.5k | // ----> We will have an implicit truncate here if the architecture does |
1650 | 65.5k | // not have i16 compare. |
1651 | 65.5k | // cmp i16 trunc.result, opnd2 |
1652 | 65.5k | // |
1653 | 65.5k | if (isa<TruncInst>(User) && shiftIsLegal53.7k |
1654 | 65.5k | // If the type of the truncate is legal, no truncate will be |
1655 | 65.5k | // introduced in other basic blocks. |
1656 | 65.5k | && |
1657 | 65.5k | (!TLI.isTypeLegal(TLI.getValueType(DL, User->getType())))52.6k ) |
1658 | 6.56k | MadeChange = |
1659 | 6.56k | SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL); |
1660 | 65.5k | |
1661 | 65.5k | continue; |
1662 | 65.5k | } |
1663 | 1.63k | // If we have already inserted a shift into this block, use it. |
1664 | 1.63k | BinaryOperator *&InsertedShift = InsertedShifts[UserBB]; |
1665 | 1.63k | |
1666 | 1.63k | if (!InsertedShift) { |
1667 | 1.63k | BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); |
1668 | 1.63k | assert(InsertPt != UserBB->end()); |
1669 | 1.63k | |
1670 | 1.63k | if (ShiftI->getOpcode() == Instruction::AShr) |
1671 | 30 | InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, |
1672 | 30 | "", &*InsertPt); |
1673 | 1.60k | else |
1674 | 1.60k | InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, |
1675 | 1.60k | "", &*InsertPt); |
1676 | 1.63k | InsertedShift->setDebugLoc(ShiftI->getDebugLoc()); |
1677 | 1.63k | |
1678 | 1.63k | MadeChange = true; |
1679 | 1.63k | } |
1680 | 1.63k | |
1681 | 1.63k | // Replace a use of the shift with a use of the new shift. |
1682 | 1.63k | TheUse = InsertedShift; |
1683 | 1.63k | } |
1684 | 138k | |
1685 | 138k | // If we removed all uses, nuke the shift. |
1686 | 138k | if (ShiftI->use_empty()) { |
1687 | 74 | salvageDebugInfo(*ShiftI); |
1688 | 74 | ShiftI->eraseFromParent(); |
1689 | 74 | } |
1690 | 138k | |
1691 | 138k | return MadeChange; |
1692 | 138k | } |
1693 | | |
1694 | | /// If counting leading or trailing zeros is an expensive operation and a zero |
1695 | | /// input is defined, add a check for zero to avoid calling the intrinsic. |
1696 | | /// |
1697 | | /// We want to transform: |
1698 | | /// %z = call i64 @llvm.cttz.i64(i64 %A, i1 false) |
1699 | | /// |
1700 | | /// into: |
1701 | | /// entry: |
1702 | | /// %cmpz = icmp eq i64 %A, 0 |
1703 | | /// br i1 %cmpz, label %cond.end, label %cond.false |
1704 | | /// cond.false: |
1705 | | /// %z = call i64 @llvm.cttz.i64(i64 %A, i1 true) |
1706 | | /// br label %cond.end |
1707 | | /// cond.end: |
1708 | | /// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ] |
1709 | | /// |
1710 | | /// If the transform is performed, return true and set ModifiedDT to true. |
1711 | | static bool despeculateCountZeros(IntrinsicInst *CountZeros, |
1712 | | const TargetLowering *TLI, |
1713 | | const DataLayout *DL, |
1714 | 8.47k | bool &ModifiedDT) { |
1715 | 8.47k | if (!TLI || !DL) |
1716 | 0 | return false; |
1717 | 8.47k | |
1718 | 8.47k | // If a zero input is undefined, it doesn't make sense to despeculate that. |
1719 | 8.47k | if (match(CountZeros->getOperand(1), m_One())) |
1720 | 5.79k | return false; |
1721 | 2.67k | |
1722 | 2.67k | // If it's cheap to speculate, there's nothing to do. |
1723 | 2.67k | auto IntrinsicID = CountZeros->getIntrinsicID(); |
1724 | 2.67k | if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()857 ) || |
1725 | 2.67k | (2.06k IntrinsicID == Intrinsic::ctlz2.06k && TLI->isCheapToSpeculateCtlz()1.82k )) |
1726 | 2.09k | return false; |
1727 | 588 | |
1728 | 588 | // Only handle legal scalar cases. Anything else requires too much work. |
1729 | 588 | Type *Ty = CountZeros->getType(); |
1730 | 588 | unsigned SizeInBits = Ty->getPrimitiveSizeInBits(); |
1731 | 588 | if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits()120 ) |
1732 | 487 | return false; |
1733 | 101 | |
1734 | 101 | // The intrinsic will be sunk behind a compare against zero and branch. |
1735 | 101 | BasicBlock *StartBlock = CountZeros->getParent(); |
1736 | 101 | BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false"); |
1737 | 101 | |
1738 | 101 | // Create another block after the count zero intrinsic. A PHI will be added |
1739 | 101 | // in this block to select the result of the intrinsic or the bit-width |
1740 | 101 | // constant if the input to the intrinsic is zero. |
1741 | 101 | BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros)); |
1742 | 101 | BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end"); |
1743 | 101 | |
1744 | 101 | // Set up a builder to create a compare, conditional branch, and PHI. |
1745 | 101 | IRBuilder<> Builder(CountZeros->getContext()); |
1746 | 101 | Builder.SetInsertPoint(StartBlock->getTerminator()); |
1747 | 101 | Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc()); |
1748 | 101 | |
1749 | 101 | // Replace the unconditional branch that was created by the first split with |
1750 | 101 | // a compare against zero and a conditional branch. |
1751 | 101 | Value *Zero = Constant::getNullValue(Ty); |
1752 | 101 | Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz"); |
1753 | 101 | Builder.CreateCondBr(Cmp, EndBlock, CallBlock); |
1754 | 101 | StartBlock->getTerminator()->eraseFromParent(); |
1755 | 101 | |
1756 | 101 | // Create a PHI in the end block to select either the output of the intrinsic |
1757 | 101 | // or the bit width of the operand. |
1758 | 101 | Builder.SetInsertPoint(&EndBlock->front()); |
1759 | 101 | PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz"); |
1760 | 101 | CountZeros->replaceAllUsesWith(PN); |
1761 | 101 | Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits)); |
1762 | 101 | PN->addIncoming(BitWidth, StartBlock); |
1763 | 101 | PN->addIncoming(CountZeros, CallBlock); |
1764 | 101 | |
1765 | 101 | // We are explicitly handling the zero case, so we can set the intrinsic's |
1766 | 101 | // undefined zero argument to 'true'. This will also prevent reprocessing the |
1767 | 101 | // intrinsic; we only despeculate when a zero input is defined. |
1768 | 101 | CountZeros->setArgOperand(1, Builder.getTrue()); |
1769 | 101 | ModifiedDT = true; |
1770 | 101 | return true; |
1771 | 101 | } |
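
The net effect of the CFG surgery above, restated on plain integers: the
guarded form returns the bit width for a zero input and otherwise calls the
(now zero-undef) count. The sketch uses __builtin_ctz, a GCC/Clang builtin
that is itself undefined on zero, purely as a stand-in for llvm.cttz; that
undefinedness is exactly why the guard exists.

    #include <cassert>
    #include <cstdint>

    static unsigned guardedCttz32(uint32_t X) {
      // Equivalent of: %cmpz = icmp eq X, 0; select the bit width or the call.
      return X == 0 ? 32u : (unsigned)__builtin_ctz(X);
    }

    int main() {
      assert(guardedCttz32(0) == 32);
      assert(guardedCttz32(8) == 3);
      assert(guardedCttz32(1) == 0);
      return 0;
    }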
1772 | | |
1773 | 4.03M | bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { |
1774 | 4.03M | BasicBlock *BB = CI->getParent(); |
1775 | 4.03M | |
1776 | 4.03M | // Lower inline assembly if we can. |
1777 | 4.03M |   // If we found an inline asm expression, and if the target knows how to
1778 | 4.03M | // lower it to normal LLVM code, do so now. |
1779 | 4.03M | if (TLI && isa<InlineAsm>(CI->getCalledValue())4.03M ) { |
1780 | 27.0k | if (TLI->ExpandInlineAsm(CI)) { |
1781 | 27 | // Avoid invalidating the iterator. |
1782 | 27 | CurInstIterator = BB->begin(); |
1783 | 27 | // Avoid processing instructions out of order, which could cause |
1784 | 27 | // reuse before a value is defined. |
1785 | 27 | SunkAddrs.clear(); |
1786 | 27 | return true; |
1787 | 27 | } |
1788 | 26.9k | // Sink address computing for memory operands into the block. |
1789 | 26.9k | if (optimizeInlineAsmInst(CI)) |
1790 | 5 | return true; |
1791 | 4.03M | } |
1792 | 4.03M | |
1793 | 4.03M | // Align the pointer arguments to this call if the target thinks it's a good |
1794 | 4.03M | // idea |
1795 | 4.03M | unsigned MinSize, PrefAlign; |
1796 | 4.03M | if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)4.03M ) { |
1797 | 5.42k | for (auto &Arg : CI->arg_operands()) { |
1798 | 5.42k | // We want to align both objects whose address is used directly and |
1799 | 5.42k | // objects whose address is used in casts and GEPs, though it only makes |
1800 | 5.42k | // sense for GEPs if the offset is a multiple of the desired alignment and |
1801 | 5.42k | // if size - offset meets the size threshold. |
1802 | 5.42k | if (!Arg->getType()->isPointerTy()) |
1803 | 3.04k | continue; |
1804 | 2.37k | APInt Offset(DL->getIndexSizeInBits( |
1805 | 2.37k | cast<PointerType>(Arg->getType())->getAddressSpace()), |
1806 | 2.37k | 0); |
1807 | 2.37k | Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset); |
1808 | 2.37k | uint64_t Offset2 = Offset.getLimitedValue(); |
1809 | 2.37k | if ((Offset2 & (PrefAlign-1)) != 0) |
1810 | 64 | continue; |
1811 | 2.31k | AllocaInst *AI; |
1812 | 2.31k | if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign560 && |
1813 | 2.31k | DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2321 ) |
1814 | 145 | AI->setAlignment(PrefAlign); |
1815 | 2.31k | // Global variables can only be aligned if they are defined in this |
1816 | 2.31k | // object (i.e. they are uniquely initialized in this object), and |
1817 | 2.31k | // over-aligning global variables that have an explicit section is |
1818 | 2.31k | // forbidden. |
1819 | 2.31k | GlobalVariable *GV; |
1820 | 2.31k | if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment()269 && |
1821 | 2.31k | GV->getPointerAlignment(*DL) < PrefAlign127 && |
1822 | 2.31k | DL->getTypeAllocSize(GV->getValueType()) >= |
1823 | 112 | MinSize + Offset2) |
1824 | 51 | GV->setAlignment(PrefAlign); |
1825 | 2.31k | } |
1826 | 1.35k | // If this is a memcpy (or similar) then we may be able to improve the |
1827 | 1.35k | // alignment |
1828 | 1.35k | if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) { |
1829 | 1.35k | unsigned DestAlign = getKnownAlignment(MI->getDest(), *DL); |
1830 | 1.35k | if (DestAlign > MI->getDestAlignment()) |
1831 | 566 | MI->setDestAlignment(DestAlign); |
1832 | 1.35k | if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { |
1833 | 1.02k | unsigned SrcAlign = getKnownAlignment(MTI->getSource(), *DL); |
1834 | 1.02k | if (SrcAlign > MTI->getSourceAlignment()) |
1835 | 461 | MTI->setSourceAlignment(SrcAlign); |
1836 | 1.02k | } |
1837 | 1.35k | } |
1838 | 1.35k | } |
1839 | 4.03M | |
1840 | 4.03M | // If we have a cold call site, try to sink addressing computation into the |
1841 | 4.03M | // cold block. This interacts with our handling for loads and stores to |
1842 | 4.03M | // ensure that we can fold all uses of a potential addressing computation |
1843 | 4.03M | // into their uses. TODO: generalize this to work over profiling data |
1844 | 4.03M | if (!OptSize && CI->hasFnAttr(Attribute::Cold)4.02M ) |
1845 | 679 | for (auto &Arg : CI->arg_operands()) { |
1846 | 238 | if (!Arg->getType()->isPointerTy()) |
1847 | 6 | continue; |
1848 | 232 | unsigned AS = Arg->getType()->getPointerAddressSpace(); |
1849 | 232 | return optimizeMemoryInst(CI, Arg, Arg->getType(), AS); |
1850 | 232 | } |
1851 | 4.03M | |
1852 | 4.03M | IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); |
1853 | 4.03M | if (II) { |
1854 | 940k | switch (II->getIntrinsicID()) { |
1855 | 940k | default: break883k ; |
1856 | 940k | case Intrinsic::experimental_widenable_condition: { |
1857 | 8 |       // Give up on future widening opportunities so that we can fold away dead
1858 | 8 | // paths and merge blocks before going into block-local instruction |
1859 | 8 | // selection. |
1860 | 8 | if (II->use_empty()) { |
1861 | 4 | II->eraseFromParent(); |
1862 | 4 | return true; |
1863 | 4 | } |
1864 | 4 | Constant *RetVal = ConstantInt::getTrue(II->getContext()); |
1865 | 4 | resetIteratorIfInvalidatedWhileCalling(BB, [&]() { |
1866 | 4 | replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr); |
1867 | 4 | }); |
1868 | 4 | return true; |
1869 | 4 | } |
1870 | 3.40k | case Intrinsic::objectsize: { |
1871 | 3.40k | // Lower all uses of llvm.objectsize.* |
1872 | 3.40k | Value *RetVal = |
1873 | 3.40k | lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true); |
1874 | 3.40k | |
1875 | 3.40k | resetIteratorIfInvalidatedWhileCalling(BB, [&]() { |
1876 | 3.40k | replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr); |
1877 | 3.40k | }); |
1878 | 3.40k | return true; |
1879 | 4 | } |
1880 | 19 | case Intrinsic::is_constant: { |
1881 | 19 | // If is_constant hasn't folded away yet, lower it to false now. |
1882 | 19 | Constant *RetVal = ConstantInt::get(II->getType(), 0); |
1883 | 19 | resetIteratorIfInvalidatedWhileCalling(BB, [&]() { |
1884 | 19 | replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr); |
1885 | 19 | }); |
1886 | 19 | return true; |
1887 | 4 | } |
1888 | 44.5k | case Intrinsic::aarch64_stlxr: |
1889 | 44.5k | case Intrinsic::aarch64_stxr: { |
1890 | 44.5k | ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0)); |
1891 | 44.5k | if (!ExtVal || !ExtVal->hasOneUse()8.87k || |
1892 | 44.5k | ExtVal->getParent() == CI->getParent()8.87k ) |
1893 | 44.5k | return false; |
1894 | 1 | // Sink a zext feeding stlxr/stxr before it, so it can be folded into it. |
1895 | 1 | ExtVal->moveBefore(CI); |
1896 | 1 | // Mark this instruction as "inserted by CGP", so that other |
1897 | 1 | // optimizations don't touch it. |
1898 | 1 | InsertedInsts.insert(ExtVal); |
1899 | 1 | return true; |
1900 | 1 | } |
1901 | 1 | |
1902 | 5 | case Intrinsic::launder_invariant_group: |
1903 | 5 | case Intrinsic::strip_invariant_group: { |
1904 | 5 | Value *ArgVal = II->getArgOperand(0); |
1905 | 5 | auto it = LargeOffsetGEPMap.find(II); |
1906 | 5 | if (it != LargeOffsetGEPMap.end()) { |
1907 | 1 | // Merge entries in LargeOffsetGEPMap to reflect the RAUW. |
1908 | 1 | // Make sure not to have to deal with iterator invalidation |
1909 | 1 | // after possibly adding ArgVal to LargeOffsetGEPMap. |
1910 | 1 | auto GEPs = std::move(it->second); |
1911 | 1 | LargeOffsetGEPMap[ArgVal].append(GEPs.begin(), GEPs.end()); |
1912 | 1 | LargeOffsetGEPMap.erase(II); |
1913 | 1 | } |
1914 | 5 | |
1915 | 5 | II->replaceAllUsesWith(ArgVal); |
1916 | 5 | II->eraseFromParent(); |
1917 | 5 | return true; |
1918 | 5 | } |
1919 | 8.47k | case Intrinsic::cttz: |
1920 | 8.47k | case Intrinsic::ctlz: |
1921 | 8.47k | // If counting zeros is expensive, try to avoid it. |
1922 | 8.47k | return despeculateCountZeros(II, TLI, DL, ModifiedDT); |
1923 | 883k | } |
1924 | 883k | |
1925 | 883k | if (TLI) { |
1926 | 883k | SmallVector<Value*, 2> PtrOps; |
1927 | 883k | Type *AccessTy; |
1928 | 883k | if (TLI->getAddrModeArguments(II, PtrOps, AccessTy)) |
1929 | 660 | while (338 !PtrOps.empty()) { |
1930 | 338 | Value *PtrVal = PtrOps.pop_back_val(); |
1931 | 338 | unsigned AS = PtrVal->getType()->getPointerAddressSpace(); |
1932 | 338 | if (optimizeMemoryInst(II, PtrVal, AccessTy, AS)) |
1933 | 16 | return true; |
1934 | 338 | } |
1935 | 883k | } |
1936 | 883k | } |
1937 | 4.03M | |
1938 | 4.03M | // From here on out we're working with named functions. |
1939 | 4.03M | if (3.97M !CI->getCalledFunction()3.97M ) return false149k ; |
1940 | 3.82M | |
1941 | 3.82M | // Lower all default uses of _chk calls. This is very similar |
1942 | 3.82M | // to what InstCombineCalls does, but here we are only lowering calls |
1943 | 3.82M | // to fortified library functions (e.g. __memcpy_chk) that have the default |
1944 | 3.82M | // "don't know" as the objectsize. Anything else should be left alone. |
1945 | 3.82M | FortifiedLibCallSimplifier Simplifier(TLInfo, true); |
1946 | 3.82M | if (Value *V = Simplifier.optimizeCall(CI)) { |
1947 | 3.50k | CI->replaceAllUsesWith(V); |
1948 | 3.50k | CI->eraseFromParent(); |
1949 | 3.50k | return true; |
1950 | 3.50k | } |
1951 | 3.82M | |
1952 | 3.82M | return false; |
1953 | 3.82M | } |
1954 | | |
1955 | | /// Look for opportunities to duplicate return instructions to the predecessor |
1956 | | /// to enable tail call optimizations. The case it is currently looking for is: |
1957 | | /// @code |
1958 | | /// bb0: |
1959 | | /// %tmp0 = tail call i32 @f0() |
1960 | | /// br label %return |
1961 | | /// bb1: |
1962 | | /// %tmp1 = tail call i32 @f1() |
1963 | | /// br label %return |
1964 | | /// bb2: |
1965 | | /// %tmp2 = tail call i32 @f2() |
1966 | | /// br label %return |
1967 | | /// return: |
1968 | | /// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ] |
1969 | | /// ret i32 %retval |
1970 | | /// @endcode |
1971 | | /// |
1972 | | /// => |
1973 | | /// |
1974 | | /// @code |
1975 | | /// bb0: |
1976 | | /// %tmp0 = tail call i32 @f0() |
1977 | | /// ret i32 %tmp0 |
1978 | | /// bb1: |
1979 | | /// %tmp1 = tail call i32 @f1() |
1980 | | /// ret i32 %tmp1 |
1981 | | /// bb2: |
1982 | | /// %tmp2 = tail call i32 @f2() |
1983 | | /// ret i32 %tmp2 |
1984 | | /// @endcode |
1985 | 6.33M | bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT) { |
1986 | 6.33M | if (!TLI) |
1987 | 193 | return false; |
1988 | 6.33M | |
1989 | 6.33M | ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator()); |
1990 | 6.33M | if (!RetI) |
1991 | 5.14M | return false; |
1992 | 1.19M | |
1993 | 1.19M | PHINode *PN = nullptr; |
1994 | 1.19M | BitCastInst *BCI = nullptr; |
1995 | 1.19M | Value *V = RetI->getReturnValue(); |
1996 | 1.19M | if (V) { |
1997 | 969k | BCI = dyn_cast<BitCastInst>(V); |
1998 | 969k | if (BCI) |
1999 | 16.5k | V = BCI->getOperand(0); |
2000 | 969k | |
2001 | 969k | PN = dyn_cast<PHINode>(V); |
2002 | 969k | if (!PN) |
2003 | 860k | return false; |
2004 | 330k | } |
2005 | 330k | |
2006 | 330k | if (PN && PN->getParent() != BB109k ) |
2007 | 4.65k | return false; |
2008 | 325k | |
2009 | 325k | // Make sure there are no instructions between the PHI and return, or that the |
2010 | 325k | // return is the first instruction in the block. |
2011 | 325k | if (PN) { |
2012 | 104k | BasicBlock::iterator BI = BB->begin(); |
2013 | 104k | // Skip over debug and the bitcast. |
2014 | 105k | do { ++BI; } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI105k ); |
2015 | 104k | if (&*BI != RetI) |
2016 | 23.5k | return false; |
2017 | 220k | } else { |
2018 | 220k | BasicBlock::iterator BI = BB->begin(); |
2019 | 224k | while (isa<DbgInfoIntrinsic>(BI)) ++BI4.07k ; |
2020 | 220k | if (&*BI != RetI) |
2021 | 157k | return false; |
2022 | 144k | } |
2023 | 144k | |
2024 | 144k | /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail |
2025 | 144k | /// call. |
2026 | 144k | const Function *F = BB->getParent(); |
2027 | 144k | SmallVector<CallInst*, 4> TailCalls; |
2028 | 144k | if (PN) { |
2029 | 451k | for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I369k ) { |
2030 | 369k | // Look through bitcasts. |
2031 | 369k | Value *IncomingVal = PN->getIncomingValue(I)->stripPointerCasts(); |
2032 | 369k | CallInst *CI = dyn_cast<CallInst>(IncomingVal); |
2033 | 369k | // Make sure the phi value is indeed produced by the tail call. |
2034 | 369k | if (CI && CI->hasOneUse()119k && CI->getParent() == PN->getIncomingBlock(I)104k && |
2035 | 369k | TLI->mayBeEmittedAsTailCall(CI)104k && |
2036 | 369k | attributesPermitTailCall(F, CI, RetI, *TLI)103k ) |
2037 | 103k | TailCalls.push_back(CI); |
2038 | 369k | } |
2039 | 81.4k | } else { |
2040 | 62.7k | SmallPtrSet<BasicBlock*, 4> VisitedBBs; |
2041 | 201k | for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI138k ) { |
2042 | 138k | if (!VisitedBBs.insert(*PI).second) |
2043 | 384 | continue; |
2044 | 138k | |
2045 | 138k | BasicBlock::InstListType &InstList = (*PI)->getInstList(); |
2046 | 138k | BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin(); |
2047 | 138k | BasicBlock::InstListType::reverse_iterator RE = InstList.rend(); |
2048 | 138k | do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI)136k ); |
2049 | 138k | if (RI == RE) |
2050 | 1.65k | continue; |
2051 | 136k | |
2052 | 136k | CallInst *CI = dyn_cast<CallInst>(&*RI); |
2053 | 136k | if (CI && CI->use_empty()20.1k && TLI->mayBeEmittedAsTailCall(CI)18.8k && |
2054 | 136k | attributesPermitTailCall(F, CI, RetI, *TLI)14.2k ) |
2055 | 14.1k | TailCalls.push_back(CI); |
2056 | 136k | } |
2057 | 62.7k | } |
2058 | 144k | |
2059 | 144k | bool Changed = false; |
2060 | 261k | for (unsigned i = 0, e = TailCalls.size(); i != e; ++i117k ) { |
2061 | 117k | CallInst *CI = TailCalls[i]; |
2062 | 117k | CallSite CS(CI); |
2063 | 117k | |
2064 | 117k | // Make sure the call instruction is followed by an unconditional branch to |
2065 | 117k | // the return block. |
2066 | 117k | BasicBlock *CallBB = CI->getParent(); |
2067 | 117k | BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator()); |
2068 | 117k | if (!BI || !BI->isUnconditional()117k || BI->getSuccessor(0) != BB117k ) |
2069 | 153 | continue; |
2070 | 117k | |
2071 | 117k | // Duplicate the return into CallBB. |
2072 | 117k | (void)FoldReturnIntoUncondBranch(RetI, BB, CallBB); |
2073 | 117k | ModifiedDT = Changed = true; |
2074 | 117k | ++NumRetsDup; |
2075 | 117k | } |
2076 | 144k | |
2077 | 144k | // If we eliminated all predecessors of the block, delete the block now. |
2078 | 144k | if (Changed && !BB->hasAddressTaken()21.4k && pred_begin(BB) == pred_end(BB)21.4k ) |
2079 | 5.99k | BB->eraseFromParent(); |
2080 | 144k | |
2081 | 144k | return Changed; |
2082 | 144k | } |
2083 | | |
2084 | | //===----------------------------------------------------------------------===// |
2085 | | // Memory Optimization |
2086 | | //===----------------------------------------------------------------------===// |
2087 | | |
2088 | | namespace { |
2089 | | |
2090 | | /// This is an extended version of TargetLowering::AddrMode |
2091 | | /// which holds actual Value*'s for register values. |
2092 | | struct ExtAddrMode : public TargetLowering::AddrMode { |
2093 | | Value *BaseReg = nullptr; |
2094 | | Value *ScaledReg = nullptr; |
2095 | | Value *OriginalValue = nullptr; |
2096 | | bool InBounds = true; |
2097 | | |
2098 | | enum FieldName { |
2099 | | NoField = 0x00, |
2100 | | BaseRegField = 0x01, |
2101 | | BaseGVField = 0x02, |
2102 | | BaseOffsField = 0x04, |
2103 | | ScaledRegField = 0x08, |
2104 | | ScaleField = 0x10, |
2105 | | MultipleFields = 0xff |
2106 | | }; |
2107 | | |
2108 | | |
2109 | 8.27M | ExtAddrMode() = default; |
2110 | | |
2111 | | void print(raw_ostream &OS) const; |
2112 | | void dump() const; |
2113 | | |
2114 | 106k | FieldName compare(const ExtAddrMode &other) { |
2115 | 106k | // First check that the types are the same on each field, as differing types |
2116 | 106k | // is something we can't cope with later on. |
2117 | 106k | if (BaseReg && other.BaseReg105k && |
2118 | 106k | BaseReg->getType() != other.BaseReg->getType()105k ) |
2119 | 19.8k | return MultipleFields; |
2120 | 86.6k | if (BaseGV && other.BaseGV140 && |
2121 | 86.6k | BaseGV->getType() != other.BaseGV->getType()16 ) |
2122 | 0 | return MultipleFields; |
2123 | 86.6k | if (ScaledReg && other.ScaledReg3.48k && |
2124 | 86.6k | ScaledReg->getType() != other.ScaledReg->getType()1.24k ) |
2125 | 0 | return MultipleFields; |
2126 | 86.6k | |
2127 | 86.6k | // Conservatively reject 'inbounds' mismatches. |
2128 | 86.6k | if (InBounds != other.InBounds) |
2129 | 28.8k | return MultipleFields; |
2130 | 57.7k | |
2131 | 57.7k | // Check each field to see if it differs. |
2132 | 57.7k | unsigned Result = NoField; |
2133 | 57.7k | if (BaseReg != other.BaseReg) |
2134 | 49.9k | Result |= BaseRegField; |
2135 | 57.7k | if (BaseGV != other.BaseGV) |
2136 | 171 | Result |= BaseGVField; |
2137 | 57.7k | if (BaseOffs != other.BaseOffs) |
2138 | 34.1k | Result |= BaseOffsField; |
2139 | 57.7k | if (ScaledReg != other.ScaledReg) |
2140 | 11.3k | Result |= ScaledRegField; |
2141 | 57.7k | // Don't count 0 as being a different scale, because that actually means |
2142 | 57.7k | // unscaled (which will already be counted by having no ScaledReg). |
2143 | 57.7k | if (Scale && other.Scale2.78k && Scale != other.Scale1.20k ) |
2144 | 69 | Result |= ScaleField; |
2145 | 57.7k | |
2146 | 57.7k | if (countPopulation(Result) > 1) |
2147 | 30.8k | return MultipleFields; |
2148 | 26.8k | else |
2149 | 26.8k | return static_cast<FieldName>(Result); |
2150 | 57.7k | } |
2151 | | |
2152 | | // An AddrMode is trivial if it involves no calculation i.e. it is just a base |
2153 | | // with no offset. |
2154 | 7.77M | bool isTrivial() { |
2155 | 7.77M | // An AddrMode is (BaseGV + BaseReg + BaseOffs + ScaleReg * Scale) so it is |
2156 | 7.77M | // trivial if at most one of these terms is nonzero, except that BaseGV and |
2157 | 7.77M | // BaseReg both being zero actually means a null pointer value, which we |
2158 | 7.77M | // consider to be 'non-zero' here. |
2159 | 7.77M | return !BaseOffs && !Scale2.27M && !(1.86M BaseGV1.86M && BaseReg17.3k ); |
2160 | 7.77M | } |
2161 | | |
2162 | 12.4k | Value *GetFieldAsValue(FieldName Field, Type *IntPtrTy) { |
2163 | 12.4k | switch (Field) { |
2164 | 12.4k | default: |
2165 | 0 | return nullptr; |
2166 | 12.4k | case BaseRegField: |
2167 | 7.05k | return BaseReg; |
2168 | 12.4k | case BaseGVField: |
2169 | 0 | return BaseGV; |
2170 | 12.4k | case ScaledRegField: |
2171 | 1.02k | return ScaledReg; |
2172 | 12.4k | case BaseOffsField: |
2173 | 4.35k | return ConstantInt::get(IntPtrTy, BaseOffs); |
2174 | 12.4k | } |
2175 | 12.4k | } |
2176 | | |
2177 | | void SetCombinedField(FieldName Field, Value *V, |
2178 | 875 | const SmallVectorImpl<ExtAddrMode> &AddrModes) { |
2179 | 875 | switch (Field) { |
2180 | 875 | default: |
2181 | 0 | llvm_unreachable("Unhandled fields are expected to be rejected earlier"); |
2182 | 875 | break0 ; |
2183 | 875 | case ExtAddrMode::BaseRegField: |
2184 | 139 | BaseReg = V; |
2185 | 139 | break; |
2186 | 875 | case ExtAddrMode::BaseGVField: |
2187 | 0 | // A combined BaseGV is an Instruction, not a GlobalValue, so it goes |
2188 | 0 | // in the BaseReg field. |
2189 | 0 | assert(BaseReg == nullptr); |
2190 | 0 | BaseReg = V; |
2191 | 0 | BaseGV = nullptr; |
2192 | 0 | break; |
2193 | 875 | case ExtAddrMode::ScaledRegField: |
2194 | 37 | ScaledReg = V; |
2195 | 37 | // If we have a mix of scaled and unscaled addrmodes then we want scale |
2196 | 37 | // to be the scale and not zero. |
2197 | 37 | if (!Scale) |
2198 | 19 | for (const ExtAddrMode &AM : AddrModes) |
2199 | 38 | if (AM.Scale) { |
2200 | 19 | Scale = AM.Scale; |
2201 | 19 | break; |
2202 | 19 | } |
2203 | 37 | break; |
2204 | 875 | case ExtAddrMode::BaseOffsField: |
2205 | 699 | // The offset is no longer a constant, so it goes in ScaledReg with a |
2206 | 699 | // scale of 1. |
2207 | 699 | assert(ScaledReg == nullptr); |
2208 | 699 | ScaledReg = V; |
2209 | 699 | Scale = 1; |
2210 | 699 | BaseOffs = 0; |
2211 | 699 | break; |
2212 | 875 | } |
2213 | 875 | } |
2214 | | }; |
2215 | | |
2216 | | } // end anonymous namespace |
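
A note on compare() above: each differing field sets one bit in Result, and
the popcount then separates "exactly one field differs" (a candidate for
combining) from "several differ" (MultipleFields, give up). The bit trick in
isolation; DiffBits stands for the already-computed field bitmask and the
helper name is illustrative.

    #include "llvm/Support/MathExtras.h"

    // More than one set bit means the two addressing modes differ in several
    // independent ways and cannot be merged field-by-field.
    static bool differsInExactlyOneField(unsigned DiffBits) {
      return llvm::countPopulation(DiffBits) == 1;
    }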
2217 | | |
2218 | | #ifndef NDEBUG |
2219 | | static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) { |
2220 | | AM.print(OS); |
2221 | | return OS; |
2222 | | } |
2223 | | #endif |
2224 | | |
2225 | | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2226 | | void ExtAddrMode::print(raw_ostream &OS) const { |
2227 | | bool NeedPlus = false; |
2228 | | OS << "["; |
2229 | | if (InBounds) |
2230 | | OS << "inbounds "; |
2231 | | if (BaseGV) { |
2232 | | OS << (NeedPlus ? " + " : "") |
2233 | | << "GV:"; |
2234 | | BaseGV->printAsOperand(OS, /*PrintType=*/false); |
2235 | | NeedPlus = true; |
2236 | | } |
2237 | | |
2238 | | if (BaseOffs) { |
2239 | | OS << (NeedPlus ? " + " : "") |
2240 | | << BaseOffs; |
2241 | | NeedPlus = true; |
2242 | | } |
2243 | | |
2244 | | if (BaseReg) { |
2245 | | OS << (NeedPlus ? " + " : "") |
2246 | | << "Base:"; |
2247 | | BaseReg->printAsOperand(OS, /*PrintType=*/false); |
2248 | | NeedPlus = true; |
2249 | | } |
2250 | | if (Scale) { |
2251 | | OS << (NeedPlus ? " + " : "") |
2252 | | << Scale << "*"; |
2253 | | ScaledReg->printAsOperand(OS, /*PrintType=*/false); |
2254 | | } |
2255 | | |
2256 | | OS << ']'; |
2257 | | } |
2258 | | |
2259 | | LLVM_DUMP_METHOD void ExtAddrMode::dump() const { |
2260 | | print(dbgs()); |
2261 | | dbgs() << '\n'; |
2262 | | } |
2263 | | #endif |
2264 | | |
2265 | | namespace { |
2266 | | |
2267 | | /// This class provides transaction based operation on the IR. |
2268 | | /// Every change made through this class is recorded in the internal state and |
2269 | | /// can be undone (rollback) until commit is called. |
2270 | | class TypePromotionTransaction { |
2271 | | /// This represents the common interface of the individual transaction. |
2272 | | /// Each class implements the logic for doing one specific modification on |
2273 | | /// the IR via the TypePromotionTransaction. |
2274 | | class TypePromotionAction { |
2275 | | protected: |
2276 | | /// The Instruction modified. |
2277 | | Instruction *Inst; |
2278 | | |
2279 | | public: |
2280 | | /// Constructor of the action. |
2281 | | /// The constructor performs the related action on the IR. |
2282 | 1.02M | TypePromotionAction(Instruction *Inst) : Inst(Inst) {} |
2283 | | |
2284 | 1.02M | virtual ~TypePromotionAction() = default; |
2285 | | |
2286 | | /// Undo the modification done by this action. |
2287 | | /// When this method is called, the IR must be in the same state as it was |
2288 | | /// before this action was applied. |
2289 | | /// \pre Undoing the action works if and only if the IR is in the exact same |
2290 | | /// state as it was directly after this action was applied. |
2291 | | virtual void undo() = 0; |
2292 | | |
2293 | | /// Advocate every change made by this action. |
2294 | | /// When the results on the IR of the action are to be kept, it is important |
2295 | | /// to call this function, otherwise hidden information may be kept forever. |
2296 | 169k | virtual void commit() { |
2297 | 169k | // Nothing to be done, this action is not doing anything. |
2298 | 169k | } |
2299 | | }; |
2300 | | |
2301 | | /// Utility to remember the position of an instruction. |
2302 | | class InsertionHandler { |
2303 | | /// Position of an instruction. |
2304 |  |     /// Position of an instruction.
2305 |  |     /// The instruction either:
2306 |  |     /// - is the first in a basic block, in which case BB is used, or
2307 |  |     /// - has a previous instruction, in which case PrevInst is used.
2308 | | Instruction *PrevInst; |
2309 | | BasicBlock *BB; |
2310 | | } Point; |
2311 | | |
2312 | | /// Remember whether or not the instruction had a previous instruction. |
2313 | | bool HasPrevInstruction; |
2314 | | |
2315 | | public: |
2316 | | /// Record the position of \p Inst. |
2317 | 200k | InsertionHandler(Instruction *Inst) { |
2318 | 200k | BasicBlock::iterator It = Inst->getIterator(); |
2319 | 200k | HasPrevInstruction = (It != (Inst->getParent()->begin())); |
2320 | 200k | if (HasPrevInstruction) |
2321 | 193k | Point.PrevInst = &*--It; |
2322 | 7.41k | else |
2323 | 7.41k | Point.BB = Inst->getParent(); |
2324 | 200k | } |
2325 | | |
2326 | | /// Insert \p Inst at the recorded position. |
2327 | 167k | void insert(Instruction *Inst) { |
2328 | 167k | if (HasPrevInstruction) { |
2329 | 160k | if (Inst->getParent()) |
2330 | 148k | Inst->removeFromParent(); |
2331 | 160k | Inst->insertAfter(Point.PrevInst); |
2332 | 160k | } else { |
2333 | 6.78k | Instruction *Position = &*Point.BB->getFirstInsertionPt(); |
2334 | 6.78k | if (Inst->getParent()) |
2335 | 6.42k | Inst->moveBefore(Position); |
2336 | 361 | else |
2337 | 361 | Inst->insertBefore(Position); |
2338 | 6.78k | } |
2339 | 167k | } |
2340 | | }; |
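| |
| | // Editor's sketch (not part of CodeGenPrepare or the LLVM API): the same
| | // position-recording idea, reduced to a self-contained example on a
| | // std::list<int>. As with InsertionHandler above, the position is anchored
| | // on the *previous* element (or on the container when there is none),
| | // because the tracked element may be erased and later re-inserted. All
| | // names below are hypothetical.
| | #if 0 // illustration only
| | #include <iterator>
| | #include <list>
| | struct ListPosition {
| |   std::list<int> &L;
| |   bool HasPrev;
| |   std::list<int>::iterator Prev; // stays valid when the element is erased
| |   ListPosition(std::list<int> &L, std::list<int>::iterator It)
| |       : L(L), HasPrev(It != L.begin()),
| |         Prev(HasPrev ? std::prev(It) : L.end()) {}
| |   void reinsert(int V) {
| |     if (HasPrev)
| |       L.insert(std::next(Prev), V); // right after the remembered element
| |     else
| |       L.push_front(V); // it was the first element
| |   }
| | };
| | #endif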
2341 | | |
2342 | | /// Move an instruction before another. |
2343 | | class InstructionMoveBefore : public TypePromotionAction { |
2344 | | /// Original position of the instruction. |
2345 | | InsertionHandler Position; |
2346 | | |
2347 | | public: |
2348 | | /// Move \p Inst before \p Before. |
2349 | | InstructionMoveBefore(Instruction *Inst, Instruction *Before) |
2350 | 186k | : TypePromotionAction(Inst), Position(Inst) { |
2351 | 186k | LLVM_DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before |
2352 | 186k | << "\n"); |
2353 | 186k | Inst->moveBefore(Before); |
2354 | 186k | } |
2355 | | |
2356 | | /// Move the instruction back to its original position. |
2357 | 154k | void undo() override { |
2358 | 154k | LLVM_DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n"); |
2359 | 154k | Position.insert(Inst); |
2360 | 154k | } |
2361 | | }; |
2362 | | |
2363 | | /// Set the operand of an instruction with a new value. |
2364 | | class OperandSetter : public TypePromotionAction { |
2365 | | /// Original operand of the instruction. |
2366 | | Value *Origin; |
2367 | | |
2368 | | /// Index of the modified operand.
2369 | | unsigned Idx; |
2370 | | |
2371 | | public: |
2372 | | /// Set \p Idx operand of \p Inst with \p NewVal. |
2373 | | OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal) |
2374 | 446k | : TypePromotionAction(Inst), Idx(Idx) { |
2375 | 446k | LLVM_DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n" |
2376 | 446k | << "for:" << *Inst << "\n" |
2377 | 446k | << "with:" << *NewVal << "\n"); |
2378 | 446k | Origin = Inst->getOperand(Idx); |
2379 | 446k | Inst->setOperand(Idx, NewVal); |
2380 | 446k | } |
2381 | | |
2382 | | /// Restore the original value of the instruction. |
2383 | 370k | void undo() override { |
2384 | 370k | LLVM_DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n" |
2385 | 370k | << "for: " << *Inst << "\n" |
2386 | 370k | << "with: " << *Origin << "\n"); |
2387 | 370k | Inst->setOperand(Idx, Origin); |
2388 | 370k | } |
2389 | | }; |
2390 | | |
2391 | | /// Hide the operands of an instruction. |
2392 | | /// Act as if this instruction were not using any of its operands.
2393 | | class OperandsHider : public TypePromotionAction { |
2394 | | /// The list of original operands. |
2395 | | SmallVector<Value *, 4> OriginalValues; |
2396 | | |
2397 | | public: |
2398 | | /// Remove \p Inst from the uses of the operands of \p Inst. |
2399 | 13.9k | OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) { |
2400 | 13.9k | LLVM_DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n"); |
2401 | 13.9k | unsigned NumOpnds = Inst->getNumOperands(); |
2402 | 13.9k | OriginalValues.reserve(NumOpnds); |
2403 | 27.8k | for (unsigned It = 0; It < NumOpnds; ++It13.9k ) { |
2404 | 13.9k | // Save the current operand. |
2405 | 13.9k | Value *Val = Inst->getOperand(It); |
2406 | 13.9k | OriginalValues.push_back(Val); |
2407 | 13.9k | // Set a dummy one. |
2408 | 13.9k | // We could use OperandSetter here, but that would imply an overhead |
2409 | 13.9k | // that we are not willing to pay. |
2410 | 13.9k | Inst->setOperand(It, UndefValue::get(Val->getType())); |
2411 | 13.9k | } |
2412 | 13.9k | } |
2413 | | |
2414 | | /// Restore the original list of uses. |
2415 | 12.1k | void undo() override { |
2416 | 12.1k | LLVM_DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n"); |
2417 | 24.2k | for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It12.1k ) |
2418 | 12.1k | Inst->setOperand(It, OriginalValues[It]); |
2419 | 12.1k | } |
2420 | | }; |
2421 | | |
2422 | | /// Build a truncate instruction. |
2423 | | class TruncBuilder : public TypePromotionAction { |
2424 | | Value *Val; |
2425 | | |
2426 | | public: |
2427 | | /// Build a truncate instruction of \p Opnd producing a \p Ty |
2428 | | /// result. |
2429 | | /// trunc Opnd to Ty. |
2430 | 23.0k | TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) { |
2431 | 23.0k | IRBuilder<> Builder(Opnd); |
2432 | 23.0k | Val = Builder.CreateTrunc(Opnd, Ty, "promoted"); |
2433 | 23.0k | LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n"); |
2434 | 23.0k | } |
2435 | | |
2436 | | /// Get the built value. |
2437 | 23.0k | Value *getBuiltValue() { return Val; } |
2438 | | |
2439 | | /// Remove the built instruction. |
2440 | 20.7k | void undo() override { |
2441 | 20.7k | LLVM_DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n"); |
2442 | 20.7k | if (Instruction *IVal = dyn_cast<Instruction>(Val)) |
2443 | 20.7k | IVal->eraseFromParent(); |
2444 | 20.7k | } |
2445 | | }; |
2446 | | |
2447 | | /// Build a sign extension instruction. |
2448 | | class SExtBuilder : public TypePromotionAction { |
2449 | | Value *Val; |
2450 | | |
2451 | | public: |
2452 | | /// Build a sign extension instruction of \p Opnd producing a \p Ty |
2453 | | /// result. |
2454 | | /// sext Opnd to Ty. |
2455 | | SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty) |
2456 | 62.6k | : TypePromotionAction(InsertPt) { |
2457 | 62.6k | IRBuilder<> Builder(InsertPt); |
2458 | 62.6k | Val = Builder.CreateSExt(Opnd, Ty, "promoted"); |
2459 | 62.6k | LLVM_DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n"); |
2460 | 62.6k | } |
2461 | | |
2462 | | /// Get the built value. |
2463 | 62.6k | Value *getBuiltValue() { return Val; } |
2464 | | |
2465 | | /// Remove the built instruction. |
2466 | 52.8k | void undo() override { |
2467 | 52.8k | LLVM_DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n"); |
2468 | 52.8k | if (Instruction *IVal = dyn_cast<Instruction>(Val)) |
2469 | 52.8k | IVal->eraseFromParent(); |
2470 | 52.8k | } |
2471 | | }; |
2472 | | |
2473 | | /// Build a zero extension instruction. |
2474 | | class ZExtBuilder : public TypePromotionAction { |
2475 | | Value *Val; |
2476 | | |
2477 | | public: |
2478 | | /// Build a zero extension instruction of \p Opnd producing a \p Ty |
2479 | | /// result. |
2480 | | /// zext Opnd to Ty. |
2481 | | ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty) |
2482 | 12.2k | : TypePromotionAction(InsertPt) { |
2483 | 12.2k | IRBuilder<> Builder(InsertPt); |
2484 | 12.2k | Val = Builder.CreateZExt(Opnd, Ty, "promoted"); |
2485 | 12.2k | LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n"); |
2486 | 12.2k | } |
2487 | | |
2488 | | /// Get the built value. |
2489 | 12.2k | Value *getBuiltValue() { return Val; } |
2490 | | |
2491 | | /// Remove the built instruction. |
2492 | 10.5k | void undo() override { |
2493 | 10.5k | LLVM_DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n"); |
2494 | 10.5k | if (Instruction *IVal = dyn_cast<Instruction>(Val)) |
2495 | 10.5k | IVal->eraseFromParent(); |
2496 | 10.5k | } |
2497 | | }; |
2498 | | |
2499 | | /// Mutate an instruction to another type. |
2500 | | class TypeMutator : public TypePromotionAction { |
2501 | | /// Record the original type. |
2502 | | Type *OrigTy; |
2503 | | |
2504 | | public: |
2505 | | /// Mutate the type of \p Inst into \p NewTy. |
2506 | | TypeMutator(Instruction *Inst, Type *NewTy) |
2507 | 116k | : TypePromotionAction(Inst), OrigTy(Inst->getType()) { |
2508 | 116k | LLVM_DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy |
2509 | 116k | << "\n"); |
2510 | 116k | Inst->mutateType(NewTy); |
2511 | 116k | } |
2512 | | |
2513 | | /// Mutate the instruction back to its original type. |
2514 | 94.9k | void undo() override { |
2515 | 94.9k | LLVM_DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy |
2516 | 94.9k | << "\n"); |
2517 | 94.9k | Inst->mutateType(OrigTy); |
2518 | 94.9k | } |
2519 | | }; |
2520 | | |
2521 | | /// Replace the uses of an instruction by another instruction. |
2522 | | class UsesReplacer : public TypePromotionAction { |
2523 | | /// Helper structure to keep track of the replaced uses. |
2524 | | struct InstructionAndIdx { |
2525 | | /// The instruction that uses the replaced instruction.
2526 | | Instruction *Inst; |
2527 | | |
2528 | | /// The operand index at which Inst uses the replaced instruction.
2529 | | unsigned Idx; |
2530 | | |
2531 | | InstructionAndIdx(Instruction *Inst, unsigned Idx) |
2532 | 336k | : Inst(Inst), Idx(Idx) {} |
2533 | | }; |
2534 | | |
2535 | | /// Keep track of the original uses (pair Instruction, Index). |
2536 | | SmallVector<InstructionAndIdx, 4> OriginalUses; |
2537 | | /// Keep track of the debug users. |
2538 | | SmallVector<DbgValueInst *, 1> DbgValues; |
2539 | | |
2540 | | using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator; |
2541 | | |
2542 | | public: |
2543 | | /// Replace all the uses of \p Inst with \p New.
2544 | 147k | UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) { |
2545 | 147k | LLVM_DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New |
2546 | 147k | << "\n"); |
2547 | 147k | // Record the original uses. |
2548 | 336k | for (Use &U : Inst->uses()) { |
2549 | 336k | Instruction *UserI = cast<Instruction>(U.getUser()); |
2550 | 336k | OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo())); |
2551 | 336k | } |
2552 | 147k | // Record the debug uses separately. They are not in the instruction's |
2553 | 147k | // use list, but they are replaced by RAUW. |
2554 | 147k | findDbgValues(DbgValues, Inst); |
2555 | 147k | |
2556 | 147k | // Now, we can replace the uses. |
2557 | 147k | Inst->replaceAllUsesWith(New); |
2558 | 147k | } |
2559 | | |
2560 | | /// Reassign the original uses of Inst to Inst. |
2561 | 123k | void undo() override { |
2562 | 123k | LLVM_DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n"); |
2563 | 123k | for (use_iterator UseIt = OriginalUses.begin(), |
2564 | 123k | EndIt = OriginalUses.end(); |
2565 | 425k | UseIt != EndIt; ++UseIt302k ) { |
2566 | 302k | UseIt->Inst->setOperand(UseIt->Idx, Inst); |
2567 | 302k | } |
2568 | 123k | // RAUW has replaced all original uses with references to the new value, |
2569 | 123k | // including the debug uses. Since we are undoing the replacements, |
2570 | 123k | // the original debug uses must also be reinstated to maintain the |
2571 | 123k | // correctness and utility of debug value instructions. |
2572 | 123k | for (auto *DVI: DbgValues) { |
2573 | 0 | LLVMContext &Ctx = Inst->getType()->getContext(); |
2574 | 0 | auto *MV = MetadataAsValue::get(Ctx, ValueAsMetadata::get(Inst)); |
2575 | 0 | DVI->setOperand(0, MV); |
2576 | 0 | } |
2577 | 123k | } |
2578 | | }; |
2579 | | |
2580 | | /// Remove an instruction from the IR. |
2581 | | class InstructionRemover : public TypePromotionAction { |
2582 | | /// Original position of the instruction. |
2583 | | InsertionHandler Inserter; |
2584 | | |
2585 | | /// Helper structure to hide all the links to the instruction. In other
2586 | | /// words, this helps pretend that the instruction has been removed.
2587 | | OperandsHider Hider; |
2588 | | |
2589 | | /// Keep track of the uses replaced, if any. |
2590 | | UsesReplacer *Replacer = nullptr; |
2591 | | |
2592 | | /// Keep track of instructions removed. |
2593 | | SetOfInstrs &RemovedInsts; |
2594 | | |
2595 | | public: |
2596 | | /// Remove all references to \p Inst and optionally replace all its
2597 | | /// uses with New. |
2598 | | /// \p RemovedInsts Keep track of the instructions removed by this Action. |
2599 | | /// \pre If !Inst->use_empty(), then New != nullptr |
2600 | | InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts, |
2601 | | Value *New = nullptr) |
2602 | | : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst), |
2603 | 13.9k | RemovedInsts(RemovedInsts) { |
2604 | 13.9k | if (New) |
2605 | 4.29k | Replacer = new UsesReplacer(Inst, New); |
2606 | 13.9k | LLVM_DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n"); |
2607 | 13.9k | RemovedInsts.insert(Inst); |
2608 | 13.9k | /// The instructions removed here will be freed after completing |
2609 | 13.9k | /// optimizeBlock() for all blocks as we need to keep track of the |
2610 | 13.9k | /// removed instructions during promotion. |
2611 | 13.9k | Inst->removeFromParent(); |
2612 | 13.9k | } |
2613 | | |
2614 | 13.9k | ~InstructionRemover() override { delete Replacer; } |
2615 | | |
2616 | | /// Resurrect the instruction and reassign it to the proper uses if |
2617 | | /// a new value was provided when building this action.
2618 | 12.1k | void undo() override { |
2619 | 12.1k | LLVM_DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n"); |
2620 | 12.1k | Inserter.insert(Inst); |
2621 | 12.1k | if (Replacer) |
2622 | 4.24k | Replacer->undo(); |
2623 | 12.1k | Hider.undo(); |
2624 | 12.1k | RemovedInsts.erase(Inst); |
2625 | 12.1k | } |
2626 | | }; |
2627 | | |
2628 | | public: |
2629 | | /// Restoration point. |
2630 | | /// The restoration point is a pointer to an action instead of an iterator |
2631 | | /// because the iterator may be invalidated but not the pointer. |
2632 | | using ConstRestorationPt = const TypePromotionAction *; |
2633 | | |
2634 | | TypePromotionTransaction(SetOfInstrs &RemovedInsts) |
2635 | 8.53M | : RemovedInsts(RemovedInsts) {} |
2636 | | |
2637 | | /// Commit every change made in this transaction.
2638 | | void commit(); |
2639 | | |
2640 | | /// Undo all the changes made after the given point. |
2641 | | void rollback(ConstRestorationPt Point); |
2642 | | |
2643 | | /// Get the current restoration point. |
2644 | | ConstRestorationPt getRestorationPoint() const; |
2645 | | |
2646 | | /// \name API for IR modification with state keeping to support rollback. |
2647 | | /// @{ |
2648 | | /// Same as Instruction::setOperand. |
2649 | | void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal); |
2650 | | |
2651 | | /// Same as Instruction::eraseFromParent. |
2652 | | void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr); |
2653 | | |
2654 | | /// Same as Value::replaceAllUsesWith. |
2655 | | void replaceAllUsesWith(Instruction *Inst, Value *New); |
2656 | | |
2657 | | /// Same as Value::mutateType. |
2658 | | void mutateType(Instruction *Inst, Type *NewTy); |
2659 | | |
2660 | | /// Same as IRBuilder::CreateTrunc.
2661 | | Value *createTrunc(Instruction *Opnd, Type *Ty); |
2662 | | |
2663 | | /// Same as IRBuilder::CreateSExt.
2664 | | Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty); |
2665 | | |
2666 | | /// Same as IRBuilder::CreateZExt.
2667 | | Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty); |
2668 | | |
2669 | | /// Same as Instruction::moveBefore. |
2670 | | void moveBefore(Instruction *Inst, Instruction *Before); |
2671 | | /// @} |
2672 | | |
2673 | | private: |
2674 | | /// The ordered list of actions made so far. |
2675 | | SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions; |
2676 | | |
2677 | | using CommitPt = SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator; |
2678 | | |
2679 | | SetOfInstrs &RemovedInsts; |
2680 | | }; |
2681 | | |
2682 | | } // end anonymous namespace |
2683 | | |
2684 | | void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, |
2685 | 446k | Value *NewVal) { |
2686 | 446k | Actions.push_back(llvm::make_unique<TypePromotionTransaction::OperandSetter>( |
2687 | 446k | Inst, Idx, NewVal)); |
2688 | 446k | } |
2689 | | |
2690 | | void TypePromotionTransaction::eraseInstruction(Instruction *Inst, |
2691 | 13.9k | Value *NewVal) { |
2692 | 13.9k | Actions.push_back( |
2693 | 13.9k | llvm::make_unique<TypePromotionTransaction::InstructionRemover>( |
2694 | 13.9k | Inst, RemovedInsts, NewVal)); |
2695 | 13.9k | } |
2696 | | |
2697 | | void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst, |
2698 | 143k | Value *New) { |
2699 | 143k | Actions.push_back( |
2700 | 143k | llvm::make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New)); |
2701 | 143k | } |
2702 | | |
2703 | 116k | void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) { |
2704 | 116k | Actions.push_back( |
2705 | 116k | llvm::make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy)); |
2706 | 116k | } |
2707 | | |
2708 | | Value *TypePromotionTransaction::createTrunc(Instruction *Opnd, |
2709 | 23.0k | Type *Ty) { |
2710 | 23.0k | std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty)); |
2711 | 23.0k | Value *Val = Ptr->getBuiltValue(); |
2712 | 23.0k | Actions.push_back(std::move(Ptr)); |
2713 | 23.0k | return Val; |
2714 | 23.0k | } |
2715 | | |
2716 | | Value *TypePromotionTransaction::createSExt(Instruction *Inst, |
2717 | 62.6k | Value *Opnd, Type *Ty) { |
2718 | 62.6k | std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty)); |
2719 | 62.6k | Value *Val = Ptr->getBuiltValue(); |
2720 | 62.6k | Actions.push_back(std::move(Ptr)); |
2721 | 62.6k | return Val; |
2722 | 62.6k | } |
2723 | | |
2724 | | Value *TypePromotionTransaction::createZExt(Instruction *Inst, |
2725 | 12.2k | Value *Opnd, Type *Ty) { |
2726 | 12.2k | std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty)); |
2727 | 12.2k | Value *Val = Ptr->getBuiltValue(); |
2728 | 12.2k | Actions.push_back(std::move(Ptr)); |
2729 | 12.2k | return Val; |
2730 | 12.2k | } |
2731 | | |
2732 | | void TypePromotionTransaction::moveBefore(Instruction *Inst, |
2733 | 186k | Instruction *Before) { |
2734 | 186k | Actions.push_back( |
2735 | 186k | llvm::make_unique<TypePromotionTransaction::InstructionMoveBefore>( |
2736 | 186k | Inst, Before)); |
2737 | 186k | } |
2738 | | |
2739 | | TypePromotionTransaction::ConstRestorationPt |
2740 | 29.4M | TypePromotionTransaction::getRestorationPoint() const { |
2741 | 29.4M | return !Actions.empty() ? Actions.back().get()41.4k : nullptr29.4M ; |
2742 | 29.4M | } |
2743 | | |
2744 | 7.72M | void TypePromotionTransaction::commit() { |
2745 | 7.89M | for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt; |
2746 | 7.72M | ++It169k ) |
2747 | 169k | (*It)->commit(); |
2748 | 7.72M | Actions.clear(); |
2749 | 7.72M | } |
2750 | | |
2751 | | void TypePromotionTransaction::rollback( |
2752 | 1.74M | TypePromotionTransaction::ConstRestorationPt Point) { |
2753 | 2.58M | while (!Actions.empty() && Point != Actions.back().get()854k ) { |
2754 | 835k | std::unique_ptr<TypePromotionAction> Curr = Actions.pop_back_val(); |
2755 | 835k | Curr->undo(); |
2756 | 835k | } |
2757 | 1.74M | } |
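| |
| | // Editor's sketch (not part of CodeGenPrepare): the speculative pattern the
| | // transaction enables, reduced to a self-contained command-pattern example.
| | // All names (Action, Txn, ...) are hypothetical. The real pass records a
| | // restoration point, speculatively mutates the IR, and rolls back to that
| | // point when the mutated form turns out not to be profitable.
| | #if 0 // illustration only
| | #include <memory>
| | #include <vector>
| | struct Action {
| |   virtual ~Action() = default;
| |   virtual void undo() = 0;  // restore the pre-action state
| |   virtual void commit() {}  // release any state kept for undo
| | };
| | struct Txn {
| |   std::vector<std::unique_ptr<Action>> Actions;
| |   const Action *point() const {
| |     return Actions.empty() ? nullptr : Actions.back().get();
| |   }
| |   void rollback(const Action *P) {
| |     while (!Actions.empty() && Actions.back().get() != P) {
| |       Actions.back()->undo(); // undo in reverse order of application
| |       Actions.pop_back();
| |     }
| |   }
| |   void commit() {
| |     for (auto &A : Actions)
| |       A->commit();
| |     Actions.clear();
| |   }
| | };
| | // Typical use: auto *Pt = T.point(); /* speculative edits */
| | // if (!Profitable) T.rollback(Pt); else T.commit();
| | #endif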
2758 | | |
2759 | | namespace { |
2760 | | |
2761 | | /// A helper class for matching addressing modes. |
2762 | | /// |
2763 | | /// This encapsulates the logic for matching the target-legal addressing modes. |
2764 | | class AddressingModeMatcher { |
2765 | | SmallVectorImpl<Instruction*> &AddrModeInsts; |
2766 | | const TargetLowering &TLI; |
2767 | | const TargetRegisterInfo &TRI; |
2768 | | const DataLayout &DL; |
2769 | | |
2770 | | /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and |
2771 | | /// the memory instruction that we're computing this address for. |
2772 | | Type *AccessTy; |
2773 | | unsigned AddrSpace; |
2774 | | Instruction *MemoryInst; |
2775 | | |
2776 | | /// This is the addressing mode that we're building up. This is |
2777 | | /// part of the return value of this addressing mode matching stuff. |
2778 | | ExtAddrMode &AddrMode; |
2779 | | |
2780 | | /// The instructions inserted by other CodeGenPrepare optimizations. |
2781 | | const SetOfInstrs &InsertedInsts; |
2782 | | |
2783 | | /// A map from the instructions to their type before promotion. |
2784 | | InstrToOrigTy &PromotedInsts; |
2785 | | |
2786 | | /// The ongoing transaction where every action should be registered. |
2787 | | TypePromotionTransaction &TPT; |
2788 | | |
2789 | | // A GEP whose offset is too large to be folded into the addressing mode.
2790 | | std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP; |
2791 | | |
2792 | | /// This is set to true when we should not do profitability checks. |
2793 | | /// When true, IsProfitableToFoldIntoAddressingMode always returns true. |
2794 | | bool IgnoreProfitability; |
2795 | | |
2796 | | AddressingModeMatcher( |
2797 | | SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI, |
2798 | | const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI, |
2799 | | ExtAddrMode &AM, const SetOfInstrs &InsertedInsts, |
2800 | | InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT, |
2801 | | std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP) |
2802 | | : AddrModeInsts(AMI), TLI(TLI), TRI(TRI), |
2803 | | DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS), |
2804 | | MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts), |
2805 | 8.27M | PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP) { |
2806 | 8.27M | IgnoreProfitability = false; |
2807 | 8.27M | } |
2808 | | |
2809 | | public: |
2810 | | /// Find the maximal addressing mode that a load/store of V can fold, |
2811 | | /// given an access type of AccessTy. This returns a list of involved
2812 | | /// instructions in AddrModeInsts. |
2813 | | /// \p InsertedInsts The instructions inserted by other CodeGenPrepare |
2814 | | /// optimizations. |
2815 | | /// \p PromotedInsts maps the instructions to their type before promotion. |
2816 | | /// \p TPT The ongoing transaction where every action should be registered.
2817 | | static ExtAddrMode |
2818 | | Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst, |
2819 | | SmallVectorImpl<Instruction *> &AddrModeInsts, |
2820 | | const TargetLowering &TLI, const TargetRegisterInfo &TRI, |
2821 | | const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, |
2822 | | TypePromotionTransaction &TPT, |
2823 | 7.83M | std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP) { |
2824 | 7.83M | ExtAddrMode Result; |
2825 | 7.83M | |
2826 | 7.83M | bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS, |
2827 | 7.83M | MemoryInst, Result, InsertedInsts, |
2828 | 7.83M | PromotedInsts, TPT, LargeOffsetGEP) |
2829 | 7.83M | .matchAddr(V, 0); |
2830 | 7.83M | (void)Success; assert(Success && "Couldn't select *anything*?"); |
2831 | 7.83M | return Result; |
2832 | 7.83M | } |
2833 | | |
2834 | | private: |
2835 | | bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); |
2836 | | bool matchAddr(Value *Addr, unsigned Depth); |
2837 | | bool matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth, |
2838 | | bool *MovedAway = nullptr); |
2839 | | bool isProfitableToFoldIntoAddressingMode(Instruction *I, |
2840 | | ExtAddrMode &AMBefore, |
2841 | | ExtAddrMode &AMAfter); |
2842 | | bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); |
2843 | | bool isPromotionProfitable(unsigned NewCost, unsigned OldCost, |
2844 | | Value *PromotedOperand) const; |
2845 | | }; |
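| |
| | // Editor's note: for intuition, the ExtAddrMode being matched here has the
| | // shape printed by ExtAddrMode::print above, roughly
| | //   [GV: @global + BaseOffs + Base:%base_reg + Scale*%scaled_reg]
| | // For example, a load whose address computes %base plus 4*%i plus 16 could
| | // (hypothetically, target permitting) match as [16 + Base:%base + 4*%i];
| | // the legality of each candidate is decided by TLI.isLegalAddressingMode.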
2846 | | |
2847 | | class PhiNodeSet; |
2848 | | |
2849 | | /// An iterator for PhiNodeSet. |
2850 | | class PhiNodeSetIterator { |
2851 | | PhiNodeSet * const Set; |
2852 | | size_t CurrentIndex = 0; |
2853 | | |
2854 | | public: |
2855 | | /// The constructor. Start should point to either a valid element, or be equal |
2856 | | /// to the size of the underlying SmallVector of the PhiNodeSet. |
2857 | | PhiNodeSetIterator(PhiNodeSet * const Set, size_t Start); |
2858 | | PHINode * operator*() const; |
2859 | | PhiNodeSetIterator& operator++(); |
2860 | | bool operator==(const PhiNodeSetIterator &RHS) const; |
2861 | | bool operator!=(const PhiNodeSetIterator &RHS) const; |
2862 | | }; |
2863 | | |
2864 | | /// Keeps a set of PHINodes. |
2865 | | /// |
2866 | | /// This is a minimal set implementation for a specific use case: |
2867 | | /// It is very fast when there are very few elements, but also provides good |
2868 | | /// performance when there are many. It is similar to SmallPtrSet, but also |
2869 | | /// provides iteration by insertion order, which is deterministic and stable |
2870 | | /// across runs. It is also similar to SmallSetVector, but supports removing
2871 | | /// elements in O(1) time. This is achieved by not actually removing the element
2872 | | /// from the underlying vector, which comes at the cost of using more memory,
2873 | | /// but that is fine, since PhiNodeSets are short-lived objects.
2874 | | class PhiNodeSet { |
2875 | | friend class PhiNodeSetIterator; |
2876 | | |
2877 | | using MapType = SmallDenseMap<PHINode *, size_t, 32>; |
2878 | | using iterator = PhiNodeSetIterator; |
2879 | | |
2880 | | /// Keeps the elements in the order of their insertion in the underlying |
2881 | | /// vector. To achieve constant time removal, it never deletes any element. |
2882 | | SmallVector<PHINode *, 32> NodeList; |
2883 | | |
2884 | | /// Keeps the elements in the underlying set implementation. This (and not the |
2885 | | /// NodeList defined above) is the source of truth on whether an element |
2886 | | /// is actually in the collection. |
2887 | | MapType NodeMap; |
2888 | | |
2889 | | /// Points to the first valid (not deleted) element when the set is not empty |
2890 | | /// and the value is not zero. Equals the size of the underlying vector
2891 | | /// when the set is empty. When the value is 0, as in the beginning, the |
2892 | | /// first element may or may not be valid. |
2893 | | size_t FirstValidElement = 0; |
2894 | | |
2895 | | public: |
2896 | | /// Inserts a new element to the collection. |
2897 | | /// \returns true if the element is actually added, i.e. was not in the |
2898 | | /// collection before the operation. |
2899 | 4.92k | bool insert(PHINode *Ptr) { |
2900 | 4.92k | if (NodeMap.insert(std::make_pair(Ptr, NodeList.size())).second) { |
2901 | 4.92k | NodeList.push_back(Ptr); |
2902 | 4.92k | return true; |
2903 | 4.92k | } |
2904 | 0 | return false; |
2905 | 0 | } |
2906 | | |
2907 | | /// Removes the element from the collection. |
2908 | | /// \returns whether the element is actually removed, i.e. was in the |
2909 | | /// collection before the operation. |
2910 | 161 | bool erase(PHINode *Ptr) { |
2911 | 161 | auto it = NodeMap.find(Ptr); |
2912 | 161 | if (it != NodeMap.end()) { |
2913 | 160 | NodeMap.erase(Ptr); |
2914 | 160 | SkipRemovedElements(FirstValidElement); |
2915 | 160 | return true; |
2916 | 160 | } |
2917 | 1 | return false; |
2918 | 1 | } |
2919 | | |
2920 | | /// Removes all elements and clears the collection. |
2921 | 4.53k | void clear() { |
2922 | 4.53k | NodeMap.clear(); |
2923 | 4.53k | NodeList.clear(); |
2924 | 4.53k | FirstValidElement = 0; |
2925 | 4.53k | } |
2926 | | |
2927 | | /// \returns an iterator that will iterate the elements in the order of |
2928 | | /// insertion. |
2929 | 9.21k | iterator begin() { |
2930 | 9.21k | if (FirstValidElement == 0) |
2931 | 9.21k | SkipRemovedElements(FirstValidElement); |
2932 | 9.21k | return PhiNodeSetIterator(this, FirstValidElement); |
2933 | 9.21k | } |
2934 | | |
2935 | | /// \returns an iterator that points to the end of the collection. |
2936 | 4.53k | iterator end() { return PhiNodeSetIterator(this, NodeList.size()); } |
2937 | | |
2938 | | /// Returns the number of elements in the collection. |
2939 | 6.43k | size_t size() const { |
2940 | 6.43k | return NodeMap.size(); |
2941 | 6.43k | } |
2942 | | |
2943 | | /// \returns 1 if the given element is in the collection, and 0 if otherwise. |
2944 | 702 | size_t count(PHINode *Ptr) const { |
2945 | 702 | return NodeMap.count(Ptr); |
2946 | 702 | } |
2947 | | |
2948 | | private: |
2949 | | /// Updates the CurrentIndex so that it will point to a valid element. |
2950 | | /// |
2951 | | /// If the element of NodeList at CurrentIndex is valid, it does not |
2952 | | /// change it. If there are no more valid elements, it updates CurrentIndex |
2953 | | /// to point to the end of the NodeList. |
2954 | 14.1k | void SkipRemovedElements(size_t &CurrentIndex) { |
2955 | 14.2k | while (CurrentIndex < NodeList.size()) { |
2956 | 9.61k | auto it = NodeMap.find(NodeList[CurrentIndex]); |
2957 | 9.61k | // If the element has been deleted and added again later, NodeMap will |
2958 | 9.61k | // point to a different index, so CurrentIndex will still be invalid. |
2959 | 9.61k | if (it != NodeMap.end() && it->second == CurrentIndex9.45k ) |
2960 | 9.45k | break; |
2961 | 160 | ++CurrentIndex; |
2962 | 160 | } |
2963 | 14.1k | } |
2964 | | }; |
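| |
| | // Editor's sketch (not part of LLVM): the vector-plus-map idea behind
| | // PhiNodeSet, generalized and stripped of the lazy-skip iteration. Names
| | // are hypothetical. Erase is O(1) because the vector entry is merely
| | // orphaned: membership is decided by the map alone, and iteration skips
| | // entries whose recorded index no longer matches.
| | #if 0 // illustration only
| | #include <cstddef>
| | #include <unordered_map>
| | #include <vector>
| | template <typename T> class StableOrderSet {
| |   std::vector<T *> List;               // insertion order; never shrinks
| |   std::unordered_map<T *, size_t> Map; // source of truth for membership
| | public:
| |   bool insert(T *P) {
| |     if (!Map.emplace(P, List.size()).second)
| |       return false; // already present
| |     List.push_back(P);
| |     return true;
| |   }
| |   bool erase(T *P) { return Map.erase(P) != 0; } // leaves a stale List slot
| |   bool valid(size_t I) const { // is List[I] a live element?
| |     auto It = Map.find(List[I]);
| |     return It != Map.end() && It->second == I;
| |   }
| | };
| | #endif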
2965 | | |
2966 | | PhiNodeSetIterator::PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start) |
2967 | 13.7k | : Set(Set), CurrentIndex(Start) {} |
2968 | | |
2969 | 9.45k | PHINode * PhiNodeSetIterator::operator*() const { |
2970 | 9.45k | assert(CurrentIndex < Set->NodeList.size() && |
2971 | 9.45k | "PhiNodeSet access out of range"); |
2972 | 9.45k | return Set->NodeList[CurrentIndex]; |
2973 | 9.45k | } |
2974 | | |
2975 | 4.76k | PhiNodeSetIterator& PhiNodeSetIterator::operator++() { |
2976 | 4.76k | assert(CurrentIndex < Set->NodeList.size() && |
2977 | 4.76k | "PhiNodeSet access out of range"); |
2978 | 4.76k | ++CurrentIndex; |
2979 | 4.76k | Set->SkipRemovedElements(CurrentIndex); |
2980 | 4.76k | return *this; |
2981 | 4.76k | } |
2982 | | |
2983 | 9.29k | bool PhiNodeSetIterator::operator==(const PhiNodeSetIterator &RHS) const { |
2984 | 9.29k | return CurrentIndex == RHS.CurrentIndex; |
2985 | 9.29k | } |
2986 | | |
2987 | 9.29k | bool PhiNodeSetIterator::operator!=(const PhiNodeSetIterator &RHS) const { |
2988 | 9.29k | return !((*this) == RHS); |
2989 | 9.29k | } |
2990 | | |
2991 | | /// Keep track of simplification of Phi nodes. |
2992 | | /// Accepts the set of all phi nodes and erases a phi node from this set
2993 | | /// if it is simplified. |
2994 | | class SimplificationTracker { |
2995 | | DenseMap<Value *, Value *> Storage; |
2996 | | const SimplifyQuery &SQ; |
2997 | | // Tracks newly created Phi nodes. The elements are iterated by insertion |
2998 | | // order. |
2999 | | PhiNodeSet AllPhiNodes; |
3000 | | // Tracks newly created Select nodes. |
3001 | | SmallPtrSet<SelectInst *, 32> AllSelectNodes; |
3002 | | |
3003 | | public: |
3004 | | SimplificationTracker(const SimplifyQuery &sq) |
3005 | 5.40k | : SQ(sq) {} |
3006 | | |
3007 | 19.8k | Value *Get(Value *V) { |
3008 | 19.9k | do { |
3009 | 19.9k | auto SV = Storage.find(V); |
3010 | 19.9k | if (SV == Storage.end()) |
3011 | 19.8k | return V; |
3012 | 126 | V = SV->second; |
3013 | 126 | } while (true); |
3014 | 19.8k | } |
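| |
| | // Editor's note: Get follows chains of recorded replacements, so after
| | // Put(a, b) and Put(b, c), Get(a) yields c. This assumes no cycle is ever
| | // recorded; otherwise the loop above would not terminate.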
3015 | | |
3016 | 5.68k | Value *Simplify(Value *Val) { |
3017 | 5.68k | SmallVector<Value *, 32> WorkList; |
3018 | 5.68k | SmallPtrSet<Value *, 32> Visited; |
3019 | 5.68k | WorkList.push_back(Val); |
3020 | 11.3k | while (!WorkList.empty()) { |
3021 | 5.68k | auto P = WorkList.pop_back_val(); |
3022 | 5.68k | if (!Visited.insert(P).second) |
3023 | 0 | continue; |
3024 | 5.68k | if (auto *PI = dyn_cast<Instruction>(P)) |
3025 | 5.68k | if (Value *V = SimplifyInstruction(cast<Instruction>(PI), SQ)) { |
3026 | 1 | for (auto *U : PI->users()) |
3027 | 0 | WorkList.push_back(cast<Value>(U)); |
3028 | 1 | Put(PI, V); |
3029 | 1 | PI->replaceAllUsesWith(V); |
3030 | 1 | if (auto *PHI = dyn_cast<PHINode>(PI)) |
3031 | 1 | AllPhiNodes.erase(PHI); |
3032 | 1 | if (auto *Select = dyn_cast<SelectInst>(PI)) |
3033 | 0 | AllSelectNodes.erase(Select); |
3034 | 1 | PI->eraseFromParent(); |
3035 | 1 | } |
3036 | 5.68k | } |
3037 | 5.68k | return Get(Val); |
3038 | 5.68k | } |
3039 | | |
3040 | 135 | void Put(Value *From, Value *To) { |
3041 | 135 | Storage.insert({ From, To }); |
3042 | 135 | } |
3043 | | |
3044 | 134 | void ReplacePhi(PHINode *From, PHINode *To) { |
3045 | 134 | Value* OldReplacement = Get(From); |
3046 | 136 | while (OldReplacement != From) { |
3047 | 2 | From = To; |
3048 | 2 | To = dyn_cast<PHINode>(OldReplacement); |
3049 | 2 | OldReplacement = Get(From); |
3050 | 2 | } |
3051 | 134 | assert(Get(To) == To && "Replacement PHI node is already replaced."); |
3052 | 134 | Put(From, To); |
3053 | 134 | From->replaceAllUsesWith(To); |
3054 | 134 | AllPhiNodes.erase(From); |
3055 | 134 | From->eraseFromParent(); |
3056 | 134 | } |
3057 | | |
3058 | 5.40k | PhiNodeSet& newPhiNodes() { return AllPhiNodes; } |
3059 | | |
3060 | 4.92k | void insertNewPhi(PHINode *PN) { AllPhiNodes.insert(PN); } |
3061 | | |
3062 | 764 | void insertNewSelect(SelectInst *SI) { AllSelectNodes.insert(SI); } |
3063 | | |
3064 | 875 | unsigned countNewPhiNodes() const { return AllPhiNodes.size(); } |
3065 | | |
3066 | 875 | unsigned countNewSelectNodes() const { return AllSelectNodes.size(); } |
3067 | | |
3068 | 4.53k | void destroyNewNodes(Type *CommonType) { |
3069 | 4.53k | // For safe erasing, replace the uses with dummy value first. |
3070 | 4.53k | auto Dummy = UndefValue::get(CommonType); |
3071 | 4.76k | for (auto I : AllPhiNodes) { |
3072 | 4.76k | I->replaceAllUsesWith(Dummy); |
3073 | 4.76k | I->eraseFromParent(); |
3074 | 4.76k | } |
3075 | 4.53k | AllPhiNodes.clear(); |
3076 | 4.53k | for (auto I : AllSelectNodes) { |
3077 | 26 | I->replaceAllUsesWith(Dummy); |
3078 | 26 | I->eraseFromParent(); |
3079 | 26 | } |
3080 | 4.53k | AllSelectNodes.clear(); |
3081 | 4.53k | } |
3082 | | }; |
3083 | | |
3084 | | /// A helper class for combining addressing modes. |
3085 | | class AddressingModeCombiner { |
3086 | | typedef DenseMap<Value *, Value *> FoldAddrToValueMapping; |
3087 | | typedef std::pair<PHINode *, PHINode *> PHIPair; |
3088 | | |
3089 | | private: |
3090 | | /// The addressing modes we've collected. |
3091 | | SmallVector<ExtAddrMode, 16> AddrModes; |
3092 | | |
3093 | | /// The field in which the AddrModes differ, when we have more than one. |
3094 | | ExtAddrMode::FieldName DifferentField = ExtAddrMode::NoField; |
3095 | | |
3096 | | /// Are the AddrModes that we have all just equal to their original values? |
3097 | | bool AllAddrModesTrivial = true; |
3098 | | |
3099 | | /// Common Type for all different fields in addressing modes. |
3100 | | Type *CommonType; |
3101 | | |
3102 | | /// SimplifyQuery for simplifyInstruction utility. |
3103 | | const SimplifyQuery &SQ; |
3104 | | |
3105 | | /// Original Address. |
3106 | | Value *Original; |
3107 | | |
3108 | | public: |
3109 | | AddressingModeCombiner(const SimplifyQuery &_SQ, Value *OriginalValue) |
3110 | 7.72M | : CommonType(nullptr), SQ(_SQ), Original(OriginalValue) {} |
3111 | | |
3112 | | /// Get the combined AddrMode |
3113 | 7.63M | const ExtAddrMode &getAddrMode() const { |
3114 | 7.63M | return AddrModes[0]; |
3115 | 7.63M | } |
3116 | | |
3117 | | /// Add a new AddrMode if it's compatible with the AddrModes we already |
3118 | | /// have. |
3119 | | /// \return True iff we succeeded in doing so. |
3120 | 7.83M | bool addNewAddrMode(ExtAddrMode &NewAddrMode) { |
3121 | 7.83M | // Take note of whether we have any non-trivial AddrModes: we need to detect
3122 | 7.83M | // when all AddrModes are trivial, as then we would introduce a phi or select
3123 | 7.83M | // which just duplicates what's already there. |
3124 | 7.83M | AllAddrModesTrivial = AllAddrModesTrivial && NewAddrMode.isTrivial()7.77M ; |
3125 | 7.83M | |
3126 | 7.83M | // If this is the first addrmode then everything is fine. |
3127 | 7.83M | if (AddrModes.empty()) { |
3128 | 7.72M | AddrModes.emplace_back(NewAddrMode); |
3129 | 7.72M | return true; |
3130 | 7.72M | } |
3131 | 106k | |
3132 | 106k | // Figure out how different this is from the other address modes, which we |
3133 | 106k | // can do just by comparing against the first one given that we only care |
3134 | 106k | // about the cumulative difference. |
3135 | 106k | ExtAddrMode::FieldName ThisDifferentField = |
3136 | 106k | AddrModes[0].compare(NewAddrMode); |
3137 | 106k | if (DifferentField == ExtAddrMode::NoField) |
3138 | 94.2k | DifferentField = ThisDifferentField; |
3139 | 12.2k | else if (DifferentField != ThisDifferentField) |
3140 | 5.86k | DifferentField = ExtAddrMode::MultipleFields; |
3141 | 106k | |
3142 | 106k | // If NewAddrMode differs in more than one dimension we cannot handle it. |
3143 | 106k | bool CanHandle = DifferentField != ExtAddrMode::MultipleFields; |
3144 | 106k | |
3145 | 106k | // If Scale Field is different then we reject. |
3146 | 106k | CanHandle = CanHandle && DifferentField != ExtAddrMode::ScaleField26.7k ; |
3147 | 106k | |
3148 | 106k | // We also must reject the case when base offset is different and |
3149 | 106k | // scale reg is not null; we cannot handle this case because the merge of
3150 | 106k | // the different offsets would be used as the ScaleReg.
3151 | 106k | CanHandle = CanHandle && (26.7k DifferentField != ExtAddrMode::BaseOffsField26.7k || |
3152 | 26.7k | !NewAddrMode.ScaledReg4.71k ); |
3153 | 106k | |
3154 | 106k | // We also must reject the case when the GV is different and a BaseReg is
3155 | 106k | // installed, because we want to use the base reg as the merge of the GV values.
3156 | 106k | CanHandle = CanHandle && (26.7k DifferentField != ExtAddrMode::BaseGVField26.7k || |
3157 | 26.7k | !NewAddrMode.HasBaseReg14 ); |
3158 | 106k | |
3159 | 106k | // Even if NewAddrMode is the same, we still need to collect it because the
3160 | 106k | // original value is different. Later we will need all the original values
3161 | 106k | // as anchors when finding the common Phi node.
3162 | 106k | if (CanHandle) |
3163 | 26.7k | AddrModes.emplace_back(NewAddrMode); |
3164 | 79.7k | else |
3165 | 79.7k | AddrModes.clear(); |
3166 | 106k | |
3167 | 106k | return CanHandle; |
3168 | 106k | } |
3169 | | |
3170 | | /// Combine the addressing modes we've collected into a single |
3171 | | /// addressing mode. |
3172 | | /// \return True iff we successfully combined them or we only had one so |
3173 | | /// didn't need to combine them anyway. |
3174 | 7.72M | bool combineAddrModes() { |
3175 | 7.72M | // If we have no AddrModes then they can't be combined. |
3176 | 7.72M | if (AddrModes.size() == 0) |
3177 | 79.7k | return false; |
3178 | 7.64M | |
3179 | 7.64M | // A single AddrMode can trivially be combined. |
3180 | 7.64M | if (AddrModes.size() == 1 || DifferentField == ExtAddrMode::NoField13.3k ) |
3181 | 7.63M | return true; |
3182 | 12.4k | |
3183 | 12.4k | // If the AddrModes we collected are all just equal to the value they are |
3184 | 12.4k | // derived from then combining them wouldn't do anything useful. |
3185 | 12.4k | if (AllAddrModesTrivial) |
3186 | 7.02k | return false; |
3187 | 5.41k | |
3188 | 5.41k | if (!addrModeCombiningAllowed()) |
3189 | 0 | return false; |
3190 | 5.41k | |
3191 | 5.41k | // Build a map from <original value, basic block where we saw it> to
3192 | 5.41k | // the value of the base register.
3193 | 5.41k | // Bail out if there is no common type. |
3194 | 5.41k | FoldAddrToValueMapping Map; |
3195 | 5.41k | if (!initializeMap(Map)) |
3196 | 4 | return false; |
3197 | 5.40k | |
3198 | 5.40k | Value *CommonValue = findCommon(Map); |
3199 | 5.40k | if (CommonValue) |
3200 | 875 | AddrModes[0].SetCombinedField(DifferentField, CommonValue, AddrModes); |
3201 | 5.40k | return CommonValue != nullptr; |
3202 | 5.40k | } |
3203 | | |
3204 | | private: |
3205 | | /// Initialize Map with anchor values. For each address seen,
3206 | | /// we record the value of the differing field in that address.
3207 | | /// At the same time we find a common type for the differing field, which we
3208 | | /// will use to create new Phi/Select nodes. It is kept in the CommonType field.
3209 | | /// Return false if there is no common type found. |
3210 | 5.41k | bool initializeMap(FoldAddrToValueMapping &Map) { |
3211 | 5.41k | // Keep track of keys where the value is null. We will need to replace it |
3212 | 5.41k | // with constant null when we know the common type. |
3213 | 5.41k | SmallVector<Value *, 2> NullValue; |
3214 | 5.41k | Type *IntPtrTy = SQ.DL.getIntPtrType(AddrModes[0].OriginalValue->getType()); |
3215 | 12.4k | for (auto &AM : AddrModes) { |
3216 | 12.4k | Value *DV = AM.GetFieldAsValue(DifferentField, IntPtrTy); |
3217 | 12.4k | if (DV) { |
3218 | 12.4k | auto *Type = DV->getType(); |
3219 | 12.4k | if (CommonType && CommonType != Type6.99k ) |
3220 | 4 | return false; |
3221 | 12.3k | CommonType = Type; |
3222 | 12.3k | Map[AM.OriginalValue] = DV; |
3223 | 12.3k | } else { |
3224 | 41 | NullValue.push_back(AM.OriginalValue); |
3225 | 41 | } |
3226 | 12.4k | } |
3227 | 5.41k | assert(CommonType && "At least one non-null value must be!"); |
3228 | 5.40k | for (auto *V : NullValue) |
3229 | 37 | Map[V] = Constant::getNullValue(CommonType); |
3230 | 5.40k | return true; |
3231 | 5.41k | } |
3232 | | |
3233 | | /// We have a mapping between value A and another value B, where B was a field
3234 | | /// in the addressing mode represented by A. We also have an original value C
3235 | | /// representing the address we start with. Traversing from C through phis and
3236 | | /// selects, we ended up with the A's in the map. This utility function tries to
3237 | | /// find a value V that is a field in addressing mode C such that, traversing
3238 | | /// through phi nodes and selects, we end up at the corresponding values B in
3239 | | /// the map. The utility creates new Phis/Selects if needed.
3240 | | // The simple example looks as follows: |
3241 | | // BB1: |
3242 | | // p1 = b1 + 40 |
3243 | | // br cond BB2, BB3 |
3244 | | // BB2: |
3245 | | // p2 = b2 + 40 |
3246 | | // br BB3 |
3247 | | // BB3: |
3248 | | // p = phi [p1, BB1], [p2, BB2] |
3249 | | // v = load p |
3250 | | // Map is |
3251 | | // p1 -> b1 |
3252 | | // p2 -> b2 |
3253 | | // Request is |
3254 | | // p -> ? |
3255 | | // The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3. |
3256 | 5.40k | Value *findCommon(FoldAddrToValueMapping &Map) { |
3257 | 5.40k | // Tracks the simplification of newly created phi nodes. The reason we use |
3258 | 5.40k | // this mapping is because we will add newly created Phi nodes in AddrToBase.
3259 | 5.40k | // Simplification of Phi nodes is recursive, so some Phi node may |
3260 | 5.40k | // be simplified after we added it to AddrToBase. In reality this |
3261 | 5.40k | // simplification is possible only if original phi/selects were not |
3262 | 5.40k | // simplified yet. |
3263 | 5.40k | // Using this mapping we can find the current value in AddrToBase. |
3264 | 5.40k | SimplificationTracker ST(SQ); |
3265 | 5.40k | |
3266 | 5.40k | // First step, DFS to create PHI nodes for all intermediate blocks. |
3267 | 5.40k | // Also fill traverse order for the second step. |
3268 | 5.40k | SmallVector<Value *, 32> TraverseOrder; |
3269 | 5.40k | InsertPlaceholders(Map, TraverseOrder, ST); |
3270 | 5.40k | |
3271 | 5.40k | // Second step, fill the new nodes with merged values and simplify if possible.
3272 | 5.40k | FillPlaceholders(Map, TraverseOrder, ST); |
3273 | 5.40k | |
3274 | 5.40k | if (!AddrSinkNewSelects && ST.countNewSelectNodes() > 00 ) { |
3275 | 0 | ST.destroyNewNodes(CommonType); |
3276 | 0 | return nullptr; |
3277 | 0 | } |
3278 | 5.40k | |
3279 | 5.40k | // Now we'd like to match the new Phi nodes to existing ones.
3280 | 5.40k | unsigned PhiNotMatchedCount = 0; |
3281 | 5.40k | if (!MatchPhiSet(ST, AddrSinkNewPhis, PhiNotMatchedCount)) { |
3282 | 4.53k | ST.destroyNewNodes(CommonType); |
3283 | 4.53k | return nullptr; |
3284 | 4.53k | } |
3285 | 875 | |
3286 | 875 | auto *Result = ST.Get(Map.find(Original)->second); |
3287 | 875 | if (Result) { |
3288 | 875 | NumMemoryInstsPhiCreated += ST.countNewPhiNodes() + PhiNotMatchedCount; |
3289 | 875 | NumMemoryInstsSelectCreated += ST.countNewSelectNodes(); |
3290 | 875 | } |
3291 | 875 | return Result; |
3292 | 875 | } |
3293 | | |
3294 | | /// Try to match PHI node to Candidate. |
3295 | | /// Matcher tracks the matched Phi nodes. |
3296 | | bool MatchPhiNode(PHINode *PHI, PHINode *Candidate, |
3297 | | SmallSetVector<PHIPair, 8> &Matcher, |
3298 | 13.7k | PhiNodeSet &PhiNodesToMatch) { |
3299 | 13.7k | SmallVector<PHIPair, 8> WorkList; |
3300 | 13.7k | Matcher.insert({ PHI, Candidate }); |
3301 | 13.7k | SmallSet<PHINode *, 8> MatchedPHIs; |
3302 | 13.7k | MatchedPHIs.insert(PHI); |
3303 | 13.7k | WorkList.push_back({ PHI, Candidate }); |
3304 | 13.7k | SmallSet<PHIPair, 8> Visited; |
3305 | 13.8k | while (!WorkList.empty()) { |
3306 | 13.7k | auto Item = WorkList.pop_back_val(); |
3307 | 13.7k | if (!Visited.insert(Item).second) |
3308 | 3 | continue; |
3309 | 13.7k | // We iterate over all incoming values of the Phi to compare them.
3310 | 13.7k | // If the values are different, both of them are Phis, the first one is a
3311 | 13.7k | // Phi we added (subject to match), and both are in the same basic
3312 | 13.7k | // block, then we can match our pair if the values match. So we state that
3313 | 13.7k | // these values match and add the pair to the work list to verify that.
3314 | 14.2k | for (auto B : Item.first->blocks())13.7k { |
3315 | 14.2k | Value *FirstValue = Item.first->getIncomingValueForBlock(B); |
3316 | 14.2k | Value *SecondValue = Item.second->getIncomingValueForBlock(B); |
3317 | 14.2k | if (FirstValue == SecondValue) |
3318 | 419 | continue; |
3319 | 13.8k | |
3320 | 13.8k | PHINode *FirstPhi = dyn_cast<PHINode>(FirstValue); |
3321 | 13.8k | PHINode *SecondPhi = dyn_cast<PHINode>(SecondValue); |
3322 | 13.8k | |
3323 | 13.8k | // If one of them is not a Phi, or
3324 | 13.8k | // the first one is not a Phi node from the set we'd like to match, or
3325 | 13.8k | // the Phi nodes are from different basic blocks, then
3326 | 13.8k | // we will not be able to match.
3327 | 13.8k | if (!FirstPhi || !SecondPhi6.00k || !PhiNodesToMatch.count(FirstPhi)702 || |
3328 | 13.8k | FirstPhi->getParent() != SecondPhi->getParent()232 ) |
3329 | 13.5k | return false; |
3330 | 208 | |
3331 | 208 | // If we already matched them then continue. |
3332 | 208 | if (Matcher.count({ FirstPhi, SecondPhi })) |
3333 | 2 | continue; |
3334 | 206 | // So the values are different and do not match, so we need them to
3335 | 206 | // match. (But we register no more than one match per PHI node, so that |
3336 | 206 | // we won't later try to replace them twice.) |
3337 | 206 | if (!MatchedPHIs.insert(FirstPhi).second) |
3338 | 21 | Matcher.insert({ FirstPhi, SecondPhi }); |
3339 | 206 | // But we must check it.
3340 | 206 | WorkList.push_back({ FirstPhi, SecondPhi }); |
3341 | 206 | } |
3342 | 13.7k | } |
3343 | 13.7k | return true129 ; |
3344 | 13.7k | } |
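| |
| | // Editor's note: a hypothetical instance of the matching above. For a newly
| | // created %sunk_phi = phi [%a, BB1], [%b, BB2] and an existing candidate
| | // %p = phi [%x, BB1], [%y, BB2], the pair matches if, per predecessor, the
| | // incoming values are equal or are themselves a (new phi, existing phi)
| | // pair in the same basic block that matches recursively.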
3345 | | |
3346 | | /// For the given set of PHI nodes (in the SimplificationTracker) try |
3347 | | /// to find their equivalents. |
3348 | | /// Returns false if this matching fails and creation of new Phis is disabled.
3349 | | bool MatchPhiSet(SimplificationTracker &ST, bool AllowNewPhiNodes, |
3350 | 5.40k | unsigned &PhiNotMatchedCount) { |
3351 | 5.40k | // Matched and PhiNodesToMatch iterate their elements in a deterministic |
3352 | 5.40k | // order, so the replacements (ReplacePhi) are also done in a deterministic |
3353 | 5.40k | // order. |
3354 | 5.40k | SmallSetVector<PHIPair, 8> Matched; |
3355 | 5.40k | SmallPtrSet<PHINode *, 8> WillNotMatch; |
3356 | 5.40k | PhiNodeSet &PhiNodesToMatch = ST.newPhiNodes(); |
3357 | 5.56k | while (PhiNodesToMatch.size()) { |
3358 | 4.68k | PHINode *PHI = *PhiNodesToMatch.begin(); |
3359 | 4.68k | |
3360 | 4.68k | // Add ourselves: if there are no Phi nodes in the basic block, we do not match.
3361 | 4.68k | WillNotMatch.clear(); |
3362 | 4.68k | WillNotMatch.insert(PHI); |
3363 | 4.68k | |
3364 | 4.68k | // Traverse all Phis until we found equivalent or fail to do that. |
3365 | 4.68k | bool IsMatched = false; |
3366 | 18.3k | for (auto &P : PHI->getParent()->phis()) { |
3367 | 18.3k | if (&P == PHI) |
3368 | 4.64k | continue; |
3369 | 13.7k | if ((IsMatched = MatchPhiNode(PHI, &P, Matched, PhiNodesToMatch))) |
3370 | 129 | break; |
3371 | 13.5k | // If it does not match, collect all Phi nodes from matcher. |
3372 | 13.5k | // If we end up with no match, then all these Phi nodes will not match
3373 | 13.5k | // later. |
3374 | 13.5k | for (auto M : Matched) |
3375 | 13.6k | WillNotMatch.insert(M.first); |
3376 | 13.5k | Matched.clear(); |
3377 | 13.5k | } |
3378 | 4.68k | if (IsMatched) { |
3379 | 129 | // Replace all matched values and erase them. |
3380 | 129 | for (auto MV : Matched) |
3381 | 134 | ST.ReplacePhi(MV.first, MV.second); |
3382 | 129 | Matched.clear(); |
3383 | 129 | continue; |
3384 | 129 | } |
3385 | 4.55k | // If we are not allowed to create new nodes then bail out. |
3386 | 4.55k | if (!AllowNewPhiNodes) |
3387 | 4.53k | return false; |
3388 | 26 | // Just remove all seen values in matcher. They will not match anything. |
3389 | 26 | PhiNotMatchedCount += WillNotMatch.size(); |
3390 | 26 | for (auto *P : WillNotMatch) |
3391 | 26 | PhiNodesToMatch.erase(P); |
3392 | 26 | } |
3393 | 5.40k | return true875 ; |
3394 | 5.40k | } |
3395 | | /// Fill the placeholders with values from predecessors and simplify them. |
3396 | | void FillPlaceholders(FoldAddrToValueMapping &Map, |
3397 | | SmallVectorImpl<Value *> &TraverseOrder, |
3398 | 5.40k | SimplificationTracker &ST) { |
3399 | 11.0k | while (!TraverseOrder.empty()) { |
3400 | 5.68k | Value *Current = TraverseOrder.pop_back_val(); |
3401 | 5.68k | assert(Map.find(Current) != Map.end() && "No node to fill!!!"); |
3402 | 5.68k | Value *V = Map[Current]; |
3403 | 5.68k | |
3404 | 5.68k | if (SelectInst *Select = dyn_cast<SelectInst>(V)) { |
3405 | 764 | // CurrentValue must also be a Select.
3406 | 764 | auto *CurrentSelect = cast<SelectInst>(Current); |
3407 | 764 | auto *TrueValue = CurrentSelect->getTrueValue(); |
3408 | 764 | assert(Map.find(TrueValue) != Map.end() && "No True Value!"); |
3409 | 764 | Select->setTrueValue(ST.Get(Map[TrueValue])); |
3410 | 764 | auto *FalseValue = CurrentSelect->getFalseValue(); |
3411 | 764 | assert(Map.find(FalseValue) != Map.end() && "No False Value!"); |
3412 | 764 | Select->setFalseValue(ST.Get(Map[FalseValue])); |
3413 | 4.92k | } else { |
3414 | 4.92k | // Must be a Phi node then. |
3415 | 4.92k | PHINode *PHI = cast<PHINode>(V); |
3416 | 4.92k | auto *CurrentPhi = dyn_cast<PHINode>(Current); |
3417 | 4.92k | // Fill the Phi node with values from predecessors. |
3418 | 11.6k | for (auto B : predecessors(PHI->getParent())) { |
3419 | 11.6k | Value *PV = CurrentPhi->getIncomingValueForBlock(B); |
3420 | 11.6k | assert(Map.find(PV) != Map.end() && "No predecessor Value!"); |
3421 | 11.6k | PHI->addIncoming(ST.Get(Map[PV]), B); |
3422 | 11.6k | } |
3423 | 4.92k | } |
3424 | 5.68k | Map[Current] = ST.Simplify(V); |
3425 | 5.68k | } |
3426 | 5.40k | } |
3427 | | |
3428 | | /// Starting from the original value, recursively iterates over the def-use
3429 | | /// chain up to known ending values represented in the map. For each traversed
3430 | | /// phi/select, inserts a placeholder Phi or Select.
3431 | | /// Reports all newly created Phi/Select nodes by adding them to the set.
3432 | | /// Also reports the order in which the values have been traversed.
3433 | | void InsertPlaceholders(FoldAddrToValueMapping &Map, |
3434 | | SmallVectorImpl<Value *> &TraverseOrder, |
3435 | 5.40k | SimplificationTracker &ST) { |
3436 | 5.40k | SmallVector<Value *, 32> Worklist; |
3437 | 5.40k | assert((isa<PHINode>(Original) || isa<SelectInst>(Original)) && |
3438 | 5.40k | "Address must be a Phi or Select node"); |
3439 | 5.40k | auto *Dummy = UndefValue::get(CommonType); |
3440 | 5.40k | Worklist.push_back(Original); |
3441 | 23.9k | while (!Worklist.empty()) { |
3442 | 18.5k | Value *Current = Worklist.pop_back_val(); |
3443 | 18.5k | // If it is already visited or is an ending value, then skip it.
3444 | 18.5k | if (Map.find(Current) != Map.end()) |
3445 | 12.8k | continue; |
3446 | 5.68k | TraverseOrder.push_back(Current); |
3447 | 5.68k | |
3448 | 5.68k | // CurrentValue must be a Phi node or select. All others must be covered |
3449 | 5.68k | // by anchors. |
3450 | 5.68k | if (SelectInst *CurrentSelect = dyn_cast<SelectInst>(Current)) { |
3451 | 764 | // Is it OK to get metadata from OrigSelect?! |
3452 | 764 | // Create a Select placeholder with dummy value. |
3453 | 764 | SelectInst *Select = SelectInst::Create( |
3454 | 764 | CurrentSelect->getCondition(), Dummy, Dummy, |
3455 | 764 | CurrentSelect->getName(), CurrentSelect, CurrentSelect); |
3456 | 764 | Map[Current] = Select; |
3457 | 764 | ST.insertNewSelect(Select); |
3458 | 764 | // We are interested in True and False values. |
3459 | 764 | Worklist.push_back(CurrentSelect->getTrueValue()); |
3460 | 764 | Worklist.push_back(CurrentSelect->getFalseValue()); |
3461 | 4.92k | } else { |
3462 | 4.92k | // It must be a Phi node then. |
3463 | 4.92k | PHINode *CurrentPhi = cast<PHINode>(Current); |
3464 | 4.92k | unsigned PredCount = CurrentPhi->getNumIncomingValues(); |
3465 | 4.92k | PHINode *PHI = |
3466 | 4.92k | PHINode::Create(CommonType, PredCount, "sunk_phi", CurrentPhi); |
3467 | 4.92k | Map[Current] = PHI; |
3468 | 4.92k | ST.insertNewPhi(PHI); |
3469 | 4.92k | for (Value *P : CurrentPhi->incoming_values()) |
3470 | 11.6k | Worklist.push_back(P); |
3471 | 4.92k | } |
3472 | 5.68k | } |
3473 | 5.40k | } |
3474 | | |
3475 | 5.41k | bool addrModeCombiningAllowed() { |
3476 | 5.41k | if (DisableComplexAddrModes) |
3477 | 0 | return false; |
3478 | 5.41k | switch (DifferentField) { |
3479 | 5.41k | default: |
3480 | 0 | return false; |
3481 | 5.41k | case ExtAddrMode::BaseRegField: |
3482 | 3.41k | return AddrSinkCombineBaseReg; |
3483 | 5.41k | case ExtAddrMode::BaseGVField: |
3484 | 0 | return AddrSinkCombineBaseGV; |
3485 | 5.41k | case ExtAddrMode::BaseOffsField: |
3486 | 1.78k | return AddrSinkCombineBaseOffs; |
3487 | 5.41k | case ExtAddrMode::ScaledRegField: |
3488 | 215 | return AddrSinkCombineScaledReg; |
3489 | 5.41k | } |
3490 | 5.41k | } |
3491 | | }; |
3492 | | } // end anonymous namespace |
3493 | | |
3494 | | /// Try adding ScaleReg*Scale to the current addressing mode. |
3495 | | /// Return true and update AddrMode if this addr mode is legal for the target, |
3496 | | /// false if not. |
3497 | | bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale, |
3498 | 1.36M | unsigned Depth) { |
3499 | 1.36M | // If Scale is 1, then this is the same as adding ScaleReg to the addressing |
3500 | 1.36M | // mode. Just process that directly. |
3501 | 1.36M | if (Scale == 1) |
3502 | 457k | return matchAddr(ScaleReg, Depth); |
3503 | 902k | |
3504 | 902k | // If the scale is 0, it takes nothing to add this. |
3505 | 902k | if (Scale == 0) |
3506 | 4 | return true; |
3507 | 902k | |
3508 | 902k | // If we already have a scale of this value, we can add to it, otherwise, we |
3509 | 902k | // need an available scale field. |
3510 | 902k | if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg25.2k ) |
3511 | 21.6k | return false; |
3512 | 881k | |
3513 | 881k | ExtAddrMode TestAddrMode = AddrMode; |
3514 | 881k | |
3515 | 881k | // Add scale to turn X*4+X*3 -> X*7. This could also do things like |
3516 | 881k | // [A+B + A*7] -> [B+A*8]. |
3517 | 881k | TestAddrMode.Scale += Scale; |
3518 | 881k | TestAddrMode.ScaledReg = ScaleReg; |
3519 | 881k | |
3520 | 881k | // If the new address isn't legal, bail out. |
3521 | 881k | if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) |
3522 | 496k | return false; |
3523 | 384k | |
3524 | 384k | // It was legal, so commit it. |
3525 | 384k | AddrMode = TestAddrMode; |
3526 | 384k | |
3527 | 384k | // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now |
3528 | 384k | // to see if ScaleReg is actually X+C. If so, we can turn this into adding |
3529 | 384k | // X*Scale + C*Scale to addr mode. |
3530 | 384k | ConstantInt *CI = nullptr; Value *AddLHS = nullptr; |
3531 | 384k | if (isa<Instruction>(ScaleReg) && // not a constant expr. |
3532 | 384k | match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))382k ) { |
3533 | 19.4k | TestAddrMode.InBounds = false; |
3534 | 19.4k | TestAddrMode.ScaledReg = AddLHS; |
3535 | 19.4k | TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; |
3536 | 19.4k | |
3537 | 19.4k | // If this addressing mode is legal, commit it and remember that we folded |
3538 | 19.4k | // this instruction. |
3539 | 19.4k | if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) { |
3540 | 5.97k | AddrModeInsts.push_back(cast<Instruction>(ScaleReg)); |
3541 | 5.97k | AddrMode = TestAddrMode; |
3542 | 5.97k | return true; |
3543 | 5.97k | } |
3544 | 378k | } |
3545 | 378k | |
3546 | 378k | // Otherwise, not (x+c)*scale, just return what we have. |
3547 | 378k | return true; |
3548 | 378k | } |
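| |
| | // Editor's note: a concrete (hypothetical) instance of the fold above. With
| | // the scale field free, Scale = 4 and ScaleReg = (add i64 %x, 3), the
| | // matcher first commits [... + 4*(add %x, 3)]; then, if the target also
| | // accepts the rewritten mode, it folds the add so that ScaledReg = %x and
| | // BaseOffs += 3 * 4 = 12, giving [12 + ... + 4*%x].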
3549 | | |
3550 | | /// This is a little filter, which returns true if an addressing computation |
3551 | | /// involving I might be folded into a load/store accessing it. |
3552 | | /// This doesn't need to be perfect, but needs to accept at least |
3553 | | /// the set of instructions that MatchOperationAddr can. |
3554 | 1.38M | static bool MightBeFoldableInst(Instruction *I) { |
3555 | 1.38M | switch (I->getOpcode()) { |
3556 | 1.38M | case Instruction::BitCast: |
3557 | 602k | case Instruction::AddrSpaceCast: |
3558 | 602k | // Don't touch identity bitcasts. |
3559 | 602k | if (I->getType() == I->getOperand(0)->getType()) |
3560 | 23 | return false; |
3561 | 601k | return I->getType()->isIntOrPtrTy(); |
3562 | 601k | case Instruction::PtrToInt: |
3563 | 2.29k | // PtrToInt is always a noop, as we know that the int type is pointer sized. |
3564 | 2.29k | return true; |
3565 | 601k | case Instruction::IntToPtr: |
3566 | 5.11k | // We know the input is intptr_t, so this is foldable. |
3567 | 5.11k | return true; |
3568 | 601k | case Instruction::Add: |
3569 | 5.71k | return true; |
3570 | 601k | case Instruction::Mul: |
3571 | 791 | case Instruction::Shl: |
3572 | 791 | // Can only handle X*C and X << C. |
3573 | 791 | return isa<ConstantInt>(I->getOperand(1)); |
3574 | 734k | case Instruction::GetElementPtr: |
3575 | 734k | return true; |
3576 | 38.1k | default: |
3577 | 38.1k | return false; |
3578 | 1.38M | } |
3579 | 1.38M | } |
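
A few hypothetical instructions and how the filter above classifies them
(illustrative IR, not taken from any particular test):

    // %p = bitcast i8* %q to i32*  -> foldable (non-identity int/ptr bitcast)
    // %i = mul i64 %n, 8           -> foldable (X * C, constant operand 1)
    // %j = mul i64 %n, %m          -> not foldable (non-constant scale)
    // %f = fadd double %a, %b      -> not foldable (default case)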
3580 | | |
3581 | | /// Check whether or not \p Val is a legal instruction for \p TLI. |
3582 | | /// \note \p Val is assumed to be the product of some type promotion. |
3583 | | /// Therefore if \p Val has an undefined state in \p TLI, this is assumed |
3584 | | /// to be legal, as the non-promoted value would have had the same state. |
3585 | | static bool isPromotedInstructionLegal(const TargetLowering &TLI, |
3586 | 101k | const DataLayout &DL, Value *Val) { |
3587 | 101k | Instruction *PromotedInst = dyn_cast<Instruction>(Val); |
3588 | 101k | if (!PromotedInst) |
3589 | 1 | return false; |
3590 | 101k | int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode()); |
3591 | 101k | // If the ISDOpcode is undefined, it was undefined before the promotion. |
3592 | 101k | if (!ISDOpcode) |
3593 | 0 | return true; |
3594 | 101k | // Otherwise, check if the promoted instruction is legal or not. |
3595 | 101k | return TLI.isOperationLegalOrCustom( |
3596 | 101k | ISDOpcode, TLI.getValueType(DL, PromotedInst->getType())); |
3597 | 101k | } |
3598 | | |
3599 | | namespace { |
3600 | | |
3601 | | /// Helper class to perform type promotion.
3602 | | class TypePromotionHelper { |
3603 | | /// Utility function to add a promoted instruction \p ExtOpnd to |
3604 | | /// \p PromotedInsts and record the type of extension we have seen. |
3605 | | static void addPromotedInst(InstrToOrigTy &PromotedInsts, |
3606 | | Instruction *ExtOpnd, |
3607 | 116k | bool IsSExt) { |
3608 | 116k | ExtType ExtTy = IsSExt ? SignExtension78.7k : ZeroExtension37.4k ; |
3609 | 116k | InstrToOrigTy::iterator It = PromotedInsts.find(ExtOpnd); |
3610 | 116k | if (It != PromotedInsts.end()) { |
3611 | 68.1k | // If the new extension is same as original, the information in |
3612 | 68.1k | // PromotedInsts[ExtOpnd] is still correct. |
3613 | 68.1k | if (It->second.getInt() == ExtTy) |
3614 | 68.1k | return; |
3615 | 53 | |
3616 | 53 | // The new extension differs from the old one, so invalidate the
3617 | 53 | // recorded type information by setting the extension type to
3618 | 53 | // BothExtension.
3619 | 53 | ExtTy = BothExtension; |
3620 | 53 | } |
3621 | 116k | PromotedInsts[ExtOpnd] = TypeIsSExt(ExtOpnd->getType(), ExtTy); |
3622 | 48.0k | } |
3623 | | |
3624 | | /// Utility function to query the original type of instruction \p Opnd |
3625 | | /// with a matched extension type. If the extension doesn't match, we |
3626 | | /// cannot use the information we had on the original type. |
3627 | | /// BothExtension doesn't match any extension type. |
3628 | | static const Type *getOrigType(const InstrToOrigTy &PromotedInsts, |
3629 | | Instruction *Opnd, |
3630 | 53.0k | bool IsSExt) { |
3631 | 53.0k | ExtType ExtTy = IsSExt ? SignExtension42.2k : ZeroExtension10.7k ; |
3632 | 53.0k | InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd); |
3633 | 53.0k | if (It != PromotedInsts.end() && It->second.getInt() == ExtTy5.19k ) |
3634 | 4.31k | return It->second.getPointer(); |
3635 | 48.6k | return nullptr; |
3636 | 48.6k | } |
3637 | | |
3638 | | /// Utility function to check whether or not a sign or zero extension |
3639 | | /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by |
3640 | | /// either using the operands of \p Inst or promoting \p Inst. |
3641 | | /// The type of the extension is defined by \p IsSExt. |
3642 | | /// In other words, check if: |
3643 | | /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType. |
3644 | | /// #1 Promotion applies: |
3645 | | /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...). |
3646 | | /// #2 Operand reuses: |
3647 | | /// ext opnd1 to ConsideredExtType. |
3648 | | /// \p PromotedInsts maps the instructions to their type before promotion. |
3649 | | static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType, |
3650 | | const InstrToOrigTy &PromotedInsts, bool IsSExt); |
3651 | | |
3652 | | /// Utility function to determine if \p OpIdx should be promoted when |
3653 | | /// promoting \p Inst. |
3654 | 232k | static bool shouldExtOperand(const Instruction *Inst, int OpIdx) { |
3655 | 232k | return !(isa<SelectInst>(Inst) && OpIdx == 00 ); |
3656 | 232k | } |
3657 | | |
3658 | | /// Utility function to promote the operand of \p Ext when this |
3659 | | /// operand is a promotable trunc or sext or zext. |
3660 | | /// \p PromotedInsts maps the instructions to their type before promotion. |
3661 | | /// \p CreatedInstsCost[out] contains the cost of all instructions |
3662 | | /// created to promote the operand of Ext. |
3663 | | /// Newly added extensions are inserted in \p Exts. |
3664 | | /// Newly added truncates are inserted in \p Truncs. |
3665 | | /// Should never be called directly. |
3666 | | /// \return The promoted value which is used instead of Ext. |
3667 | | static Value *promoteOperandForTruncAndAnyExt( |
3668 | | Instruction *Ext, TypePromotionTransaction &TPT, |
3669 | | InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, |
3670 | | SmallVectorImpl<Instruction *> *Exts, |
3671 | | SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI); |
3672 | | |
3673 | | /// Utility function to promote the operand of \p Ext when this |
3674 | | /// operand is promotable and is not a supported trunc or sext. |
3675 | | /// \p PromotedInsts maps the instructions to their type before promotion. |
3676 | | /// \p CreatedInstsCost[out] contains the cost of all the instructions |
3677 | | /// created to promote the operand of Ext. |
3678 | | /// Newly added extensions are inserted in \p Exts. |
3679 | | /// Newly added truncates are inserted in \p Truncs. |
3680 | | /// Should never be called directly. |
3681 | | /// \return The promoted value which is used instead of Ext. |
3682 | | static Value *promoteOperandForOther(Instruction *Ext, |
3683 | | TypePromotionTransaction &TPT, |
3684 | | InstrToOrigTy &PromotedInsts, |
3685 | | unsigned &CreatedInstsCost, |
3686 | | SmallVectorImpl<Instruction *> *Exts, |
3687 | | SmallVectorImpl<Instruction *> *Truncs, |
3688 | | const TargetLowering &TLI, bool IsSExt); |
3689 | | |
3690 | | /// \see promoteOperandForOther. |
3691 | | static Value *signExtendOperandForOther( |
3692 | | Instruction *Ext, TypePromotionTransaction &TPT, |
3693 | | InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, |
3694 | | SmallVectorImpl<Instruction *> *Exts, |
3695 | 78.7k | SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) { |
3696 | 78.7k | return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost, |
3697 | 78.7k | Exts, Truncs, TLI, true); |
3698 | 78.7k | } |
3699 | | |
3700 | | /// \see promoteOperandForOther. |
3701 | | static Value *zeroExtendOperandForOther( |
3702 | | Instruction *Ext, TypePromotionTransaction &TPT, |
3703 | | InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, |
3704 | | SmallVectorImpl<Instruction *> *Exts, |
3705 | 37.4k | SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) { |
3706 | 37.4k | return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost, |
3707 | 37.4k | Exts, Truncs, TLI, false); |
3708 | 37.4k | } |
3709 | | |
3710 | | public: |
3711 | | /// Type for the utility function that promotes the operand of Ext. |
3712 | | using Action = Value *(*)(Instruction *Ext, TypePromotionTransaction &TPT, |
3713 | | InstrToOrigTy &PromotedInsts, |
3714 | | unsigned &CreatedInstsCost, |
3715 | | SmallVectorImpl<Instruction *> *Exts, |
3716 | | SmallVectorImpl<Instruction *> *Truncs, |
3717 | | const TargetLowering &TLI); |
3718 | | |
3719 | | /// Given a sign/zero extend instruction \p Ext, return the appropriate |
3720 | | /// action to promote the operand of \p Ext instead of using Ext. |
3721 | | /// \return NULL if no promotable action is possible with the current |
3722 | | /// sign extension. |
3723 | | /// \p InsertedInsts keeps track of all the instructions inserted by the |
3724 | | /// other CodeGenPrepare optimizations. This information is important |
3725 | | /// because we do not want to promote these instructions as CodeGenPrepare |
3726 | | /// will reinsert them later, thus creating an infinite loop: create/remove.
3727 | | /// \p PromotedInsts maps the instructions to their type before promotion. |
3728 | | static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts, |
3729 | | const TargetLowering &TLI, |
3730 | | const InstrToOrigTy &PromotedInsts); |
3731 | | }; |
3732 | | |
3733 | | } // end anonymous namespace |
3734 | | |
3735 | | bool TypePromotionHelper::canGetThrough(const Instruction *Inst, |
3736 | | Type *ConsideredExtType, |
3737 | | const InstrToOrigTy &PromotedInsts, |
3738 | 456k | bool IsSExt) { |
3739 | 456k | // The promotion helper does not know how to deal with vector types yet. |
3740 | 456k | // To be able to fix that, we would need to fix the places where we |
3741 | 456k | // statically extend, e.g., constants and such. |
3742 | 456k | if (Inst->getType()->isVectorTy()) |
3743 | 10.7k | return false; |
3744 | 445k | |
3745 | 445k | // We can always get through zext. |
3746 | 445k | if (isa<ZExtInst>(Inst)) |
3747 | 4.36k | return true; |
3748 | 441k | |
3749 | 441k | // sext(sext) is ok too. |
3750 | 441k | if (IsSExt && isa<SExtInst>(Inst)252k ) |
3751 | 614 | return true; |
3752 | 440k | |
3753 | 440k | // We can get through binary operator, if it is legal. In other words, the |
3754 | 440k | // binary operator must have a nuw or nsw flag. |
3755 | 440k | const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); |
3756 | 440k | if (BinOp && isa<OverflowingBinaryOperator>(BinOp)200k && |
3757 | 440k | (144k (144k !IsSExt144k && BinOp->hasNoUnsignedWrap()47.0k ) || |
3758 | 144k | (136k IsSExt136k && BinOp->hasNoSignedWrap()97.0k ))) |
3759 | 84.5k | return true; |
3760 | 356k | |
3761 | 356k | // ext(and(opnd, cst)) --> and(ext(opnd), ext(cst)) |
3762 | 356k | if ((Inst->getOpcode() == Instruction::And || |
3763 | 356k | Inst->getOpcode() == Instruction::Or333k )) |
3764 | 25.2k | return true; |
3765 | 331k | |
3766 | 331k | // ext(xor(opnd, cst)) --> xor(ext(opnd), ext(cst)) |
3767 | 331k | if (Inst->getOpcode() == Instruction::Xor) { |
3768 | 5.02k | const ConstantInt *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1)); |
3769 | 5.02k | // Make sure it is not a NOT. |
3770 | 5.02k | if (Cst && !Cst->getValue().isAllOnesValue()3.06k ) |
3771 | 655 | return true; |
3772 | 330k | } |
3773 | 330k | |
3774 | 330k | // zext(lshr(opnd, cst)) --> lshr(zext(opnd), zext(cst))
3775 | 330k | // It may change a poisoned value into a regular value, like
3776 | 330k | // zext i32 (lshr i8 %val, 12)  -->  lshr i32 (zext i8 %val), 12
3777 | 330k | //          poisoned value                    regular value
3778 | 330k | // It should be OK since undef covers a valid value.
3779 | 330k | if (Inst->getOpcode() == Instruction::LShr && !IsSExt5.78k ) |
3780 | 5.61k | return true; |
3781 | 324k | |
3782 | 324k | // and(ext(shl(opnd, cst)), cst) --> and(shl(ext(opnd), ext(cst)), cst) |
3783 | 324k | // It may change a poisoned value into a regular value, like |
3784 | 324k | // zext i32 (shl i8 %val, 12)  -->  shl i32 (zext i8 %val), 12
3785 | 324k | //          poisoned value                   regular value
3786 | 324k | // It should be OK since undef covers a valid value.
3787 | 324k | if (Inst->getOpcode() == Instruction::Shl && Inst->hasOneUse()4.47k ) { |
3788 | 2.84k | const Instruction *ExtInst = |
3789 | 2.84k | dyn_cast<const Instruction>(*Inst->user_begin()); |
3790 | 2.84k | if (ExtInst->hasOneUse()) { |
3791 | 2.43k | const Instruction *AndInst = |
3792 | 2.43k | dyn_cast<const Instruction>(*ExtInst->user_begin()); |
3793 | 2.43k | if (AndInst && AndInst->getOpcode() == Instruction::And) { |
3794 | 289 | const ConstantInt *Cst = dyn_cast<ConstantInt>(AndInst->getOperand(1)); |
3795 | 289 | if (Cst && |
3796 | 289 | Cst->getValue().isIntN(Inst->getType()->getIntegerBitWidth())204 ) |
3797 | 200 | return true; |
3798 | 324k | } |
3799 | 2.43k | } |
3800 | 2.84k | } |
3801 | 324k | |
3802 | 324k | // Check if we can do the following simplification. |
3803 | 324k | // ext(trunc(opnd)) --> ext(opnd) |
3804 | 324k | if (!isa<TruncInst>(Inst)) |
3805 | 270k | return false; |
3806 | 53.6k | |
3807 | 53.6k | Value *OpndVal = Inst->getOperand(0); |
3808 | 53.6k | // Check if we can use this operand in the extension. |
3809 | 53.6k | // If the type is larger than the result type of the extension, we cannot. |
3810 | 53.6k | if (!OpndVal->getType()->isIntegerTy() || |
3811 | 53.6k | OpndVal->getType()->getIntegerBitWidth() > |
3812 | 53.6k | ConsideredExtType->getIntegerBitWidth()) |
3813 | 530 | return false; |
3814 | 53.1k | |
3815 | 53.1k | // If the operand of the truncate is not an instruction, we will not have |
3816 | 53.1k | // any information on the dropped bits. |
3817 | 53.1k | // (Actually we could for constants, but it is not worth the extra logic.)
3818 | 53.1k | Instruction *Opnd = dyn_cast<Instruction>(OpndVal); |
3819 | 53.1k | if (!Opnd) |
3820 | 151 | return false; |
3821 | 53.0k | |
3822 | 53.0k | // Check if the source type is narrow enough.
3823 | 53.0k | // I.e., check that the trunc just drops extended bits of the same kind
3824 | 53.0k | // as the extension.
3825 | 53.0k | // #1 get the type of the operand and check the kind of the extended bits. |
3826 | 53.0k | const Type *OpndType = getOrigType(PromotedInsts, Opnd, IsSExt); |
3827 | 53.0k | if (OpndType) |
3828 | 4.31k | ; |
3829 | 48.6k | else if ((IsSExt && isa<SExtInst>(Opnd)39.1k ) || (16.2k !IsSExt16.2k && isa<ZExtInst>(Opnd)9.58k )) |
3830 | 34.5k | OpndType = Opnd->getOperand(0)->getType(); |
3831 | 14.1k | else |
3832 | 14.1k | return false; |
3833 | 38.8k | |
3834 | 38.8k | // #2 check that the truncate just drops extended bits. |
3835 | 38.8k | return Inst->getType()->getIntegerBitWidth() >= |
3836 | 38.8k | OpndType->getIntegerBitWidth(); |
3837 | 38.8k | } |
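
The final bit-width comparison is the entire look-through test; a hedged
standalone sketch of check #2, with a hypothetical chain:

    // Hypothetical chain: %o = sext i8 %v to i32 ; %t = trunc i32 %o to i16.
    // The original type is i8 (8 bits) and the trunc result is i16 (16 bits),
    // so the trunc only drops bits the sext created: 16 >= 8, look-through OK.
    static bool truncOnlyDropsExtendedBits(unsigned TruncResultBits,
                                           unsigned OrigTypeBits) {
      return TruncResultBits >= OrigTypeBits;
    }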
3838 | | |
3839 | | TypePromotionHelper::Action TypePromotionHelper::getAction( |
3840 | | Instruction *Ext, const SetOfInstrs &InsertedInsts, |
3841 | 577k | const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) { |
3842 | 577k | assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && |
3843 | 577k | "Unexpected instruction type"); |
3844 | 577k | Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0)); |
3845 | 577k | Type *ExtTy = Ext->getType(); |
3846 | 577k | bool IsSExt = isa<SExtInst>(Ext); |
3847 | 577k | // If the operand of the extension is not an instruction, we cannot |
3848 | 577k | // get through. |
3849 | 577k | // If it is, check whether we can get through.
3850 | 577k | if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt)456k ) |
3851 | 417k | return nullptr; |
3852 | 160k | |
3853 | 160k | // Do not promote if the operand has been added by codegenprepare. |
3854 | 160k | // Otherwise, it means we are undoing an optimization that is likely to be |
3855 | 160k | // redone, thus causing potential infinite loop. |
3856 | 160k | if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd)38.8k ) |
3857 | 34.5k | return nullptr; |
3858 | 125k | |
3859 | 125k | // SExt or Trunc instructions. |
3860 | 125k | // Return the related handler. |
3861 | 125k | if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd)124k || |
3862 | 125k | isa<ZExtInst>(ExtOpnd)120k ) |
3863 | 9.29k | return promoteOperandForTruncAndAnyExt; |
3864 | 116k | |
3865 | 116k | // Regular instruction. |
3866 | 116k | // Abort early if we will have to insert non-free instructions. |
3867 | 116k | if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType())23.0k ) |
3868 | 0 | return nullptr; |
3869 | 116k | return IsSExt ? signExtendOperandForOther78.7k : zeroExtendOperandForOther37.4k ; |
3870 | 116k | } |
3871 | | |
3872 | | Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt( |
3873 | | Instruction *SExt, TypePromotionTransaction &TPT, |
3874 | | InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, |
3875 | | SmallVectorImpl<Instruction *> *Exts, |
3876 | 9.29k | SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) { |
3877 | 9.29k | // By construction, the operand of SExt is an instruction. Otherwise we cannot |
3878 | 9.29k | // get through it and this method should not be called. |
3879 | 9.29k | Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0)); |
3880 | 9.29k | Value *ExtVal = SExt; |
3881 | 9.29k | bool HasMergedNonFreeExt = false; |
3882 | 9.29k | if (isa<ZExtInst>(SExtOpnd)) { |
3883 | 4.36k | // Replace s|zext(zext(opnd)) |
3884 | 4.36k | // => zext(opnd). |
3885 | 4.36k | HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd); |
3886 | 4.36k | Value *ZExt = |
3887 | 4.36k | TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType()); |
3888 | 4.36k | TPT.replaceAllUsesWith(SExt, ZExt); |
3889 | 4.36k | TPT.eraseInstruction(SExt); |
3890 | 4.36k | ExtVal = ZExt; |
3891 | 4.93k | } else { |
3892 | 4.93k | // Replace z|sext(trunc(opnd)) or sext(sext(opnd)) |
3893 | 4.93k | // => z|sext(opnd). |
3894 | 4.93k | TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0)); |
3895 | 4.93k | } |
3896 | 9.29k | CreatedInstsCost = 0; |
3897 | 9.29k | |
3898 | 9.29k | // Remove dead code. |
3899 | 9.29k | if (SExtOpnd->use_empty()) |
3900 | 5.26k | TPT.eraseInstruction(SExtOpnd); |
3901 | 9.29k | |
3902 | 9.29k | // Check if the extension is still needed. |
3903 | 9.29k | Instruction *ExtInst = dyn_cast<Instruction>(ExtVal); |
3904 | 9.29k | if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()9.29k ) { |
3905 | 5.00k | if (ExtInst) { |
3906 | 4.99k | if (Exts) |
3907 | 4.87k | Exts->push_back(ExtInst); |
3908 | 4.99k | CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt4.68k ; |
3909 | 4.99k | } |
3910 | 5.00k | return ExtVal; |
3911 | 5.00k | } |
3912 | 4.29k | |
3913 | 4.29k | // At this point we have: ext ty opnd to ty. |
3914 | 4.29k | // Reassign the uses of ExtInst to the opnd and remove ExtInst. |
3915 | 4.29k | Value *NextVal = ExtInst->getOperand(0); |
3916 | 4.29k | TPT.eraseInstruction(ExtInst, NextVal); |
3917 | 4.29k | return NextVal; |
3918 | 4.29k | } |
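
A hypothetical before/after for the zext(zext) arm above (illustrative IR):

    // before:  %z = zext i8 %v to i16
    //          %s = sext i16 %z to i32     ; s|zext of a ZExtInst
    // after:   %z2 = zext i8 %v to i32     ; one merged zero extension
    // If %z has no remaining users it is erased, and if the surviving
    // extension turns out to extend a value to its own type, it is dropped
    // and its operand is used directly.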
3919 | | |
3920 | | Value *TypePromotionHelper::promoteOperandForOther( |
3921 | | Instruction *Ext, TypePromotionTransaction &TPT, |
3922 | | InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost, |
3923 | | SmallVectorImpl<Instruction *> *Exts, |
3924 | | SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI, |
3925 | 116k | bool IsSExt) { |
3926 | 116k | // By construction, the operand of Ext is an instruction. Otherwise we cannot |
3927 | 116k | // get through it and this method should not be called. |
3928 | 116k | Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0)); |
3929 | 116k | CreatedInstsCost = 0; |
3930 | 116k | if (!ExtOpnd->hasOneUse()) { |
3931 | 23.0k | // ExtOpnd will be promoted. |
3932 | 23.0k | // All its uses, but Ext, will need to use a truncated value of the |
3933 | 23.0k | // promoted version. |
3934 | 23.0k | // Create the truncate now. |
3935 | 23.0k | Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType()); |
3936 | 23.0k | if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) { |
3937 | 23.0k | // Insert it just after the definition. |
3938 | 23.0k | ITrunc->moveAfter(ExtOpnd); |
3939 | 23.0k | if (Truncs) |
3940 | 0 | Truncs->push_back(ITrunc); |
3941 | 23.0k | } |
3942 | 23.0k | |
3943 | 23.0k | TPT.replaceAllUsesWith(ExtOpnd, Trunc); |
3944 | 23.0k | // Restore the operand of Ext (which has been replaced by the previous call |
3945 | 23.0k | // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext. |
3946 | 23.0k | TPT.setOperand(Ext, 0, ExtOpnd); |
3947 | 23.0k | } |
3948 | 116k | |
3949 | 116k | // Get through the Instruction: |
3950 | 116k | // 1. Update its type. |
3951 | 116k | // 2. Replace the uses of Ext by Inst. |
3952 | 116k | // 3. Extend each operand that needs to be extended. |
3953 | 116k | |
3954 | 116k | // Remember the original type of the instruction before promotion. |
3955 | 116k | // This is useful to know that the high bits are sign-extended bits.
3956 | 116k | addPromotedInst(PromotedInsts, ExtOpnd, IsSExt); |
3957 | 116k | // Step #1. |
3958 | 116k | TPT.mutateType(ExtOpnd, Ext->getType()); |
3959 | 116k | // Step #2. |
3960 | 116k | TPT.replaceAllUsesWith(Ext, ExtOpnd); |
3961 | 116k | // Step #3. |
3962 | 116k | Instruction *ExtForOpnd = Ext; |
3963 | 116k | |
3964 | 116k | LLVM_DEBUG(dbgs() << "Propagate Ext to operands\n"); |
3965 | 348k | for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx; |
3966 | 232k | ++OpIdx) { |
3967 | 232k | LLVM_DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n'); |
3968 | 232k | if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() || |
3969 | 232k | !shouldExtOperand(ExtOpnd, OpIdx)) { |
3970 | 0 | LLVM_DEBUG(dbgs() << "No need to propagate\n"); |
3971 | 0 | continue; |
3972 | 0 | } |
3973 | 232k | // Check if we can statically extend the operand. |
3974 | 232k | Value *Opnd = ExtOpnd->getOperand(OpIdx); |
3975 | 232k | if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) { |
3976 | 45.6k | LLVM_DEBUG(dbgs() << "Statically extend\n"); |
3977 | 45.6k | unsigned BitWidth = Ext->getType()->getIntegerBitWidth(); |
3978 | 45.6k | APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)16.1k |
3979 | 45.6k | : Cst->getValue().zext(BitWidth)29.5k ; |
3980 | 45.6k | TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal)); |
3981 | 45.6k | continue; |
3982 | 45.6k | } |
3983 | 186k | // UndefValues are typed, so we have to statically extend them.
3984 | 186k | if (isa<UndefValue>(Opnd)) { |
3985 | 0 | LLVM_DEBUG(dbgs() << "Statically extend\n"); |
3986 | 0 | TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType())); |
3987 | 0 | continue; |
3988 | 0 | } |
3989 | 186k | |
3990 | 186k | // Otherwise we have to explicitly sign extend the operand. |
3991 | 186k | // Check if Ext was reused to extend an operand. |
3992 | 186k | if (!ExtForOpnd) { |
3993 | 70.4k | // If yes, create a new one. |
3994 | 70.4k | LLVM_DEBUG(dbgs() << "More operands to ext\n"); |
3995 | 70.4k | Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())62.6k |
3996 | 70.4k | : TPT.createZExt(Ext, Opnd, Ext->getType())7.84k ; |
3997 | 70.4k | if (!isa<Instruction>(ValForExtOpnd)) { |
3998 | 6 | TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd); |
3999 | 6 | continue; |
4000 | 6 | } |
4001 | 70.4k | ExtForOpnd = cast<Instruction>(ValForExtOpnd); |
4002 | 70.4k | } |
4003 | 186k | if (186k Exts186k ) |
4004 | 165k | Exts->push_back(ExtForOpnd); |
4005 | 186k | TPT.setOperand(ExtForOpnd, 0, Opnd); |
4006 | 186k | |
4007 | 186k | // Move the sign extension before the insertion point. |
4008 | 186k | TPT.moveBefore(ExtForOpnd, ExtOpnd); |
4009 | 186k | TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd); |
4010 | 186k | CreatedInstsCost += !TLI.isExtFree(ExtForOpnd); |
4011 | 186k | // If more sext are required, new instructions will have to be created. |
4012 | 186k | ExtForOpnd = nullptr; |
4013 | 186k | } |
4014 | 116k | if (ExtForOpnd == Ext) { |
4015 | 0 | LLVM_DEBUG(dbgs() << "Extension is useless now\n"); |
4016 | 0 | TPT.eraseInstruction(Ext); |
4017 | 0 | } |
4018 | 116k | return ExtOpnd; |
4019 | 116k | } |
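
A hypothetical before/after for the three steps above, promoting a sext
through an nsw add (illustrative IR):

    // before:  %op  = add nsw i32 %a, 1
    //          %ext = sext i32 %op to i64
    // after:   %pa  = sext i32 %a to i64     ; step #3: extend the operand
    //          %op  = add nsw i64 %pa, 1     ; step #1: type mutated to i64
    //          ... uses of %ext now use %op  ; step #2: replaceAllUsesWith
    // The constant 1 is extended statically; only %a costs a real sext.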
4020 | | |
4021 | | /// Check whether or not promoting an instruction to a wider type is profitable. |
4022 | | /// \p NewCost gives the cost of extension instructions created by the |
4023 | | /// promotion. |
4024 | | /// \p OldCost gives the cost of extension instructions before the promotion |
4025 | | /// plus the number of instructions that have been |
4026 | | /// matched in the addressing mode thanks to the promotion.
4027 | | /// \p PromotedOperand is the value that has been promoted. |
4028 | | /// \return True if the promotion is profitable, false otherwise. |
4029 | | bool AddressingModeMatcher::isPromotionProfitable( |
4030 | 6.25k | unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const { |
4031 | 6.25k | LLVM_DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost |
4032 | 6.25k | << '\n'); |
4033 | 6.25k | // The cost of the new extensions is greater than the cost of the |
4034 | 6.25k | // old extension plus what we folded. |
4035 | 6.25k | // This is not profitable. |
4036 | 6.25k | if (NewCost > OldCost) |
4037 | 5.18k | return false; |
4038 | 1.06k | if (NewCost < OldCost) |
4039 | 334 | return true; |
4040 | 735 | // The promotion is neutral but it may help fold the sign extension into
4041 | 735 | // loads, for instance.
4042 | 735 | // Check that we did not create an illegal instruction. |
4043 | 735 | return isPromotedInstructionLegal(TLI, DL, PromotedOperand); |
4044 | 735 | } |
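
A worked instance of the comparison above, with hypothetical costs: suppose
promoting one non-free sext (ExtCost = 1) let the matcher fold one extra
instruction into the addressing mode, while creating one non-free extension.

    // OldCost = ExtCost + newly folded instructions = 1 + 1 = 2
    // NewCost = extensions created by the promotion = 1
    // NewCost < OldCost, so the promotion is deemed profitable.
    // On a tie (NewCost == OldCost) the decision falls through to the
    // legality check on the promoted value, isPromotedInstructionLegal.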
4045 | | |
4046 | | /// Given an instruction or constant expr, see if we can fold the operation |
4047 | | /// into the addressing mode. If so, update the addressing mode and return |
4048 | | /// true, otherwise return false without modifying AddrMode. |
4049 | | /// If \p MovedAway is not NULL, it contains the information of whether or |
4050 | | /// not AddrInst has to be folded into the addressing mode on success. |
4051 | | /// If \p MovedAway == true, \p AddrInst will not be part of the addressing |
4052 | | /// mode because it has been moved away.
4053 | | /// Thus AddrInst must not be added in the matched instructions. |
4054 | | /// This state can happen when AddrInst is a sext, since it may be moved away. |
4055 | | /// Therefore, AddrInst may not be valid when MovedAway is true and it must |
4056 | | /// not be referenced anymore. |
4057 | | bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, |
4058 | | unsigned Depth, |
4059 | 17.3M | bool *MovedAway) { |
4060 | 17.3M | // Avoid exponential behavior on extremely deep expression trees. |
4061 | 17.3M | if (Depth >= 5) return false2.99k ; |
4062 | 17.3M | |
4063 | 17.3M | // By default, all matched instructions stay in place. |
4064 | 17.3M | if (MovedAway) |
4065 | 16.9M | *MovedAway = false; |
4066 | 17.3M | |
4067 | 17.3M | switch (Opcode) { |
4068 | 17.3M | case Instruction::PtrToInt: |
4069 | 1.40k | // PtrToInt is always a noop, as we know that the int type is pointer sized. |
4070 | 1.40k | return matchAddr(AddrInst->getOperand(0), Depth); |
4071 | 17.3M | case Instruction::IntToPtr: { |
4072 | 80.3k | auto AS = AddrInst->getType()->getPointerAddressSpace(); |
4073 | 80.3k | auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS)); |
4074 | 80.3k | // This inttoptr is a no-op if the integer type is pointer sized. |
4075 | 80.3k | if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy) |
4076 | 80.2k | return matchAddr(AddrInst->getOperand(0), Depth); |
4077 | 71 | return false; |
4078 | 71 | } |
4079 | 4.39M | case Instruction::BitCast: |
4080 | 4.39M | // BitCast is always a noop, and we can handle it as long as it is |
4081 | 4.39M | // int->int or pointer->pointer (we don't want int<->fp or something). |
4082 | 4.39M | if (AddrInst->getOperand(0)->getType()->isIntOrPtrTy() && |
4083 | 4.39M | // Don't touch identity bitcasts. These were probably put here by LSR, |
4084 | 4.39M | // and we don't want to mess around with them. Assume it knows what it |
4085 | 4.39M | // is doing. |
4086 | 4.39M | AddrInst->getOperand(0)->getType() != AddrInst->getType()4.39M ) |
4087 | 4.37M | return matchAddr(AddrInst->getOperand(0), Depth); |
4088 | 13.0k | return false; |
4089 | 13.0k | case Instruction::AddrSpaceCast: { |
4090 | 315 | unsigned SrcAS |
4091 | 315 | = AddrInst->getOperand(0)->getType()->getPointerAddressSpace(); |
4092 | 315 | unsigned DestAS = AddrInst->getType()->getPointerAddressSpace(); |
4093 | 315 | if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS)) |
4094 | 104 | return matchAddr(AddrInst->getOperand(0), Depth); |
4095 | 211 | return false; |
4096 | 211 | } |
4097 | 48.1k | case Instruction::Add: { |
4098 | 48.1k | // Check to see if we can merge in the RHS then the LHS. If so, we win. |
4099 | 48.1k | ExtAddrMode BackupAddrMode = AddrMode; |
4100 | 48.1k | unsigned OldSize = AddrModeInsts.size(); |
4101 | 48.1k | // Start a transaction at this point. |
4102 | 48.1k | // The LHS may match but not the RHS. |
4103 | 48.1k | // Therefore, we need a higher level restoration point to undo partially |
4104 | 48.1k | // matched operation. |
4105 | 48.1k | TypePromotionTransaction::ConstRestorationPt LastKnownGood = |
4106 | 48.1k | TPT.getRestorationPoint(); |
4107 | 48.1k | |
4108 | 48.1k | AddrMode.InBounds = false; |
4109 | 48.1k | if (matchAddr(AddrInst->getOperand(1), Depth+1) && |
4110 | 48.1k | matchAddr(AddrInst->getOperand(0), Depth+1)46.6k ) |
4111 | 24.6k | return true; |
4112 | 23.5k | |
4113 | 23.5k | // Restore the old addr mode info. |
4114 | 23.5k | AddrMode = BackupAddrMode; |
4115 | 23.5k | AddrModeInsts.resize(OldSize); |
4116 | 23.5k | TPT.rollback(LastKnownGood); |
4117 | 23.5k | |
4118 | 23.5k | // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. |
4119 | 23.5k | if (matchAddr(AddrInst->getOperand(0), Depth+1) && |
4120 | 23.5k | matchAddr(AddrInst->getOperand(1), Depth+1)12.4k ) |
4121 | 39 | return true; |
4122 | 23.5k | |
4123 | 23.5k | // Otherwise we definitely can't merge the ADD in. |
4124 | 23.5k | AddrMode = BackupAddrMode; |
4125 | 23.5k | AddrModeInsts.resize(OldSize); |
4126 | 23.5k | TPT.rollback(LastKnownGood); |
4127 | 23.5k | break; |
4128 | 23.5k | } |
4129 | 23.5k | //case Instruction::Or: |
4130 | 23.5k | // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD. |
4131 | 23.5k | //break; |
4132 | 99.8k | case Instruction::Mul: |
4133 | 99.8k | case Instruction::Shl: { |
4134 | 99.8k | // Can only handle X*C and X << C. |
4135 | 99.8k | AddrMode.InBounds = false; |
4136 | 99.8k | ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1)); |
4137 | 99.8k | if (!RHS || RHS->getBitWidth() > 6488.8k ) |
4138 | 11.0k | return false; |
4139 | 88.8k | int64_t Scale = RHS->getSExtValue(); |
4140 | 88.8k | if (Opcode == Instruction::Shl) |
4141 | 57.4k | Scale = 1LL << Scale; |
4142 | 88.8k | |
4143 | 88.8k | return matchScaledValue(AddrInst->getOperand(0), Scale, Depth); |
4144 | 88.8k | } |
4145 | 7.06M | case Instruction::GetElementPtr: { |
4146 | 7.06M | // Scan the GEP. We check whether it contains constant offsets and at most
4147 | 7.06M | // one variable offset. |
4148 | 7.06M | int VariableOperand = -1; |
4149 | 7.06M | unsigned VariableScale = 0; |
4150 | 7.06M | |
4151 | 7.06M | int64_t ConstantOffset = 0; |
4152 | 7.06M | gep_type_iterator GTI = gep_type_begin(AddrInst); |
4153 | 20.7M | for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI13.6M ) { |
4154 | 13.6M | if (StructType *STy = GTI.getStructTypeOrNull()) { |
4155 | 5.01M | const StructLayout *SL = DL.getStructLayout(STy); |
4156 | 5.01M | unsigned Idx = |
4157 | 5.01M | cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue(); |
4158 | 5.01M | ConstantOffset += SL->getElementOffset(Idx); |
4159 | 8.66M | } else { |
4160 | 8.66M | uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType()); |
4161 | 8.66M | if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { |
4162 | 7.73M | const APInt &CVal = CI->getValue(); |
4163 | 7.73M | if (CVal.getMinSignedBits() <= 64) { |
4164 | 7.73M | ConstantOffset += CVal.getSExtValue() * TypeSize; |
4165 | 7.73M | continue; |
4166 | 7.73M | } |
4167 | 928k | } |
4168 | 928k | if (TypeSize) { // Scales of zero don't do anything. |
4169 | 928k | // We only allow one variable index at the moment. |
4170 | 928k | if (VariableOperand != -1) |
4171 | 18.9k | return false; |
4172 | 909k | |
4173 | 909k | // Remember the variable index. |
4174 | 909k | VariableOperand = i; |
4175 | 909k | VariableScale = TypeSize; |
4176 | 909k | } |
4177 | 928k | } |
4178 | 13.6M | } |
4179 | 7.06M | |
4180 | 7.06M | // A common case is for the GEP to only do a constant offset. In this case, |
4181 | 7.06M | // just add it to the disp field and check validity. |
4182 | 7.06M | if (7.04M VariableOperand == -17.04M ) { |
4183 | 6.15M | AddrMode.BaseOffs += ConstantOffset; |
4184 | 6.15M | if (ConstantOffset == 0 || |
4185 | 6.15M | TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)6.12M ) { |
4186 | 6.10M | // Check to see if we can fold the base pointer in too. |
4187 | 6.10M | if (matchAddr(AddrInst->getOperand(0), Depth+1)) { |
4188 | 6.10M | if (!cast<GEPOperator>(AddrInst)->isInBounds()) |
4189 | 1.71M | AddrMode.InBounds = false; |
4190 | 6.10M | return true; |
4191 | 6.10M | } |
4192 | 56.8k | } else if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) && |
4193 | 56.8k | TLI.shouldConsiderGEPOffsetSplit()51.7k && Depth == 048.0k && |
4194 | 56.8k | ConstantOffset > 026.0k ) { |
4195 | 25.7k | // Record GEPs with non-zero offsets as candidates for splitting in the |
4196 | 25.7k | // event that the offset cannot fit into the r+i addressing mode. |
4197 | 25.7k | // Simple and common case that only one GEP is used in calculating the |
4198 | 25.7k | // address for the memory access. |
4199 | 25.7k | Value *Base = AddrInst->getOperand(0); |
4200 | 25.7k | auto *BaseI = dyn_cast<Instruction>(Base); |
4201 | 25.7k | auto *GEP = cast<GetElementPtrInst>(AddrInst); |
4202 | 25.7k | if (isa<Argument>(Base) || isa<GlobalValue>(Base)19.3k || |
4203 | 25.7k | (19.3k BaseI19.3k && !isa<CastInst>(BaseI)19.3k && |
4204 | 19.9k | !isa<GetElementPtrInst>(BaseI)14.0k )) { |
4205 | 19.9k | // Make sure the parent block allows inserting non-PHI instructions |
4206 | 19.9k | // before the terminator. |
4207 | 19.9k | BasicBlock *Parent = |
4208 | 19.9k | BaseI ? BaseI->getParent()13.4k : &GEP->getFunction()->getEntryBlock()6.46k ; |
4209 | 19.9k | if (!Parent->getTerminator()->isEHPad()) |
4210 | 19.9k | LargeOffsetGEP = std::make_pair(GEP, ConstantOffset); |
4211 | 19.9k | } |
4212 | 25.7k | } |
4213 | 6.15M | AddrMode.BaseOffs -= ConstantOffset; |
4214 | 56.8k | return false; |
4215 | 890k | } |
4216 | 890k | |
4217 | 890k | // Save the valid addressing mode in case we can't match. |
4218 | 890k | ExtAddrMode BackupAddrMode = AddrMode; |
4219 | 890k | unsigned OldSize = AddrModeInsts.size(); |
4220 | 890k | |
4221 | 890k | // See if the scale and offset amount is valid for this target. |
4222 | 890k | AddrMode.BaseOffs += ConstantOffset; |
4223 | 890k | if (!cast<GEPOperator>(AddrInst)->isInBounds()) |
4224 | 382k | AddrMode.InBounds = false; |
4225 | 890k | |
4226 | 890k | // Match the base operand of the GEP. |
4227 | 890k | if (!matchAddr(AddrInst->getOperand(0), Depth+1)) { |
4228 | 1.34k | // If it couldn't be matched, just stuff the value in a register. |
4229 | 1.34k | if (AddrMode.HasBaseReg) { |
4230 | 46 | AddrMode = BackupAddrMode; |
4231 | 46 | AddrModeInsts.resize(OldSize); |
4232 | 46 | return false; |
4233 | 46 | } |
4234 | 1.30k | AddrMode.HasBaseReg = true; |
4235 | 1.30k | AddrMode.BaseReg = AddrInst->getOperand(0); |
4236 | 1.30k | } |
4237 | 890k | |
4238 | 890k | // Match the remaining variable portion of the GEP. |
4239 | 890k | if (890k !matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, |
4240 | 890k | Depth)) { |
4241 | 380k | // If it couldn't be matched, try stuffing the base into a register |
4242 | 380k | // instead of matching it, and retrying the match of the scale. |
4243 | 380k | AddrMode = BackupAddrMode; |
4244 | 380k | AddrModeInsts.resize(OldSize); |
4245 | 380k | if (AddrMode.HasBaseReg) |
4246 | 102 | return false; |
4247 | 380k | AddrMode.HasBaseReg = true; |
4248 | 380k | AddrMode.BaseReg = AddrInst->getOperand(0); |
4249 | 380k | AddrMode.BaseOffs += ConstantOffset; |
4250 | 380k | if (!matchScaledValue(AddrInst->getOperand(VariableOperand), |
4251 | 380k | VariableScale, Depth)) { |
4252 | 311k | // If even that didn't work, bail. |
4253 | 311k | AddrMode = BackupAddrMode; |
4254 | 311k | AddrModeInsts.resize(OldSize); |
4255 | 311k | return false; |
4256 | 311k | } |
4257 | 578k | } |
4258 | 578k | |
4259 | 578k | return true; |
4260 | 578k | } |
4261 | 578k | case Instruction::SExt: |
4262 | 65.3k | case Instruction::ZExt: { |
4263 | 65.3k | Instruction *Ext = dyn_cast<Instruction>(AddrInst); |
4264 | 65.3k | if (!Ext) |
4265 | 1 | return false; |
4266 | 65.2k | |
4267 | 65.2k | // Try to move this ext out of the way of the addressing mode. |
4268 | 65.2k | // Ask for a method for doing so. |
4269 | 65.2k | TypePromotionHelper::Action TPH = |
4270 | 65.2k | TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts); |
4271 | 65.2k | if (!TPH) |
4272 | 53.4k | return false; |
4273 | 11.8k | |
4274 | 11.8k | TypePromotionTransaction::ConstRestorationPt LastKnownGood = |
4275 | 11.8k | TPT.getRestorationPoint(); |
4276 | 11.8k | unsigned CreatedInstsCost = 0; |
4277 | 11.8k | unsigned ExtCost = !TLI.isExtFree(Ext); |
4278 | 11.8k | Value *PromotedOperand = |
4279 | 11.8k | TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI); |
4280 | 11.8k | // SExt has been moved away. |
4281 | 11.8k | // Thus either it will be rematched later in the recursive calls or it is |
4282 | 11.8k | // gone. Anyway, we must not fold it into the addressing mode at this point. |
4283 | 11.8k | // E.g., |
4284 | 11.8k | // op = add opnd, 1 |
4285 | 11.8k | // idx = ext op |
4286 | 11.8k | // addr = gep base, idx |
4287 | 11.8k | // is now: |
4288 | 11.8k | // promotedOpnd = ext opnd <- no match here |
4289 | 11.8k | // op = promoted_add promotedOpnd, 1 <- match (later in recursive calls) |
4290 | 11.8k | // addr = gep base, op <- match |
4291 | 11.8k | if (MovedAway) |
4292 | 11.8k | *MovedAway = true; |
4293 | 11.8k | |
4294 | 11.8k | assert(PromotedOperand && |
4295 | 11.8k | "TypePromotionHelper should have filtered out those cases"); |
4296 | 11.8k | |
4297 | 11.8k | ExtAddrMode BackupAddrMode = AddrMode; |
4298 | 11.8k | unsigned OldSize = AddrModeInsts.size(); |
4299 | 11.8k | |
4300 | 11.8k | if (!matchAddr(PromotedOperand, Depth) || |
4301 | 11.8k | // The total of the new cost is equal to the cost of the created |
4302 | 11.8k | // instructions. |
4303 | 11.8k | // The total of the old cost is equal to the cost of the extension plus |
4304 | 11.8k | // what we have saved in the addressing mode. |
4305 | 11.8k | !isPromotionProfitable(CreatedInstsCost, |
4306 | 6.25k | ExtCost + (AddrModeInsts.size() - OldSize), |
4307 | 10.7k | PromotedOperand)) { |
4308 | 10.7k | AddrMode = BackupAddrMode; |
4309 | 10.7k | AddrModeInsts.resize(OldSize); |
4310 | 10.7k | LLVM_DEBUG(dbgs() << "Sign extension does not pay off: rollback\n"); |
4311 | 10.7k | TPT.rollback(LastKnownGood); |
4312 | 10.7k | return false; |
4313 | 10.7k | } |
4314 | 1.03k | return true; |
4315 | 1.03k | } |
4316 | 5.56M | } |
4317 | 5.56M | return false; |
4318 | 5.56M | } |
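
A worked GEP decomposition for the case above (hypothetical IR; assumes a
4-byte i32 in the data layout):

    // %p = getelementptr { i32, [10 x i32] }, { i32, [10 x i32] }* %s,
    //                    i64 0, i32 1, i64 %i
    // field 1 starts at byte 4    -> ConstantOffset = 4
    // %i indexes 4-byte elements  -> VariableOperand = %i, VariableScale = 4
    // The matcher then tries to legalize the mode [%s + %i*4 + 4].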
4319 | | |
4320 | | /// If we can, try to add the value of 'Addr' into the current addressing mode. |
4321 | | /// If Addr can't be added to AddrMode this returns false and leaves AddrMode |
4322 | | /// unmodified. This assumes that Addr is either a pointer type or intptr_t |
4323 | | /// for the target. |
4324 | | /// |
4325 | 20.3M | bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) { |
4326 | 20.3M | // Start a transaction at this point that we will rollback if the matching |
4327 | 20.3M | // fails. |
4328 | 20.3M | TypePromotionTransaction::ConstRestorationPt LastKnownGood = |
4329 | 20.3M | TPT.getRestorationPoint(); |
4330 | 20.3M | if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) { |
4331 | 42.3k | // Fold in immediates if legal for the target. |
4332 | 42.3k | AddrMode.BaseOffs += CI->getSExtValue(); |
4333 | 42.3k | if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) |
4334 | 30.0k | return true; |
4335 | 12.2k | AddrMode.BaseOffs -= CI->getSExtValue(); |
4336 | 20.2M | } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) { |
4337 | 809k | // If this is a global variable, try to fold it into the addressing mode. |
4338 | 809k | if (!AddrMode.BaseGV) { |
4339 | 809k | AddrMode.BaseGV = GV; |
4340 | 809k | if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) |
4341 | 21.0k | return true; |
4342 | 788k | AddrMode.BaseGV = nullptr; |
4343 | 788k | } |
4344 | 19.4M | } else if (Instruction *I = dyn_cast<Instruction>(Addr)) { |
4345 | 16.9M | ExtAddrMode BackupAddrMode = AddrMode; |
4346 | 16.9M | unsigned OldSize = AddrModeInsts.size(); |
4347 | 16.9M | |
4348 | 16.9M | // Check to see if it is possible to fold this operation. |
4349 | 16.9M | bool MovedAway = false; |
4350 | 16.9M | if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) { |
4351 | 10.8M | // This instruction may have been moved away. If so, there is nothing |
4352 | 10.8M | // to check here. |
4353 | 10.8M | if (MovedAway) |
4354 | 1.03k | return true; |
4355 | 10.8M | // Okay, it's possible to fold this. Check to see if it is actually |
4356 | 10.8M | // *profitable* to do so. We use a simple cost model to avoid increasing |
4357 | 10.8M | // register pressure too much. |
4358 | 10.8M | if (I->hasOneUse() || |
4359 | 10.8M | isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)2.89M ) { |
4360 | 10.7M | AddrModeInsts.push_back(I); |
4361 | 10.7M | return true; |
4362 | 10.7M | } |
4363 | 108k | |
4364 | 108k | // It isn't profitable to do this, roll back. |
4365 | 108k | //cerr << "NOT FOLDING: " << *I; |
4366 | 108k | AddrMode = BackupAddrMode; |
4367 | 108k | AddrModeInsts.resize(OldSize); |
4368 | 108k | TPT.rollback(LastKnownGood); |
4369 | 108k | } |
4370 | 16.9M | } else if (ConstantExpr *2.53M CE2.53M = dyn_cast<ConstantExpr>(Addr)) { |
4371 | 362k | if (matchOperationAddr(CE, CE->getOpcode(), Depth)) |
4372 | 357k | return true; |
4373 | 5.24k | TPT.rollback(LastKnownGood); |
4374 | 2.16M | } else if (isa<ConstantPointerNull>(Addr)) { |
4375 | 2.14k | // Null pointer gets folded without affecting the addressing mode. |
4376 | 2.14k | return true; |
4377 | 2.14k | } |
4378 | 9.14M | |
4379 | 9.14M | // Worst case, the target should support [reg] addressing modes. :)
4380 | 9.14M | if (!AddrMode.HasBaseReg) { |
4381 | 8.66M | AddrMode.HasBaseReg = true; |
4382 | 8.66M | AddrMode.BaseReg = Addr; |
4383 | 8.66M | // Still check for legality in case the target supports [imm] but not [imm+reg].
4384 | 8.66M | if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) |
4385 | 8.65M | return true; |
4386 | 6.10k | AddrMode.HasBaseReg = false; |
4387 | 6.10k | AddrMode.BaseReg = nullptr; |
4388 | 6.10k | } |
4389 | 9.14M | |
4390 | 9.14M | // If the base register is already taken, see if we can do [r+r]. |
4391 | 9.14M | if (490k AddrMode.Scale == 0490k ) { |
4392 | 441k | AddrMode.Scale = 1; |
4393 | 441k | AddrMode.ScaledReg = Addr; |
4394 | 441k | if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) |
4395 | 235k | return true; |
4396 | 206k | AddrMode.Scale = 0; |
4397 | 206k | AddrMode.ScaledReg = nullptr; |
4398 | 206k | } |
4399 | 490k | // Couldn't match. |
4400 | 490k | TPT.rollback(LastKnownGood); |
4401 | 254k | return false; |
4402 | 490k | } |
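
A compact sketch of the fallback ladder implemented above; every step is
validated with TLI.isLegalAddressingMode and rolled back on failure:

    // matchAddr(%a) attempt order:
    //   1. ConstantInt    -> fold into BaseOffs            [imm]
    //   2. GlobalValue    -> fold into BaseGV              [gv]
    //   3. Instruction or ConstantExpr -> matchOperationAddr (recurse)
    //   4. otherwise      -> BaseReg = %a                  [r]
    //   5. base occupied  -> Scale = 1, ScaledReg = %a     [r + r]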
4403 | | |
4404 | | /// Check to see if all uses of OpVal by the specified inline asm call are due |
4405 | | /// to memory operands. If so, return true, otherwise return false. |
4406 | | static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, |
4407 | | const TargetLowering &TLI, |
4408 | 14 | const TargetRegisterInfo &TRI) { |
4409 | 14 | const Function *F = CI->getFunction(); |
4410 | 14 | TargetLowering::AsmOperandInfoVector TargetConstraints = |
4411 | 14 | TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, |
4412 | 14 | ImmutableCallSite(CI)); |
4413 | 14 | |
4414 | 68 | for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i54 ) { |
4415 | 66 | TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; |
4416 | 66 | |
4417 | 66 | // Compute the constraint code and ConstraintType to use. |
4418 | 66 | TLI.ComputeConstraintToUse(OpInfo, SDValue()); |
4419 | 66 | |
4420 | 66 | // If this asm operand is our Value*, and if it isn't an indirect memory |
4421 | 66 | // operand, we can't fold it! |
4422 | 66 | if (OpInfo.CallOperandVal == OpVal && |
4423 | 66 | (28 OpInfo.ConstraintType != TargetLowering::C_Memory28 || |
4424 | 28 | !OpInfo.isIndirect16 )) |
4425 | 12 | return false; |
4426 | 66 | } |
4427 | 14 | |
4428 | 14 | return true2 ; |
4429 | 14 | } |
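
A hypothetical pair of inline asm uses and how the check above treats them
(illustrative IR; exact constraint strings vary by target):

    // foldable: the address is only an indirect memory operand ("*m")
    //   call void asm sideeffect "prefetcht0 $0", "*m"(i32* %addr)
    // not foldable: the pointer is consumed as a register operand ("r")
    //   call void asm sideeffect "", "r"(i32* %addr)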
4430 | | |
4431 | | // Max number of memory uses to look at before aborting the search to conserve |
4432 | | // compile time. |
4433 | | static constexpr int MaxMemoryUsesToScan = 20; |
4434 | | |
4435 | | /// Recursively walk all the uses of I until we find a memory use. |
4436 | | /// If we find an obviously non-foldable instruction, return true. |
4437 | | /// Add the ultimately found memory instructions to MemoryUses. |
4438 | | static bool FindAllMemoryUses( |
4439 | | Instruction *I, |
4440 | | SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses, |
4441 | | SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI, |
4442 | 1.39M | const TargetRegisterInfo &TRI, int SeenInsts = 0) { |
4443 | 1.39M | // If we already considered this instruction, we're done. |
4444 | 1.39M | if (!ConsideredInsts.insert(I).second) |
4445 | 1.63k | return false; |
4446 | 1.38M | |
4447 | 1.38M | // If this is an obviously unfoldable instruction, bail out. |
4448 | 1.38M | if (!MightBeFoldableInst(I)) |
4449 | 38.2k | return true; |
4450 | 1.35M | |
4451 | 1.35M | const bool OptSize = I->getFunction()->hasOptSize(); |
4452 | 1.35M | |
4453 | 1.35M | // Loop over all the uses, recursively processing them. |
4454 | 2.38M | for (Use &U : I->uses()) { |
4455 | 2.38M | // Conservatively return true if we're seeing a large number or a deep chain |
4456 | 2.38M | // of users. This avoids excessive compilation times in pathological cases. |
4457 | 2.38M | if (SeenInsts++ >= MaxMemoryUsesToScan) |
4458 | 22.7k | return true; |
4459 | 2.35M | |
4460 | 2.35M | Instruction *UserI = cast<Instruction>(U.getUser()); |
4461 | 2.35M | if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) { |
4462 | 731k | MemoryUses.push_back(std::make_pair(LI, U.getOperandNo())); |
4463 | 731k | continue; |
4464 | 731k | } |
4465 | 1.62M | |
4466 | 1.62M | if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) { |
4467 | 442k | unsigned opNo = U.getOperandNo(); |
4468 | 442k | if (opNo != StoreInst::getPointerOperandIndex()) |
4469 | 2.42k | return true; // Storing addr, not into addr. |
4470 | 440k | MemoryUses.push_back(std::make_pair(SI, opNo)); |
4471 | 440k | continue; |
4472 | 440k | } |
4473 | 1.18M | |
4474 | 1.18M | if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) { |
4475 | 485 | unsigned opNo = U.getOperandNo(); |
4476 | 485 | if (opNo != AtomicRMWInst::getPointerOperandIndex()) |
4477 | 0 | return true; // Storing addr, not into addr. |
4478 | 485 | MemoryUses.push_back(std::make_pair(RMW, opNo)); |
4479 | 485 | continue; |
4480 | 485 | } |
4481 | 1.18M | |
4482 | 1.18M | if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) { |
4483 | 114 | unsigned opNo = U.getOperandNo(); |
4484 | 114 | if (opNo != AtomicCmpXchgInst::getPointerOperandIndex()) |
4485 | 0 | return true; // Storing addr, not into addr. |
4486 | 114 | MemoryUses.push_back(std::make_pair(CmpX, opNo)); |
4487 | 114 | continue; |
4488 | 114 | } |
4489 | 1.18M | |
4490 | 1.18M | if (CallInst *CI = dyn_cast<CallInst>(UserI)) { |
4491 | 17.8k | // If this is a cold call, we can sink the addressing calculation into |
4492 | 17.8k | // the cold path. See optimizeCallInst |
4493 | 17.8k | if (!OptSize && CI->hasFnAttr(Attribute::Cold)17.8k ) |
4494 | 7 | continue; |
4495 | 17.8k | |
4496 | 17.8k | InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue()); |
4497 | 17.8k | if (!IA) return true17.8k ; |
4498 | 14 | |
4499 | 14 | // If this is a memory operand, we're cool, otherwise bail out. |
4500 | 14 | if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI)) |
4501 | 12 | return true; |
4502 | 2 | continue; |
4503 | 2 | } |
4504 | 1.16M | |
4505 | 1.16M | if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, |
4506 | 1.16M | SeenInsts)) |
4507 | 83.8k | return true; |
4508 | 1.16M | } |
4509 | 1.35M | |
4510 | 1.35M | return false1.22M ; |
4511 | 1.35M | } |
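
A hypothetical use-chain for the walk above (illustrative IR):

    // %a = getelementptr i8, i8* %p, i64 16
    // %v = load i8, i8* %a         ; memory use: recorded in MemoryUses
    // store i8 7, i8* %a           ; pointer operand: recorded as well
    // store i8* %a, i8** %slot     ; %a escapes as a stored value -> true
    // Chains wider or deeper than MaxMemoryUsesToScan (20) also bail out.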
4512 | | |
4513 | | /// Return true if Val is already known to be live at the use site that we're |
4514 | | /// folding it into. If so, there is no cost to include it in the addressing |
4515 | | /// mode. KnownLive1 and KnownLive2 are two values that we know are live at the |
4516 | | /// instruction already. |
4517 | | bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, |
4518 | 4.85M | Value *KnownLive2) { |
4519 | 4.85M | // If Val is either of the known-live values, we know it is live! |
4520 | 4.85M | if (Val == nullptr || Val == KnownLive12.52M || Val == KnownLive22.52M ) |
4521 | 2.33M | return true; |
4522 | 2.52M | |
4523 | 2.52M | // All values other than instructions and arguments (e.g. constants) are live. |
4524 | 2.52M | if (!isa<Instruction>(Val) && !isa<Argument>(Val)357k ) return true18.9k ; |
4525 | 2.50M | |
4526 | 2.50M | // If Val is a constant sized alloca in the entry block, it is live, this is |
4527 | 2.50M | // true because it is just a reference to the stack/frame pointer, which is |
4528 | 2.50M | // live for the whole function. |
4529 | 2.50M | if (AllocaInst *AI = dyn_cast<AllocaInst>(Val)) |
4530 | 115k | if (AI->isStaticAlloca()) |
4531 | 115k | return true; |
4532 | 2.38M | |
4533 | 2.38M | // Check to see if this value is already used in the memory instruction's |
4534 | 2.38M | // block. If so, it's already live into the block at the very least, so we |
4535 | 2.38M | // can reasonably fold it. |
4536 | 2.38M | return Val->isUsedInBasicBlock(MemoryInst->getParent()); |
4537 | 2.38M | } |
4538 | | |
4539 | | /// It is possible for the addressing mode of the machine to fold the specified |
4540 | | /// instruction into a load or store that ultimately uses it. |
4541 | | /// However, the specified instruction has multiple uses. |
4542 | | /// Given this, it may actually increase register pressure to fold it |
4543 | | /// into the load. For example, consider this code: |
4544 | | /// |
4545 | | /// X = ... |
4546 | | /// Y = X+1 |
4547 | | /// use(Y) -> nonload/store |
4548 | | /// Z = Y+1 |
4549 | | /// load Z |
4550 | | /// |
4551 | | /// In this case, Y has multiple uses, and can be folded into the load of Z |
4552 | | /// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to |
4553 | | /// be live at the use(Y) line. If we don't fold Y into load Z, we use one |
4554 | | /// fewer register. Since Y can't be folded into "use(Y)" we don't increase the |
4555 | | /// number of computations either. |
4556 | | /// |
4557 | | /// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If |
4558 | | /// X was live across 'load Z' for other reasons, we actually *would* want to |
4559 | | /// fold the addressing mode in the Z case. This would make Y die earlier. |
4560 | | bool AddressingModeMatcher:: |
4561 | | isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, |
4562 | 2.89M | ExtAddrMode &AMAfter) { |
4563 | 2.89M | if (IgnoreProfitability) return true463k ; |
4564 | 2.42M | |
4565 | 2.42M | // AMBefore is the addressing mode before this instruction was folded into it, |
4566 | 2.42M | // and AMAfter is the addressing mode after the instruction was folded. Get |
4567 | 2.42M | // the set of registers referenced by AMAfter and subtract out those |
4568 | 2.42M | // referenced by AMBefore: this is the set of values which folding in this |
4569 | 2.42M | // address extends the lifetime of. |
4570 | 2.42M | // |
4571 | 2.42M | // Note that there are only two potential values being referenced here, |
4572 | 2.42M | // BaseReg and ScaleReg (global addresses are always available, as are any |
4573 | 2.42M | // folded immediates). |
4574 | 2.42M | Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg; |
4575 | 2.42M | |
4576 | 2.42M | // If the BaseReg or ScaledReg was referenced by the previous addrmode, their |
4577 | 2.42M | // lifetime wasn't extended by adding this instruction. |
4578 | 2.42M | if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) |
4579 | 2.21M | BaseReg = nullptr; |
4580 | 2.42M | if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg)) |
4581 | 2.38M | ScaledReg = nullptr; |
4582 | 2.42M | |
4583 | 2.42M | // If folding this instruction (and its subexprs) didn't extend any live
4584 | 2.42M | // ranges, we're ok with it. |
4585 | 2.42M | if (!BaseReg && !ScaledReg2.21M ) |
4586 | 2.20M | return true; |
4587 | 225k | |
4588 | 225k | // If all uses of this instruction can have the address mode sunk into them, |
4589 | 225k | // we can remove the addressing mode and effectively trade one live register |
4590 | 225k | // for another (at worst.) In this context, folding an addressing mode into |
4591 | 225k | // the use is just a particularly nice way of sinking it. |
4592 | 225k | SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses; |
4593 | 225k | SmallPtrSet<Instruction*, 16> ConsideredInsts; |
4594 | 225k | if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI)) |
4595 | 81.2k | return false; // Has a non-memory, non-foldable use! |
4596 | 144k | |
4597 | 144k | // Now that we know that all uses of this instruction are part of a chain of |
4598 | 144k | // computation involving only operations that could theoretically be folded |
4599 | 144k | // into a memory use, loop over each of these memory operation uses and see |
4600 | 144k | // if they could *actually* fold the instruction. The assumption is that |
4601 | 144k | // addressing modes are cheap and that duplicating the computation involved |
4602 | 144k | // many times is worthwhile, even on a fastpath. For sinking candidates |
4603 | 144k | // (i.e. cold call sites), this serves as a way to prevent excessive code |
4604 | 144k | // growth since most architectures have some reasonable small and fast way to |
4605 | 144k | // compute an effective address. (i.e LEA on x86) |
4606 | 144k | SmallVector<Instruction*, 32> MatchedAddrModeInsts; |
4607 | 553k | for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i408k ) { |
4608 | 435k | Instruction *User = MemoryUses[i].first; |
4609 | 435k | unsigned OpNo = MemoryUses[i].second; |
4610 | 435k | |
4611 | 435k | // Get the access type of this use. If the use isn't a pointer, we don't |
4612 | 435k | // know what it accesses. |
4613 | 435k | Value *Address = User->getOperand(OpNo); |
4614 | 435k | PointerType *AddrTy = dyn_cast<PointerType>(Address->getType()); |
4615 | 435k | if (!AddrTy) |
4616 | 0 | return false; |
4617 | 435k | Type *AddressAccessTy = AddrTy->getElementType(); |
4618 | 435k | unsigned AS = AddrTy->getAddressSpace(); |
4619 | 435k | |
4620 | 435k | // Do a match against the root of this address, ignoring profitability. This |
4621 | 435k | // will tell us if the addressing mode for the memory operation will |
4622 | 435k | // *actually* cover the shared instruction. |
4623 | 435k | ExtAddrMode Result; |
4624 | 435k | std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr, |
4625 | 435k | 0); |
4626 | 435k | TypePromotionTransaction::ConstRestorationPt LastKnownGood = |
4627 | 435k | TPT.getRestorationPoint(); |
4628 | 435k | AddressingModeMatcher Matcher( |
4629 | 435k | MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result, |
4630 | 435k | InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP); |
4631 | 435k | Matcher.IgnoreProfitability = true; |
4632 | 435k | bool Success = Matcher.matchAddr(Address, 0); |
4633 | 435k | (void)Success; assert(Success && "Couldn't select *anything*?"); |
4634 | 435k | |
4635 | 435k | // The match was only to check profitability; the changes made are not
4636 | 435k | // part of the original matcher. Therefore, they should be dropped,
4637 | 435k | // otherwise the original matcher will not reflect the right state.
4638 | 435k | TPT.rollback(LastKnownGood); |
4639 | 435k | |
4640 | 435k | // If the match didn't cover I, then it won't be shared by it. |
4641 | 435k | if (!is_contained(MatchedAddrModeInsts, I)) |
4642 | 26.7k | return false; |
4643 | 408k | |
4644 | 408k | MatchedAddrModeInsts.clear(); |
4645 | 408k | } |
4646 | 144k | |
4647 | 144k | return true;
4648 | 144k | } |
4649 | | |
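To make the live-range trade-off above concrete, a minimal IR sketch (the names, offset, and types are hypothetical, not taken from this file):

    ; %off is an address computation with two memory uses.
    %off = getelementptr i8, i8* %p, i64 16
    %c = bitcast i8* %off to i32*
    %a = load i32, i32* %c
    %b = load i32, i32* %c
    ; If both loads can fold "%p + 16" into their addressing modes, %off never
    ; needs a register of its own: folding it into every use trades, at worst,
    ; one live register for another.
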
4650 | | /// Return true if the specified values are defined in a |
4651 | | /// different basic block than BB. |
4652 | 9.72M | static bool IsNonLocalValue(Value *V, BasicBlock *BB) { |
4653 | 9.72M | if (Instruction *I = dyn_cast<Instruction>(V)) |
4654 | 9.72M | return I->getParent() != BB; |
4655 | 0 | return false; |
4656 | 0 | } |
4657 | | |
4658 | | /// Sink addressing mode computation immediately before MemoryInst if doing so
4659 | | /// can be done without increasing register pressure. The need for the |
4660 | | /// register pressure constraint means this can end up being an all or nothing |
4661 | | /// decision for all uses of the same addressing computation. |
4662 | | /// |
4663 | | /// Load and Store Instructions often have addressing modes that can do |
4664 | | /// significant amounts of computation. As such, instruction selection will try |
4665 | | /// to get the load or store to do as much computation as possible for the |
4666 | | /// program. The problem is that isel can only see within a single block. As |
4667 | | /// such, we sink as much legal addressing mode work into the block as possible. |
4668 | | /// |
4669 | | /// This method is used to optimize both load/store and inline asms with memory |
4670 | | /// operands. It's also used to sink addressing computations feeding into cold |
4671 | | /// call sites into their (cold) basic block. |
4672 | | /// |
4673 | | /// The motivation for handling sinking into cold blocks is that doing so can |
4674 | | /// both enable other address mode sinking (by satisfying the register pressure |
4675 | | /// constraint above), and reduce register pressure globally (by removing the |
4676 | | /// addressing mode computation from the fast path entirely).
4677 | | bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, |
4678 | 7.72M | Type *AccessTy, unsigned AddrSpace) { |
4679 | 7.72M | Value *Repl = Addr; |
4680 | 7.72M | |
4681 | 7.72M | // Try to collapse single-value PHI nodes. This is necessary to undo |
4682 | 7.72M | // unprofitable PRE transformations. |
4683 | 7.72M | SmallVector<Value*, 8> worklist; |
4684 | 7.72M | SmallPtrSet<Value*, 16> Visited; |
4685 | 7.72M | worklist.push_back(Addr); |
4686 | 7.72M | |
4687 | 7.72M | // Use a worklist to iteratively look through PHI and select nodes, and |
4688 | 7.72M | // ensure that the addressing modes obtained from the non-PHI/select roots of
4689 | 7.72M | // the graph are compatible. |
4690 | 7.72M | bool PhiOrSelectSeen = false; |
4691 | 7.72M | SmallVector<Instruction*, 16> AddrModeInsts; |
4692 | 7.72M | const SimplifyQuery SQ(*DL, TLInfo); |
4693 | 7.72M | AddressingModeCombiner AddrModes(SQ, Addr); |
4694 | 7.72M | TypePromotionTransaction TPT(RemovedInsts); |
4695 | 7.72M | TypePromotionTransaction::ConstRestorationPt LastKnownGood = |
4696 | 7.72M | TPT.getRestorationPoint(); |
4697 | 15.6M | while (!worklist.empty()) { |
4698 | 7.98M | Value *V = worklist.back(); |
4699 | 7.98M | worklist.pop_back(); |
4700 | 7.98M | |
4701 | 7.98M | // We allow traversing cyclic Phi nodes. |
4702 | 7.98M | // In case of success, after this loop we ensure that traversing through
4703 | 7.98M | // Phi nodes ends up with all cases computing an address of the form
4704 | 7.98M | //     BaseGV + Base + Scale * Index + Offset
4705 | 7.98M | // where Scale and Offset are constants and BaseGV, Base and Index
4706 | 7.98M | // are exactly the same Values in all cases.
4707 | 7.98M | // It means that BaseGV, Scale and Offset dominate our memory instruction
4708 | 7.98M | // and have the same value as they had in the address computation
4709 | 7.98M | // represented as a Phi, so we can safely sink it to the memory instruction.
4710 | 7.98M | if (!Visited.insert(V).second) |
4711 | 16.1k | continue; |
4712 | 7.97M | |
4713 | 7.97M | // For a PHI node, push all of its incoming values. |
4714 | 7.97M | if (PHINode *P = dyn_cast<PHINode>(V)) { |
4715 | 125k | for (Value *IncValue : P->incoming_values()) |
4716 | 281k | worklist.push_back(IncValue); |
4717 | 125k | PhiOrSelectSeen = true; |
4718 | 125k | continue; |
4719 | 125k | } |
4720 | 7.84M | // Similar for select. |
4721 | 7.84M | if (SelectInst *SI = dyn_cast<SelectInst>(V)) { |
4722 | 10.7k | worklist.push_back(SI->getFalseValue()); |
4723 | 10.7k | worklist.push_back(SI->getTrueValue()); |
4724 | 10.7k | PhiOrSelectSeen = true; |
4725 | 10.7k | continue; |
4726 | 10.7k | } |
4727 | 7.83M | |
4728 | 7.83M | // For non-PHIs, determine the addressing mode being computed. Note that |
4729 | 7.83M | // the result may differ depending on what other uses our candidate |
4730 | 7.83M | // addressing instructions might have. |
4731 | 7.83M | AddrModeInsts.clear(); |
4732 | 7.83M | std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr, |
4733 | 7.83M | 0); |
4734 | 7.83M | ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( |
4735 | 7.83M | V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI, |
4736 | 7.83M | InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP); |
4737 | 7.83M | |
4738 | 7.83M | GetElementPtrInst *GEP = LargeOffsetGEP.first; |
4739 | 7.83M | if (GEP && !NewGEPBases.count(GEP)) {
4740 | 19.7k | // If splitting the underlying data structure can reduce the offset of a |
4741 | 19.7k | // GEP, collect the GEP. Skip the GEPs that are the new bases of |
4742 | 19.7k | // previously split data structures. |
4743 | 19.7k | LargeOffsetGEPMap[GEP->getPointerOperand()].push_back(LargeOffsetGEP); |
4744 | 19.7k | if (LargeOffsetGEPID.find(GEP) == LargeOffsetGEPID.end()) |
4745 | 16.5k | LargeOffsetGEPID[GEP] = LargeOffsetGEPID.size(); |
4746 | 19.7k | } |
4747 | 7.83M | |
4748 | 7.83M | NewAddrMode.OriginalValue = V; |
4749 | 7.83M | if (!AddrModes.addNewAddrMode(NewAddrMode)) |
4750 | 79.7k | break; |
4751 | 7.83M | } |
4752 | 7.72M | |
4753 | 7.72M | // Try to combine the AddrModes we've collected. If we couldn't collect any, |
4754 | 7.72M | // or we have multiple but either couldn't combine them or combining them |
4755 | 7.72M | // wouldn't do anything useful, bail out now. |
4756 | 7.72M | if (!AddrModes.combineAddrModes()) { |
4757 | 91.2k | TPT.rollback(LastKnownGood); |
4758 | 91.2k | return false; |
4759 | 91.2k | } |
4760 | 7.63M | TPT.commit(); |
4761 | 7.63M | |
4762 | 7.63M | // Get the combined AddrMode (or the only AddrMode, if we only had one). |
4763 | 7.63M | ExtAddrMode AddrMode = AddrModes.getAddrMode(); |
4764 | 7.63M | |
4765 | 7.63M | // If all the instructions matched are already in this BB, don't do anything. |
4766 | 7.63M | // If we saw a Phi node then it is definitely not local, and if we saw a select
4767 | 7.63M | // then we want to push the address calculation past it even if it's already
4768 | 7.63M | // in this BB.
4769 | 9.72M | if (!PhiOrSelectSeen && none_of(AddrModeInsts, [&](Value *V) {
4770 | 9.72M | return IsNonLocalValue(V, MemoryInst->getParent()); |
4771 | 9.72M | })) { |
4772 | 7.32M | LLVM_DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode |
4773 | 7.32M | << "\n"); |
4774 | 7.32M | return false; |
4775 | 7.32M | } |
4776 | 313k | |
4777 | 313k | // Insert this computation right after this user. Since our caller is |
4778 | 313k | // scanning from the top of the BB to the bottom, any reuse of the expr is
4779 | 313k | // guaranteed to happen later.
4780 | 313k | IRBuilder<> Builder(MemoryInst); |
4781 | 313k | |
4782 | 313k | // Now that we've determined the addressing expression we want to use and
4783 | 313k | // know that we have to sink it into this block, check to see if we have
4784 | 313k | // already done this for some other load/store instr in this block. If so,
4785 | 313k | // reuse the computation. Before attempting reuse, check if the address is
4786 | 313k | // valid as it may have been erased.
4787 | 313k | |
4788 | 313k | WeakTrackingVH SunkAddrVH = SunkAddrs[Addr]; |
4789 | 313k | |
4790 | 313k | Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
4791 | 313k | if (SunkAddr) { |
4792 | 22.1k | LLVM_DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode |
4793 | 22.1k | << " for " << *MemoryInst << "\n"); |
4794 | 22.1k | if (SunkAddr->getType() != Addr->getType()) |
4795 | 0 | SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); |
4796 | 291k | } else if (AddrSinkUsingGEPs || |
4797 | 291k | (!AddrSinkUsingGEPs.getNumOccurrences() && TM && TTI->useAA())) {
4798 | 291k | // By default, we use the GEP-based method when AA is used later. This |
4799 | 291k | // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities. |
4800 | 291k | LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode |
4801 | 291k | << " for " << *MemoryInst << "\n"); |
4802 | 291k | Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); |
4803 | 291k | Value *ResultPtr = nullptr, *ResultIndex = nullptr; |
4804 | 291k | |
4805 | 291k | // First, find the pointer. |
4806 | 291k | if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) {
4807 | 288k | ResultPtr = AddrMode.BaseReg; |
4808 | 288k | AddrMode.BaseReg = nullptr; |
4809 | 288k | } |
4810 | 291k | |
4811 | 291k | if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) {
4812 | 0 | // We can't add more than one pointer together, nor can we scale a |
4813 | 0 | // pointer (both of which seem meaningless). |
4814 | 0 | if (ResultPtr || AddrMode.Scale != 1) |
4815 | 0 | return false; |
4816 | 0 | |
4817 | 0 | ResultPtr = AddrMode.ScaledReg; |
4818 | 0 | AddrMode.Scale = 0; |
4819 | 0 | } |
4820 | 291k | |
4821 | 291k | // It is only safe to sign extend the BaseReg if we know that the math |
4822 | 291k | // required to create it did not overflow before we extend it. Since |
4823 | 291k | // the original IR value was tossed in favor of a constant back when |
4824 | 291k | // the AddrMode was created we need to bail out gracefully if widths |
4825 | 291k | // do not match instead of extending it. |
4826 | 291k | // |
4827 | 291k | // (See below for code to add the scale.) |
4828 | 291k | if (AddrMode.Scale) { |
4829 | 20.0k | Type *ScaledRegTy = AddrMode.ScaledReg->getType(); |
4830 | 20.0k | if (cast<IntegerType>(IntPtrTy)->getBitWidth() > |
4831 | 20.0k | cast<IntegerType>(ScaledRegTy)->getBitWidth()) |
4832 | 10 | return false; |
4833 | 291k | } |
4834 | 291k | |
4835 | 291k | if (AddrMode.BaseGV) { |
4836 | 21 | if (ResultPtr) |
4837 | 0 | return false; |
4838 | 21 | |
4839 | 21 | ResultPtr = AddrMode.BaseGV; |
4840 | 21 | } |
4841 | 291k | |
4842 | 291k | // If the real base value actually came from an inttoptr, then the matcher |
4843 | 291k | // will look through it and provide only the integer value. In that case, |
4844 | 291k | // use it here. |
4845 | 291k | if (!DL->isNonIntegralPointerType(Addr->getType())) { |
4846 | 291k | if (!ResultPtr && AddrMode.BaseReg) {
4847 | 3.05k | ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), |
4848 | 3.05k | "sunkaddr"); |
4849 | 3.05k | AddrMode.BaseReg = nullptr; |
4850 | 288k | } else if (!ResultPtr && AddrMode.Scale == 1) {
4851 | 0 | ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), |
4852 | 0 | "sunkaddr"); |
4853 | 0 | AddrMode.Scale = 0; |
4854 | 0 | } |
4855 | 291k | } |
4856 | 291k | |
4857 | 291k | if (!ResultPtr && |
4858 | 291k | !AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) {
4859 | 3 | SunkAddr = Constant::getNullValue(Addr->getType()); |
4860 | 291k | } else if (!ResultPtr) { |
4861 | 8 | return false; |
4862 | 291k | } else { |
4863 | 291k | Type *I8PtrTy = |
4864 | 291k | Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace()); |
4865 | 291k | Type *I8Ty = Builder.getInt8Ty(); |
4866 | 291k | |
4867 | 291k | // Start with the base register. Do this first so that subsequent address |
4868 | 291k | // matching finds it last, which will prevent it from trying to match it |
4869 | 291k | // as the scaled value in case it happens to be a mul. That would be |
4870 | 291k | // problematic if we've sunk a different mul for the scale, because then |
4871 | 291k | // we'd end up sinking both muls. |
4872 | 291k | if (AddrMode.BaseReg) { |
4873 | 2 | Value *V = AddrMode.BaseReg; |
4874 | 2 | if (V->getType() != IntPtrTy) |
4875 | 2 | V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); |
4876 | 2 | |
4877 | 2 | ResultIndex = V; |
4878 | 2 | } |
4879 | 291k | |
4880 | 291k | // Add the scale value. |
4881 | 291k | if (AddrMode.Scale) { |
4882 | 20.0k | Value *V = AddrMode.ScaledReg; |
4883 | 20.0k | if (V->getType() == IntPtrTy) { |
4884 | 20.0k | // done. |
4885 | 20.0k | } else { |
4886 | 2 | assert(cast<IntegerType>(IntPtrTy)->getBitWidth() < |
4887 | 2 | cast<IntegerType>(V->getType())->getBitWidth() && |
4888 | 2 | "We can't transform if ScaledReg is too narrow"); |
4889 | 2 | V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); |
4890 | 2 | } |
4891 | 20.0k | |
4892 | 20.0k | if (AddrMode.Scale != 1) |
4893 | 14.0k | V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), |
4894 | 14.0k | "sunkaddr"); |
4895 | 20.0k | if (ResultIndex) |
4896 | 0 | ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr"); |
4897 | 20.0k | else |
4898 | 20.0k | ResultIndex = V; |
4899 | 20.0k | } |
4900 | 291k | |
4901 | 291k | // Add in the Base Offset if present. |
4902 | 291k | if (AddrMode.BaseOffs) { |
4903 | 279k | Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); |
4904 | 279k | if (ResultIndex) { |
4905 | 8.87k | // We need to add this separately from the scale above to help with |
4906 | 8.87k | // SDAG consecutive load/store merging. |
4907 | 8.87k | if (ResultPtr->getType() != I8PtrTy) |
4908 | 5.02k | ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); |
4909 | 8.87k | ResultPtr = |
4910 | 8.87k | AddrMode.InBounds |
4911 | 8.87k | ? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex, |
4912 | 393 | "sunkaddr") |
4913 | 8.87k | : Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
4914 | 8.87k | } |
4915 | 279k | |
4916 | 279k | ResultIndex = V; |
4917 | 279k | } |
4918 | 291k | |
4919 | 291k | if (!ResultIndex) { |
4920 | 854 | SunkAddr = ResultPtr; |
4921 | 290k | } else { |
4922 | 290k | if (ResultPtr->getType() != I8PtrTy) |
4923 | 271k | ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); |
4924 | 290k | SunkAddr = |
4925 | 290k | AddrMode.InBounds |
4926 | 290k | ? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex, |
4927 | 275k | "sunkaddr") |
4928 | 290k | : Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
4929 | 290k | } |
4930 | 291k | |
4931 | 291k | if (SunkAddr->getType() != Addr->getType()) |
4932 | 262k | SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); |
4933 | 291k | } |
4934 | 291k | } else { |
4935 | 5 | // We'd require a ptrtoint/inttoptr down the line, which we can't do for |
4936 | 5 | // non-integral pointers, so in that case bail out now. |
4937 | 5 | Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr;
4938 | 5 | Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr;
4939 | 5 | PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy); |
4940 | 5 | PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy); |
4941 | 5 | if (DL->isNonIntegralPointerType(Addr->getType()) || |
4942 | 5 | (BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) ||
4943 | 5 | (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) ||
4944 | 5 | (AddrMode.BaseGV &&
4945 | 0 | DL->isNonIntegralPointerType(AddrMode.BaseGV->getType()))) |
4946 | 5 | return false; |
4947 | 0 | |
4948 | 0 | LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode |
4949 | 0 | << " for " << *MemoryInst << "\n"); |
4950 | 0 | Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); |
4951 | 0 | Value *Result = nullptr; |
4952 | 0 |
4953 | 0 | // Start with the base register. Do this first so that subsequent address |
4954 | 0 | // matching finds it last, which will prevent it from trying to match it |
4955 | 0 | // as the scaled value in case it happens to be a mul. That would be |
4956 | 0 | // problematic if we've sunk a different mul for the scale, because then |
4957 | 0 | // we'd end up sinking both muls. |
4958 | 0 | if (AddrMode.BaseReg) { |
4959 | 0 | Value *V = AddrMode.BaseReg; |
4960 | 0 | if (V->getType()->isPointerTy()) |
4961 | 0 | V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); |
4962 | 0 | if (V->getType() != IntPtrTy) |
4963 | 0 | V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); |
4964 | 0 | Result = V; |
4965 | 0 | } |
4966 | 0 |
4967 | 0 | // Add the scale value. |
4968 | 0 | if (AddrMode.Scale) { |
4969 | 0 | Value *V = AddrMode.ScaledReg; |
4970 | 0 | if (V->getType() == IntPtrTy) { |
4971 | 0 | // done. |
4972 | 0 | } else if (V->getType()->isPointerTy()) { |
4973 | 0 | V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); |
4974 | 0 | } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() < |
4975 | 0 | cast<IntegerType>(V->getType())->getBitWidth()) { |
4976 | 0 | V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); |
4977 | 0 | } else { |
4978 | 0 | // It is only safe to sign extend the BaseReg if we know that the math |
4979 | 0 | // required to create it did not overflow before we extend it. Since |
4980 | 0 | // the original IR value was tossed in favor of a constant back when |
4981 | 0 | // the AddrMode was created we need to bail out gracefully if widths |
4982 | 0 | // do not match instead of extending it. |
4983 | 0 | Instruction *I = dyn_cast_or_null<Instruction>(Result); |
4984 | 0 | if (I && (Result != AddrMode.BaseReg)) |
4985 | 0 | I->eraseFromParent(); |
4986 | 0 | return false; |
4987 | 0 | } |
4988 | 0 | if (AddrMode.Scale != 1) |
4989 | 0 | V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), |
4990 | 0 | "sunkaddr"); |
4991 | 0 | if (Result) |
4992 | 0 | Result = Builder.CreateAdd(Result, V, "sunkaddr"); |
4993 | 0 | else |
4994 | 0 | Result = V; |
4995 | 0 | } |
4996 | 0 |
4997 | 0 | // Add in the BaseGV if present. |
4998 | 0 | if (AddrMode.BaseGV) { |
4999 | 0 | Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr"); |
5000 | 0 | if (Result) |
5001 | 0 | Result = Builder.CreateAdd(Result, V, "sunkaddr"); |
5002 | 0 | else |
5003 | 0 | Result = V; |
5004 | 0 | } |
5005 | 0 |
|
5006 | 0 | // Add in the Base Offset if present. |
5007 | 0 | if (AddrMode.BaseOffs) { |
5008 | 0 | Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); |
5009 | 0 | if (Result) |
5010 | 0 | Result = Builder.CreateAdd(Result, V, "sunkaddr"); |
5011 | 0 | else |
5012 | 0 | Result = V; |
5013 | 0 | } |
5014 | 0 |
5015 | 0 | if (!Result) |
5016 | 0 | SunkAddr = Constant::getNullValue(Addr->getType()); |
5017 | 0 | else |
5018 | 0 | SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr"); |
5019 | 0 | } |
5020 | 313k | |
5021 | 313k | MemoryInst->replaceUsesOfWith(Repl, SunkAddr); |
5022 | 313k | // Store the newly computed address into the cache. In the case we reused a |
5023 | 313k | // value, this should be idempotent. |
5024 | 313k | SunkAddrs[Addr] = WeakTrackingVH(SunkAddr); |
5025 | 313k | |
5026 | 313k | // If we have no uses, recursively delete the value and all dead instructions |
5027 | 313k | // using it. |
5028 | 313k | if (Repl->use_empty()) { |
5029 | 132k | // This can cause recursive deletion, which can invalidate our iterator. |
5030 | 132k | // Use a WeakTrackingVH to hold onto it in case this happens. |
5031 | 132k | Value *CurValue = &*CurInstIterator; |
5032 | 132k | WeakTrackingVH IterHandle(CurValue); |
5033 | 132k | BasicBlock *BB = CurInstIterator->getParent(); |
5034 | 132k | |
5035 | 132k | RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo); |
5036 | 132k | |
5037 | 132k | if (IterHandle != CurValue) { |
5038 | 1 | // If the iterator instruction was recursively deleted, start over at the |
5039 | 1 | // start of the block. |
5040 | 1 | CurInstIterator = BB->begin(); |
5041 | 1 | SunkAddrs.clear(); |
5042 | 1 | } |
5043 | 132k | } |
5044 | 313k | ++NumMemoryInsts; |
5045 | 313k | return true; |
5046 | 313k | } |
5047 | | |
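A sketch of what the GEP-based sinking path above produces, assuming a simple base-plus-offset address; the blocks and byte offset are illustrative, and the "sunkaddr" names mirror the names passed to the IRBuilder calls:

    ; Before: the address is computed in a predecessor block.
    bb0:
      %addr = getelementptr inbounds i8, i8* %base, i64 8
      br label %bb1
    bb1:
      %cast = bitcast i8* %addr to i32*
      %v = load i32, i32* %cast
    ; After: the computation is rematerialized next to the load.
    bb1:
      %sunkaddr = getelementptr inbounds i8, i8* %base, i64 8
      %sunkaddr1 = bitcast i8* %sunkaddr to i32*
      %v = load i32, i32* %sunkaddr1
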
5048 | | /// If there are any memory operands, use OptimizeMemoryInst to sink their |
5049 | | /// address computation into the block when possible / profitable.
5050 | 26.9k | bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { |
5051 | 26.9k | bool MadeChange = false; |
5052 | 26.9k | |
5053 | 26.9k | const TargetRegisterInfo *TRI = |
5054 | 26.9k | TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo(); |
5055 | 26.9k | TargetLowering::AsmOperandInfoVector TargetConstraints = |
5056 | 26.9k | TLI->ParseConstraints(*DL, TRI, CS); |
5057 | 26.9k | unsigned ArgNo = 0; |
5058 | 156k | for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i129k ) { |
5059 | 129k | TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; |
5060 | 129k | |
5061 | 129k | // Compute the constraint code and ConstraintType to use. |
5062 | 129k | TLI->ComputeConstraintToUse(OpInfo, SDValue()); |
5063 | 129k | |
5064 | 129k | if (OpInfo.ConstraintType == TargetLowering::C_Memory && |
5065 | 129k | OpInfo.isIndirect) {
5066 | 482 | Value *OpVal = CS->getArgOperand(ArgNo++); |
5067 | 482 | MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u); |
5068 | 129k | } else if (OpInfo.Type == InlineAsm::isInput) |
5069 | 6.41k | ArgNo++; |
5070 | 129k | } |
5071 | 26.9k | |
5072 | 26.9k | return MadeChange; |
5073 | 26.9k | } |
5074 | | |
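For the inline-asm path above, a hypothetical indirect memory operand ("*m") whose address feeds optimizeMemoryInst; the constraint string and GEP are illustrative only:

    %slot = getelementptr inbounds [16 x i32], [16 x i32]* %buf, i64 0, i64 4
    call void asm sideeffect "movl $$1, $0", "=*m"(i32* %slot)
    ; The operand is C_Memory and isIndirect, so the GEP feeding %slot is a
    ; candidate for being sunk next to the asm call.
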
5075 | | /// Check if all the uses of \p Val are equivalent (or free) zero or |
5076 | | /// sign extensions. |
5077 | 13.6k | static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) { |
5078 | 13.6k | assert(!Val->use_empty() && "Input must have at least one use"); |
5079 | 13.6k | const Instruction *FirstUser = cast<Instruction>(*Val->user_begin()); |
5080 | 13.6k | bool IsSExt = isa<SExtInst>(FirstUser); |
5081 | 13.6k | Type *ExtTy = FirstUser->getType(); |
5082 | 36.5k | for (const User *U : Val->users()) { |
5083 | 36.5k | const Instruction *UI = cast<Instruction>(U); |
5084 | 36.5k | if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
5085 | 11.7k | return false; |
5086 | 24.8k | Type *CurTy = UI->getType(); |
5087 | 24.8k | // Same input and output types: Same instruction after CSE. |
5088 | 24.8k | if (CurTy == ExtTy) |
5089 | 24.5k | continue; |
5090 | 284 | |
5091 | 284 | // If IsSExt is true, we are in this situation: |
5092 | 284 | // a = Val |
5093 | 284 | // b = sext ty1 a to ty2 |
5094 | 284 | // c = sext ty1 a to ty3 |
5095 | 284 | // Assuming ty2 is shorter than ty3, this could be turned into: |
5096 | 284 | // a = Val |
5097 | 284 | // b = sext ty1 a to ty2 |
5098 | 284 | // c = sext ty2 b to ty3 |
5099 | 284 | // However, the last sext is not free. |
5100 | 284 | if (IsSExt) |
5101 | 196 | return false; |
5102 | 88 | |
5103 | 88 | // This is a ZExt; maybe extending from one type to the other is free.
5104 | 88 | // In that case, we would not account for a different use. |
5105 | 88 | Type *NarrowTy; |
5106 | 88 | Type *LargeTy; |
5107 | 88 | if (ExtTy->getScalarType()->getIntegerBitWidth() > |
5108 | 88 | CurTy->getScalarType()->getIntegerBitWidth()) { |
5109 | 88 | NarrowTy = CurTy; |
5110 | 88 | LargeTy = ExtTy; |
5111 | 88 | } else { |
5112 | 0 | NarrowTy = ExtTy; |
5113 | 0 | LargeTy = CurTy; |
5114 | 0 | } |
5115 | 88 | |
5116 | 88 | if (!TLI.isZExtFree(NarrowTy, LargeTy)) |
5117 | 0 | return false; |
5118 | 88 | } |
5119 | 13.6k | // All uses are the same or can be derived from one another for free. |
5120 | 13.6k | return true;
5121 | 13.6k | } |
5122 | | |
5123 | | /// Try to speculatively promote extensions in \p Exts and continue |
5124 | | /// promoting through newly promoted operands recursively as far as doing so is |
5125 | | /// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts. |
5126 | | /// When some promotion happened, \p TPT contains the proper state to revert |
5127 | | /// them. |
5128 | | /// |
5129 | | /// \return true if some promotion happened, false otherwise. |
5130 | | bool CodeGenPrepare::tryToPromoteExts( |
5131 | | TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts, |
5132 | | SmallVectorImpl<Instruction *> &ProfitablyMovedExts, |
5133 | 909k | unsigned CreatedInstsCost) { |
5134 | 909k | bool Promoted = false; |
5135 | 909k | |
5136 | 909k | // Iterate over all the extensions to try to promote them. |
5137 | 952k | for (auto I : Exts) { |
5138 | 952k | // Early check if we directly have ext(load). |
5139 | 952k | if (isa<LoadInst>(I->getOperand(0))) { |
5140 | 421k | ProfitablyMovedExts.push_back(I); |
5141 | 421k | continue; |
5142 | 421k | } |
5143 | 531k | |
5144 | 531k | // Check whether or not we want to do any promotion. The reason we have |
5145 | 531k | // this check inside the for loop is to catch the case where an extension |
5146 | 531k | // is directly fed by a load, because in that case the extension can be moved
5147 | 531k | // up without any promotion on its operands. |
5148 | 531k | if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
5149 | 19.2k | return false; |
5150 | 512k | |
5151 | 512k | // Get the action to perform the promotion. |
5152 | 512k | TypePromotionHelper::Action TPH = |
5153 | 512k | TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts); |
5154 | 512k | // Check if we can promote. |
5155 | 512k | if (!TPH) { |
5156 | 398k | // Save the current extension as we cannot move up through its operand. |
5157 | 398k | ProfitablyMovedExts.push_back(I); |
5158 | 398k | continue; |
5159 | 398k | } |
5160 | 113k | |
5161 | 113k | // Save the current state. |
5162 | 113k | TypePromotionTransaction::ConstRestorationPt LastKnownGood = |
5163 | 113k | TPT.getRestorationPoint(); |
5164 | 113k | SmallVector<Instruction *, 4> NewExts; |
5165 | 113k | unsigned NewCreatedInstsCost = 0; |
5166 | 113k | unsigned ExtCost = !TLI->isExtFree(I); |
5167 | 113k | // Promote. |
5168 | 113k | Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost, |
5169 | 113k | &NewExts, nullptr, *TLI); |
5170 | 113k | assert(PromotedVal && |
5171 | 113k | "TypePromotionHelper should have filtered out those cases"); |
5172 | 113k | |
5173 | 113k | // We would be able to merge only one extension into a load.
5174 | 113k | // Therefore, if we have more than 1 new extension we heuristically |
5175 | 113k | // cut this search path, because it means we degrade the code quality. |
5176 | 113k | // With exactly 2, the transformation is neutral, because we will merge |
5177 | 113k | // one extension but leave one. However, we optimistically keep going, |
5178 | 113k | // because the new extension may be removed too. |
5179 | 113k | long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost; |
5180 | 113k | // FIXME: It would be possible to propagate a negative value instead of |
5181 | 113k | // conservatively ceiling it to 0. |
5182 | 113k | TotalCreatedInstsCost = |
5183 | 113k | std::max((long long)0, (TotalCreatedInstsCost - ExtCost)); |
5184 | 113k | if (!StressExtLdPromotion && |
5185 | 113k | (TotalCreatedInstsCost > 1 ||
5186 | 113k | !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
5187 | 14.0k | // This promotion is not profitable, rollback to the previous state, and |
5188 | 14.0k | // save the current extension in ProfitablyMovedExts as the latest |
5189 | 14.0k | // speculative promotion turned out to be unprofitable. |
5190 | 14.0k | TPT.rollback(LastKnownGood); |
5191 | 14.0k | ProfitablyMovedExts.push_back(I); |
5192 | 14.0k | continue; |
5193 | 14.0k | } |
5194 | 99.5k | // Continue promoting NewExts as far as doing so is profitable. |
5195 | 99.5k | SmallVector<Instruction *, 2> NewlyMovedExts; |
5196 | 99.5k | (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost); |
5197 | 99.5k | bool NewPromoted = false; |
5198 | 145k | for (auto ExtInst : NewlyMovedExts) { |
5199 | 145k | Instruction *MovedExt = cast<Instruction>(ExtInst); |
5200 | 145k | Value *ExtOperand = MovedExt->getOperand(0); |
5201 | 145k | // If we have reached to a load, we need this extra profitability check |
5202 | 145k | // as it could potentially be merged into an ext(load). |
5203 | 145k | if (isa<LoadInst>(ExtOperand) && |
5204 | 145k | !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
5205 | 33.7k | (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))
5206 | 11.9k | continue; |
5207 | 133k | |
5208 | 133k | ProfitablyMovedExts.push_back(MovedExt); |
5209 | 133k | NewPromoted = true; |
5210 | 133k | } |
5211 | 99.5k | |
5212 | 99.5k | // If none of the speculative promotions for NewExts is profitable, rollback
5213 | 99.5k | // and save the current extension (I) as the last profitable extension. |
5214 | 99.5k | if (!NewPromoted) { |
5215 | 5.94k | TPT.rollback(LastKnownGood); |
5216 | 5.94k | ProfitablyMovedExts.push_back(I); |
5217 | 5.94k | continue; |
5218 | 5.94k | } |
5219 | 93.6k | // The promotion is profitable. |
5220 | 93.6k | Promoted = true; |
5221 | 93.6k | } |
5222 | 909k | return Promoted;
5223 | 909k | } |
5224 | | |
5225 | | /// Merge redundant sexts when one dominates the other.
5226 | 26.5k | bool CodeGenPrepare::mergeSExts(Function &F) { |
5227 | 26.5k | bool Changed = false; |
5228 | 48.0k | for (auto &Entry : ValToSExtendedUses) { |
5229 | 48.0k | SExts &Insts = Entry.second; |
5230 | 48.0k | SExts CurPts; |
5231 | 52.8k | for (Instruction *Inst : Insts) { |
5232 | 52.8k | if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) ||
5233 | 52.8k | Inst->getOperand(0) != Entry.first)
5234 | 49 | continue; |
5235 | 52.8k | bool inserted = false; |
5236 | 52.8k | for (auto &Pt : CurPts) { |
5237 | 13.4k | if (getDT(F).dominates(Inst, Pt)) { |
5238 | 571 | Pt->replaceAllUsesWith(Inst); |
5239 | 571 | RemovedInsts.insert(Pt); |
5240 | 571 | Pt->removeFromParent(); |
5241 | 571 | Pt = Inst; |
5242 | 571 | inserted = true; |
5243 | 571 | Changed = true; |
5244 | 571 | break; |
5245 | 571 | } |
5246 | 12.9k | if (!getDT(F).dominates(Pt, Inst)) |
5247 | 11.7k | // Give up if we need to merge in a common dominator as the |
5248 | 11.7k | // experiments show it is not profitable. |
5249 | 11.7k | continue; |
5250 | 1.14k | Inst->replaceAllUsesWith(Pt); |
5251 | 1.14k | RemovedInsts.insert(Inst); |
5252 | 1.14k | Inst->removeFromParent(); |
5253 | 1.14k | inserted = true; |
5254 | 1.14k | Changed = true; |
5255 | 1.14k | break; |
5256 | 1.14k | } |
5257 | 52.8k | if (!inserted) |
5258 | 51.1k | CurPts.push_back(Inst); |
5259 | 52.8k | } |
5260 | 48.0k | } |
5261 | 26.5k | return Changed; |
5262 | 26.5k | } |
5263 | | |
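A sketch of the merge performed below, assuming bb1 dominates bb2 (all names hypothetical):

    bb1:
      %s1 = sext i32 %v to i64
      ...
    bb2:
      %s2 = sext i32 %v to i64   ; redundant: replaced by %s1 and removed
    ; When neither sext dominates the other, the pair is left alone rather
    ; than hoisted to a common dominator.
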
5264 | | // Splitting large data structures so that the GEPs accessing them can have
5265 | | // smaller offsets so that they can be sunk to the same blocks as their users.
5266 | | // For example, a large struct starting from %base is split into two parts
5267 | | // where the second part starts from %new_base. |
5268 | | // |
5269 | | // Before: |
5270 | | // BB0: |
5271 | | // %base = |
5272 | | // |
5273 | | // BB1: |
5274 | | // %gep0 = gep %base, off0 |
5275 | | // %gep1 = gep %base, off1 |
5276 | | // %gep2 = gep %base, off2 |
5277 | | // |
5278 | | // BB2: |
5279 | | // %load1 = load i32, i32* %gep0
5280 | | // %load2 = load i32, i32* %gep1
5281 | | // %load3 = load i32, i32* %gep2
5282 | | // |
5283 | | // After: |
5284 | | // BB0: |
5285 | | // %base = |
5286 | | // %new_base = gep %base, off0 |
5287 | | // |
5288 | | // BB1: |
5289 | | // %new_gep0 = %new_base |
5290 | | // %new_gep1 = gep %new_base, off1 - off0 |
5291 | | // %new_gep2 = gep %new_base, off2 - off0 |
5292 | | // |
5293 | | // BB2: |
5294 | | // %load1 = load i32, i32* %new_gep0 |
5295 | | // %load2 = load i32, i32* %new_gep1 |
5296 | | // %load3 = load i32, i32* %new_gep2 |
5297 | | // |
5298 | | // %new_gep1 and %new_gep2 can now be sunk to BB2 after the splitting because
5299 | | // their offsets are small enough to fit into the addressing mode.
5300 | 4.26k | bool CodeGenPrepare::splitLargeGEPOffsets() { |
5301 | 4.26k | bool Changed = false; |
5302 | 11.4k | for (auto &Entry : LargeOffsetGEPMap) { |
5303 | 11.4k | Value *OldBase = Entry.first; |
5304 | 11.4k | SmallVectorImpl<std::pair<AssertingVH<GetElementPtrInst>, int64_t>> |
5305 | 11.4k | &LargeOffsetGEPs = Entry.second; |
5306 | 11.4k | auto compareGEPOffset = |
5307 | 11.4k | [&](const std::pair<GetElementPtrInst *, int64_t> &LHS, |
5308 | 27.6k | const std::pair<GetElementPtrInst *, int64_t> &RHS) { |
5309 | 27.6k | if (LHS.first == RHS.first) |
5310 | 7.05k | return false; |
5311 | 20.5k | if (LHS.second != RHS.second) |
5312 | 19.3k | return LHS.second < RHS.second; |
5313 | 1.17k | return LargeOffsetGEPID[LHS.first] < LargeOffsetGEPID[RHS.first]; |
5314 | 1.17k | }; |
5315 | 11.4k | // Sort all the GEPs of the same data structure based on the offsets.
5316 | 11.4k | llvm::sort(LargeOffsetGEPs, compareGEPOffset); |
5317 | 11.4k | LargeOffsetGEPs.erase( |
5318 | 11.4k | std::unique(LargeOffsetGEPs.begin(), LargeOffsetGEPs.end()), |
5319 | 11.4k | LargeOffsetGEPs.end()); |
5320 | 11.4k | // Skip if all the GEPs have the same offsets. |
5321 | 11.4k | if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second) |
5322 | 9.98k | continue; |
5323 | 1.45k | GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first; |
5324 | 1.45k | int64_t BaseOffset = LargeOffsetGEPs.begin()->second; |
5325 | 1.45k | Value *NewBaseGEP = nullptr; |
5326 | 1.45k | |
5327 | 1.45k | auto LargeOffsetGEP = LargeOffsetGEPs.begin(); |
5328 | 7.84k | while (LargeOffsetGEP != LargeOffsetGEPs.end()) { |
5329 | 6.38k | GetElementPtrInst *GEP = LargeOffsetGEP->first; |
5330 | 6.38k | int64_t Offset = LargeOffsetGEP->second; |
5331 | 6.38k | if (Offset != BaseOffset) { |
5332 | 4.85k | TargetLowering::AddrMode AddrMode; |
5333 | 4.85k | AddrMode.BaseOffs = Offset - BaseOffset; |
5334 | 4.85k | // The result type of the GEP might not be the type of the memory |
5335 | 4.85k | // access. |
5336 | 4.85k | if (!TLI->isLegalAddressingMode(*DL, AddrMode, |
5337 | 4.85k | GEP->getResultElementType(), |
5338 | 4.85k | GEP->getAddressSpace())) { |
5339 | 241 | // We need to create a new base if the offset to the current base is |
5340 | 241 | // too large to fit into the addressing mode. So, a very large struct |
5341 | 241 | // may be split into several parts.
5342 | 241 | BaseGEP = GEP; |
5343 | 241 | BaseOffset = Offset; |
5344 | 241 | NewBaseGEP = nullptr; |
5345 | 241 | } |
5346 | 4.85k | } |
5347 | 6.38k | |
5348 | 6.38k | // Generate a new GEP to replace the current one. |
5349 | 6.38k | LLVMContext &Ctx = GEP->getContext(); |
5350 | 6.38k | Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); |
5351 | 6.38k | Type *I8PtrTy = |
5352 | 6.38k | Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace()); |
5353 | 6.38k | Type *I8Ty = Type::getInt8Ty(Ctx); |
5354 | 6.38k | |
5355 | 6.38k | if (!NewBaseGEP) { |
5356 | 1.70k | // Create a new base if we don't have one yet. Find the insertion |
5357 | 1.70k | // point for the new base first.
5358 | 1.70k | BasicBlock::iterator NewBaseInsertPt; |
5359 | 1.70k | BasicBlock *NewBaseInsertBB; |
5360 | 1.70k | if (auto *BaseI = dyn_cast<Instruction>(OldBase)) { |
5361 | 1.06k | // If the base of the struct is an instruction, the new base will be |
5362 | 1.06k | // inserted close to it. |
5363 | 1.06k | NewBaseInsertBB = BaseI->getParent(); |
5364 | 1.06k | if (isa<PHINode>(BaseI)) |
5365 | 29 | NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt(); |
5366 | 1.03k | else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) { |
5367 | 3 | NewBaseInsertBB = |
5368 | 3 | SplitEdge(NewBaseInsertBB, Invoke->getNormalDest()); |
5369 | 3 | NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt(); |
5370 | 3 | } else |
5371 | 1.02k | NewBaseInsertPt = std::next(BaseI->getIterator()); |
5372 | 1.06k | } else { |
5373 | 639 | // If the current base is an argument or global value, the new base |
5374 | 639 | // will be inserted into the entry block.
5375 | 639 | NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock(); |
5376 | 639 | NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt(); |
5377 | 639 | } |
5378 | 1.70k | IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt); |
5379 | 1.70k | // Create a new base. |
5380 | 1.70k | Value *BaseIndex = ConstantInt::get(IntPtrTy, BaseOffset); |
5381 | 1.70k | NewBaseGEP = OldBase; |
5382 | 1.70k | if (NewBaseGEP->getType() != I8PtrTy) |
5383 | 1.64k | NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy); |
5384 | 1.70k | NewBaseGEP = |
5385 | 1.70k | NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep"); |
5386 | 1.70k | NewGEPBases.insert(NewBaseGEP); |
5387 | 1.70k | } |
5388 | 6.38k | |
5389 | 6.38k | IRBuilder<> Builder(GEP); |
5390 | 6.38k | Value *NewGEP = NewBaseGEP; |
5391 | 6.38k | if (Offset == BaseOffset) { |
5392 | 1.77k | if (GEP->getType() != I8PtrTy) |
5393 | 1.61k | NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType()); |
5394 | 4.61k | } else { |
5395 | 4.61k | // Calculate the new offset for the new GEP. |
5396 | 4.61k | Value *Index = ConstantInt::get(IntPtrTy, Offset - BaseOffset); |
5397 | 4.61k | NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index); |
5398 | 4.61k | |
5399 | 4.61k | if (GEP->getType() != I8PtrTy) |
5400 | 2.26k | NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType()); |
5401 | 4.61k | } |
5402 | 6.38k | GEP->replaceAllUsesWith(NewGEP); |
5403 | 6.38k | LargeOffsetGEPID.erase(GEP); |
5404 | 6.38k | LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP); |
5405 | 6.38k | GEP->eraseFromParent(); |
5406 | 6.38k | Changed = true; |
5407 | 6.38k | } |
5408 | 1.45k | } |
5409 | 4.26k | return Changed; |
5410 | 4.26k | } |
5411 | | |
5412 | | /// Return true if an ext(load) can be formed from an extension in
5413 | | /// \p MovedExts. |
5414 | | bool CodeGenPrepare::canFormExtLd( |
5415 | | const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI, |
5416 | 808k | Instruction *&Inst, bool HasPromoted) { |
5417 | 823k | for (auto *MovedExtInst : MovedExts) { |
5418 | 823k | if (isa<LoadInst>(MovedExtInst->getOperand(0))) { |
5419 | 407k | LI = cast<LoadInst>(MovedExtInst->getOperand(0)); |
5420 | 407k | Inst = MovedExtInst; |
5421 | 407k | break; |
5422 | 407k | } |
5423 | 823k | } |
5424 | 808k | if (!LI) |
5425 | 400k | return false; |
5426 | 407k | |
5427 | 407k | // If they're already in the same block, there's nothing to do. |
5428 | 407k | // Make the cheap checks first if we did not promote. |
5429 | 407k | // If we promoted, we need to check if it is indeed profitable. |
5430 | 407k | if (!HasPromoted && LI->getParent() == Inst->getParent())
5431 | 374k | return false; |
5432 | 33.2k | |
5433 | 33.2k | return TLI->isExtLoad(LI, Inst, *DL); |
5434 | 33.2k | } |
5435 | | |
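As an illustration of the check above (hypothetical types and blocks):

    bb0:
      %ld = load i16, i16* %p
      br label %bb1
    bb1:
      %z = zext i16 %ld to i32
    ; The load and the zext sit in different blocks, so if the target's
    ; isExtLoad hook accepts the pair, moving %z next to %ld lets isel
    ; select a single extending load.
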
5436 | | /// Move a zext or sext fed by a load into the same basic block as the load, |
5437 | | /// unless conditions are unfavorable. This allows SelectionDAG to fold the |
5438 | | /// extend into the load. |
5439 | | /// |
5440 | | /// E.g., |
5441 | | /// \code |
5442 | | /// %ld = load i32, i32* %addr
5443 | | /// %add = add nuw i32 %ld, 4 |
5444 | | /// %zext = zext i32 %add to i64 |
5445 | | /// \endcode
5446 | | /// => |
5447 | | /// \code |
5448 | | /// %ld = load i32, i32* %addr
5449 | | /// %zext = zext i32 %ld to i64 |
5450 | | /// %add = add nuw i64 %zext, 4 |
5451 | | /// \endcode
5452 | | /// Note that the promotion of %add to i64 is done in tryToPromoteExts(), which
5453 | | /// allows us to match zext(load i32*) to i64.
5454 | | /// |
5455 | | /// Also, try to promote the computations used to obtain a sign extended |
5456 | | /// value used in memory accesses.
5457 | | /// E.g., |
5458 | | /// \code |
5459 | | /// a = add nsw i32 b, 3 |
5460 | | /// d = sext i32 a to i64 |
5461 | | /// e = getelementptr ..., i64 d |
5462 | | /// \endcode |
5463 | | /// => |
5464 | | /// \code |
5465 | | /// f = sext i32 b to i64 |
5466 | | /// a = add nsw i64 f, 3 |
5467 | | /// e = getelementptr ..., i64 a |
5468 | | /// \endcode |
5469 | | /// |
5470 | | /// \p Inst[in/out] the extension may be modified during the process if some |
5471 | | /// promotions apply. |
5472 | 808k | bool CodeGenPrepare::optimizeExt(Instruction *&Inst) { |
5473 | 808k | // ExtLoad formation and address type promotion infrastructure requires TLI
5474 | 808k | // in order to be effective.
5475 | 808k | if (!TLI) |
5476 | 5 | return false; |
5477 | 808k | |
5478 | 808k | bool AllowPromotionWithoutCommonHeader = false; |
5479 | 808k | /// See if it is an interesting sext operation for the address type
5480 | 808k | /// promotion before trying to promote it, e.g., the ones with the right |
5481 | 808k | /// type and used in memory accesses. |
5482 | 808k | bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion( |
5483 | 808k | *Inst, AllowPromotionWithoutCommonHeader); |
5484 | 808k | TypePromotionTransaction TPT(RemovedInsts); |
5485 | 808k | TypePromotionTransaction::ConstRestorationPt LastKnownGood = |
5486 | 808k | TPT.getRestorationPoint(); |
5487 | 808k | SmallVector<Instruction *, 1> Exts; |
5488 | 808k | SmallVector<Instruction *, 2> SpeculativelyMovedExts; |
5489 | 808k | Exts.push_back(Inst); |
5490 | 808k | |
5491 | 808k | bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts); |
5492 | 808k | |
5493 | 808k | // Look for a load being extended. |
5494 | 808k | LoadInst *LI = nullptr; |
5495 | 808k | Instruction *ExtFedByLoad; |
5496 | 808k | |
5497 | 808k | // Try to promote a chain of computation if it allows to form an extended |
5498 | 808k | // load. |
5499 | 808k | if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) { |
5500 | 33.1k | assert(LI && ExtFedByLoad && "Expect a valid load and extension"); |
5501 | 33.1k | TPT.commit(); |
5502 | 33.1k | // Move the extend into the same block as the load |
5503 | 33.1k | ExtFedByLoad->moveAfter(LI); |
5504 | 33.1k | // CGP does not check if the zext would be speculatively executed when moved |
5505 | 33.1k | // to the same basic block as the load. Preserving its original location |
5506 | 33.1k | // would pessimize the debugging experience, as well as negatively impact |
5507 | 33.1k | // the quality of sample pgo. We don't want to use "line 0" as that has a |
5508 | 33.1k | // size cost in the line-table section and logically the zext can be seen as |
5509 | 33.1k | // part of the load. Therefore we conservatively reuse the same debug |
5510 | 33.1k | // location for the load and the zext. |
5511 | 33.1k | ExtFedByLoad->setDebugLoc(LI->getDebugLoc()); |
5512 | 33.1k | ++NumExtsMoved; |
5513 | 33.1k | Inst = ExtFedByLoad; |
5514 | 33.1k | return true; |
5515 | 33.1k | } |
5516 | 775k | |
5517 | 775k | // Continue promoting SExts if known as considerable depending on targets. |
5518 | 775k | if (ATPConsiderable && |
5519 | 775k | performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader, |
5520 | 102k | HasPromoted, TPT, SpeculativelyMovedExts)) |
5521 | 3.36k | return true; |
5522 | 771k | |
5523 | 771k | TPT.rollback(LastKnownGood); |
5524 | 771k | return false; |
5525 | 771k | } |
5526 | | |
5527 | | // Perform address type promotion if doing so is profitable. |
5528 | | // If AllowPromotionWithoutCommonHeader == false, we should find other sext |
5529 | | // instructions that sign extended the same initial value. However, if |
5530 | | // AllowPromotionWithoutCommonHeader == true, we expect promoting the |
5531 | | // extension is just profitable. |
5532 | | bool CodeGenPrepare::performAddressTypePromotion( |
5533 | | Instruction *&Inst, bool AllowPromotionWithoutCommonHeader, |
5534 | | bool HasPromoted, TypePromotionTransaction &TPT, |
5535 | 102k | SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) { |
5536 | 102k | bool Promoted = false; |
5537 | 102k | SmallPtrSet<Instruction *, 1> UnhandledExts; |
5538 | 102k | bool AllSeenFirst = true; |
5539 | 106k | for (auto I : SpeculativelyMovedExts) { |
5540 | 106k | Value *HeadOfChain = I->getOperand(0); |
5541 | 106k | DenseMap<Value *, Instruction *>::iterator AlreadySeen = |
5542 | 106k | SeenChainsForSExt.find(HeadOfChain); |
5543 | 106k | // If there is an unhandled SExt which has the same header, try to promote |
5544 | 106k | // it as well. |
5545 | 106k | if (AlreadySeen != SeenChainsForSExt.end()) { |
5546 | 4.88k | if (AlreadySeen->second != nullptr) |
5547 | 1.47k | UnhandledExts.insert(AlreadySeen->second); |
5548 | 4.88k | AllSeenFirst = false; |
5549 | 4.88k | } |
5550 | 106k | } |
5551 | 102k | |
5552 | 102k | if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&
5553 | 98.0k | SpeculativelyMovedExts.size() == 1)) {
5554 | 50.9k | TPT.commit(); |
5555 | 50.9k | if (HasPromoted) |
5556 | 3.25k | Promoted = true; |
5557 | 51.3k | for (auto I : SpeculativelyMovedExts) { |
5558 | 51.3k | Value *HeadOfChain = I->getOperand(0); |
5559 | 51.3k | SeenChainsForSExt[HeadOfChain] = nullptr; |
5560 | 51.3k | ValToSExtendedUses[HeadOfChain].push_back(I); |
5561 | 51.3k | } |
5562 | 50.9k | // Update Inst since promotion happened.
5563 | 50.9k | Inst = SpeculativelyMovedExts.pop_back_val(); |
5564 | 51.9k | } else { |
5565 | 51.9k | // This is the first chain visited from the header; keep the current chain
5566 | 51.9k | // as unhandled. Defer promoting it until we encounter another SExt
5567 | 51.9k | // chain derived from the same header. |
5568 | 55.3k | for (auto I : SpeculativelyMovedExts) { |
5569 | 55.3k | Value *HeadOfChain = I->getOperand(0); |
5570 | 55.3k | SeenChainsForSExt[HeadOfChain] = Inst; |
5571 | 55.3k | } |
5572 | 51.9k | return false; |
5573 | 51.9k | } |
5574 | 50.9k | |
5575 | 50.9k | if (!AllSeenFirst && !UnhandledExts.empty())
5576 | 1.47k | for (auto VisitedSExt : UnhandledExts) {
5577 | 1.47k | if (RemovedInsts.count(VisitedSExt)) |
5578 | 11 | continue; |
5579 | 1.45k | TypePromotionTransaction TPT(RemovedInsts); |
5580 | 1.45k | SmallVector<Instruction *, 1> Exts; |
5581 | 1.45k | SmallVector<Instruction *, 2> Chains; |
5582 | 1.45k | Exts.push_back(VisitedSExt); |
5583 | 1.45k | bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains); |
5584 | 1.45k | TPT.commit(); |
5585 | 1.45k | if (HasPromoted) |
5586 | 281 | Promoted = true; |
5587 | 1.53k | for (auto I : Chains) { |
5588 | 1.53k | Value *HeadOfChain = I->getOperand(0); |
5589 | 1.53k | // Mark this as handled. |
5590 | 1.53k | SeenChainsForSExt[HeadOfChain] = nullptr; |
5591 | 1.53k | ValToSExtendedUses[HeadOfChain].push_back(I); |
5592 | 1.53k | } |
5593 | 1.45k | } |
5594 | 50.9k | return Promoted; |
5595 | 50.9k | } |
5596 | | |
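A sketch of the common-header bookkeeping implemented above; the two chains share the header %b (names hypothetical):

    %a1 = add nsw i32 %b, 1
    %s1 = sext i32 %a1 to i64   ; first chain seen: recorded, deferred
    %a2 = add nsw i32 %b, 2
    %s2 = sext i32 %a2 to i64   ; same header seen again: promote both
    ; After promotion, a single 'sext i32 %b to i64' can feed both
    ; 64-bit adds, so the extra extension pays for itself.
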
5597 | 808k | bool CodeGenPrepare::optimizeExtUses(Instruction *I) { |
5598 | 808k | BasicBlock *DefBB = I->getParent(); |
5599 | 808k | |
5600 | 808k | // If the result of a {s|z}ext and its source are both live out, rewrite all |
5601 | 808k | // other uses of the source with the result of the extension.
5602 | 808k | Value *Src = I->getOperand(0); |
5603 | 808k | if (Src->hasOneUse()) |
5604 | 475k | return false; |
5605 | 332k | |
5606 | 332k | // Only do this xform if truncating is free. |
5607 | 332k | if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
5608 | 4.55k | return false; |
5609 | 328k | |
5610 | 328k | // Only safe to perform the optimization if the source is also defined in |
5611 | 328k | // this block. |
5612 | 328k | if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
5613 | 119k | return false; |
5614 | 208k | |
5615 | 208k | bool DefIsLiveOut = false; |
5616 | 222k | for (User *U : I->users()) { |
5617 | 222k | Instruction *UI = cast<Instruction>(U); |
5618 | 222k | |
5619 | 222k | // Figure out which BB this ext is used in. |
5620 | 222k | BasicBlock *UserBB = UI->getParent(); |
5621 | 222k | if (UserBB == DefBB) continue;
5622 | 107k | DefIsLiveOut = true; |
5623 | 107k | break; |
5624 | 107k | } |
5625 | 208k | if (!DefIsLiveOut) |
5626 | 101k | return false; |
5627 | 107k | |
5628 | 107k | // Make sure none of the uses are PHI nodes. |
5629 | 346k | for (User *U : Src->users()) {
5630 | 346k | Instruction *UI = cast<Instruction>(U); |
5631 | 346k | BasicBlock *UserBB = UI->getParent(); |
5632 | 346k | if (UserBB == DefBB) continue163k ; |
5633 | 183k | // Be conservative. We don't want this xform to end up introducing |
5634 | 183k | // reloads just before load / store instructions. |
5635 | 183k | if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI))
5636 | 11.7k | return false; |
5637 | 183k | } |
5638 | 107k | |
5639 | 107k | // InsertedTruncs - Only insert one trunc in each block.
5640 | 107k | DenseMap<BasicBlock*, Instruction*> InsertedTruncs; |
5641 | 95.6k | |
5642 | 95.6k | bool MadeChange = false; |
5643 | 157k | for (Use &U : Src->uses()) { |
5644 | 157k | Instruction *User = cast<Instruction>(U.getUser()); |
5645 | 157k | |
5646 | 157k | // Figure out which BB this ext is used in. |
5647 | 157k | BasicBlock *UserBB = User->getParent(); |
5648 | 157k | if (UserBB == DefBB) continue;
5649 | 54.7k | |
5650 | 54.7k | // Both src and def are live in this block. Rewrite the use. |
5651 | 54.7k | Instruction *&InsertedTrunc = InsertedTruncs[UserBB]; |
5652 | 54.7k | |
5653 | 54.7k | if (!InsertedTrunc) { |
5654 | 54.7k | BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); |
5655 | 54.7k | assert(InsertPt != UserBB->end()); |
5656 | 54.7k | InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt); |
5657 | 54.7k | InsertedInsts.insert(InsertedTrunc); |
5658 | 54.7k | } |
5659 | 54.7k | |
5660 | 54.7k | // Replace a use of the {s|z}ext source with a use of the result. |
5661 | 54.7k | U = InsertedTrunc; |
5662 | 54.7k | ++NumExtUses; |
5663 | 54.7k | MadeChange = true; |
5664 | 54.7k | } |
5665 | 95.6k | |
5666 | 95.6k | return MadeChange; |
5667 | 107k | } |
5668 | | |
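A sketch of the rewrite performed above, assuming the truncate is free on the target (names hypothetical):

    ; DefBB:
    %x = add i32 %a, %b
    %z = zext i32 %x to i64      ; %x and %z are both live out
    ; UseBB, before:
    %u = mul i32 %x, 3
    ; UseBB, after: the use of %x is rewritten through the extension.
    %t = trunc i64 %z to i32
    %u = mul i32 %t, 3
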
5669 | | // Find loads whose uses only use some of the loaded value's bits. Add an "and" |
5670 | | // just after the load if the target can fold this into one extload instruction, |
5671 | | // with the hope of eliminating some of the other later "and" instructions using |
5672 | | // the loaded value. "and"s that are made trivially redundant by the insertion |
5673 | | // of the new "and" are removed by this function, while others (e.g. those whose |
5674 | | // path from the load goes through a phi) are left for isel to potentially |
5675 | | // remove. |
5676 | | // |
5677 | | // For example: |
5678 | | // |
5679 | | // b0: |
5680 | | // x = load i32 |
5681 | | // ... |
5682 | | // b1: |
5683 | | // y = and x, 0xff |
5684 | | // z = use y |
5685 | | // |
5686 | | // becomes: |
5687 | | // |
5688 | | // b0: |
5689 | | // x = load i32 |
5690 | | // x' = and x, 0xff |
5691 | | // ... |
5692 | | // b1: |
5693 | | // z = use x' |
5694 | | // |
5695 | | // whereas: |
5696 | | // |
5697 | | // b0: |
5698 | | // x1 = load i32 |
5699 | | // ... |
5700 | | // b1: |
5701 | | // x2 = load i32 |
5702 | | // ... |
5703 | | // b2: |
5704 | | // x = phi x1, x2 |
5705 | | // y = and x, 0xff |
5706 | | // |
5707 | | // becomes (after a call to optimizeLoadExt for each load): |
5708 | | // |
5709 | | // b0: |
5710 | | // x1 = load i32 |
5711 | | // x1' = and x1, 0xff |
5712 | | // ... |
5713 | | // b1: |
5714 | | // x2 = load i32 |
5715 | | // x2' = and x2, 0xff |
5716 | | // ... |
5717 | | // b2: |
5718 | | // x = phi x1', x2' |
5719 | | // y = and x, 0xff |
5720 | 3.66M | bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { |
5721 | 3.66M | if (!Load->isSimple() || !Load->getType()->isIntOrPtrTy())
5722 | 304k | return false; |
5723 | 3.35M | |
5724 | 3.35M | // Skip loads we've already transformed. |
5725 | 3.35M | if (Load->hasOneUse() && |
5726 | 3.35M | InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
5727 | 254 | return false; |
5728 | 3.35M | |
5729 | 3.35M | // Look at all uses of Load, looking through phis, to determine how many bits |
5730 | 3.35M | // of the loaded value are needed. |
5731 | 3.35M | SmallVector<Instruction *, 8> WorkList; |
5732 | 3.35M | SmallPtrSet<Instruction *, 16> Visited; |
5733 | 3.35M | SmallVector<Instruction *, 8> AndsToMaybeRemove; |
5734 | 3.35M | for (auto *U : Load->users()) |
5735 | 5.44M | WorkList.push_back(cast<Instruction>(U)); |
5736 | 3.35M | |
5737 | 3.35M | EVT LoadResultVT = TLI->getValueType(*DL, Load->getType()); |
5738 | 3.35M | unsigned BitWidth = LoadResultVT.getSizeInBits(); |
5739 | 3.35M | APInt DemandBits(BitWidth, 0); |
5740 | 3.35M | APInt WidestAndBits(BitWidth, 0); |
5741 | 3.35M | |
5742 | 3.68M | while (!WorkList.empty()) { |
5743 | 3.62M | Instruction *I = WorkList.back(); |
5744 | 3.62M | WorkList.pop_back(); |
5745 | 3.62M | |
5746 | 3.62M | // Break use-def graph loops. |
5747 | 3.62M | if (!Visited.insert(I).second) |
5748 | 17.9k | continue; |
5749 | 3.60M | |
5750 | 3.60M | // For a PHI node, push all of its users. |
5751 | 3.60M | if (auto *Phi = dyn_cast<PHINode>(I)) { |
5752 | 195k | for (auto *U : Phi->users()) |
5753 | 385k | WorkList.push_back(cast<Instruction>(U)); |
5754 | 195k | continue; |
5755 | 195k | } |
5756 | 3.41M | |
5757 | 3.41M | switch (I->getOpcode()) { |
5758 | 3.41M | case Instruction::And: { |
5759 | 52.6k | auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1)); |
5760 | 52.6k | if (!AndC) |
5761 | 10.4k | return false; |
5762 | 42.1k | APInt AndBits = AndC->getValue(); |
5763 | 42.1k | DemandBits |= AndBits; |
5764 | 42.1k | // Keep track of the widest and mask we see. |
5765 | 42.1k | if (AndBits.ugt(WidestAndBits)) |
5766 | 40.8k | WidestAndBits = AndBits; |
5767 | 42.1k | if (AndBits == WidestAndBits && I->getOperand(0) == Load)
5768 | 39.5k | AndsToMaybeRemove.push_back(I); |
5769 | 42.1k | break; |
5770 | 42.1k | } |
5771 | 42.1k | |
5772 | 42.1k | case Instruction::Shl: { |
5773 | 16.1k | auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1)); |
5774 | 16.1k | if (!ShlC) |
5775 | 1.13k | return false; |
5776 | 14.9k | uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1); |
5777 | 14.9k | DemandBits.setLowBits(BitWidth - ShiftAmt); |
5778 | 14.9k | break; |
5779 | 14.9k | } |
5780 | 14.9k | |
5781 | 59.2k | case Instruction::Trunc: { |
5782 | 59.2k | EVT TruncVT = TLI->getValueType(*DL, I->getType()); |
5783 | 59.2k | unsigned TruncBitWidth = TruncVT.getSizeInBits(); |
5784 | 59.2k | DemandBits.setLowBits(TruncBitWidth); |
5785 | 59.2k | break; |
5786 | 14.9k | } |
5787 | 14.9k | |
5788 | 3.28M | default: |
5789 | 3.28M | return false; |
5790 | 3.41M | } |
5791 | 3.41M | } |
5792 | 3.35M | |
5793 | 3.35M | uint32_t ActiveBits = DemandBits.getActiveBits(); |
5794 | 57.9k | // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the |
5795 | 57.9k | // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example, |
5796 | 57.9k | // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but |
5797 | 57.9k | // (and (load x) 1) is not matched as a single instruction, rather as a LDR |
5798 | 57.9k | // followed by an AND. |
5799 | 57.9k | // TODO: Look into removing this restriction by fixing backends to either |
5800 | 57.9k | // return false for isLoadExtLegal for i1 or have them select this pattern to |
5801 | 57.9k | // a single instruction. |
5802 | 57.9k | // |
5803 | 57.9k | // Also avoid hoisting if we didn't see any ands with the exact DemandBits |
5804 | 57.9k | // mask, since these are the only ands that will be removed by isel. |
5805 | 57.9k | if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||
5806 | 57.9k | WidestAndBits != DemandBits)
5807 | 53.6k | return false; |
5808 | 4.30k | |
5809 | 4.30k | LLVMContext &Ctx = Load->getType()->getContext(); |
5810 | 4.30k | Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits); |
5811 | 4.30k | EVT TruncVT = TLI->getValueType(*DL, TruncTy); |
5812 | 4.30k | |
5813 | 4.30k | // Reject cases that won't be matched as extloads. |
5814 | 4.30k | if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
5815 | 4.30k | !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
5816 | 2.70k | return false; |
5817 | 1.60k | |
5818 | 1.60k | IRBuilder<> Builder(Load->getNextNode()); |
5819 | 1.60k | auto *NewAnd = dyn_cast<Instruction>( |
5820 | 1.60k | Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); |
5821 | 1.60k | // Mark this instruction as "inserted by CGP", so that other |
5822 | 1.60k | // optimizations don't touch it. |
5823 | 1.60k | InsertedInsts.insert(NewAnd); |
5824 | 1.60k | |
5825 | 1.60k | // Replace all uses of load with new and (except for the use of load in the |
5826 | 1.60k | // new and itself). |
5827 | 1.60k | Load->replaceAllUsesWith(NewAnd); |
5828 | 1.60k | NewAnd->setOperand(0, Load); |
5829 | 1.60k | |
5830 | 1.60k | // Remove any and instructions that are now redundant. |
5831 | 1.60k | for (auto *And : AndsToMaybeRemove) |
5832 | 1.58k | // Check that the and mask is the same as the one we decided to put on the |
5833 | 1.58k | // new and. |
5834 | 1.58k | if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) { |
5835 | 1.58k | And->replaceAllUsesWith(NewAnd); |
5836 | 1.58k | if (&*CurInstIterator == And) |
5837 | 1.49k | CurInstIterator = std::next(And->getIterator()); |
5838 | 1.58k | And->eraseFromParent(); |
5839 | 1.58k | ++NumAndUses; |
5840 | 1.58k | } |
5841 | 1.60k | |
5842 | 1.60k | ++NumAndsAdded; |
5843 | 1.60k | return true; |
5844 | 1.60k | } |
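 | | // Illustrative IR sketch of the transform above (names and constants are
 | | // hypothetical, not from a test case). Given a load whose users only demand
 | | // the low 8 bits:
 | | //   %v = load i32, i32* %p
 | | //   %m = and i32 %v, 255
 | | // the pass inserts a mask with the demanded bits right after the load and
 | | // redirects the users, so isel can match a zero-extending narrow load:
 | | //   %v = load i32, i32* %p
 | | //   %v.masked = and i32 %v, 255   ; inserted by CGP
 | | // and the original 'and' (whose mask equals DemandBits) is erased.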
5845 | | |
5846 | | /// Check if V (an operand of a select instruction) is an expensive instruction |
5847 | | /// that is only used once. |
5848 | 33.6k | static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) { |
5849 | 33.6k | auto *I = dyn_cast<Instruction>(V); |
5850 | 33.6k | // If it's safe to speculatively execute, then it should not have side |
5851 | 33.6k | // effects; therefore, it's safe to sink and possibly *not* execute. |
5852 | 33.6k | return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
5853 | 33.6k | TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive;
5854 | 33.6k | } |
5855 | | |
5856 | | /// Returns true if a SelectInst should be turned into an explicit branch. |
5857 | | static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI, |
5858 | | const TargetLowering *TLI, |
5859 | 268k | SelectInst *SI) { |
5860 | 268k | // If even a predictable select is cheap, then a branch can't be cheaper. |
5861 | 268k | if (!TLI->isPredictableSelectExpensive()) |
5862 | 245k | return false; |
5863 | 23.3k | |
5864 | 23.3k | // FIXME: This should use the same heuristics as IfConversion to determine |
5865 | 23.3k | // whether a select is better represented as a branch. |
5866 | 23.3k | |
5867 | 23.3k | // If metadata tells us that the select condition is obviously predictable, |
5868 | 23.3k | // then we want to replace the select with a branch. |
5869 | 23.3k | uint64_t TrueWeight, FalseWeight; |
5870 | 23.3k | if (SI->extractProfMetadata(TrueWeight, FalseWeight)) { |
5871 | 15 | uint64_t Max = std::max(TrueWeight, FalseWeight); |
5872 | 15 | uint64_t Sum = TrueWeight + FalseWeight; |
5873 | 15 | if (Sum != 0) { |
5874 | 14 | auto Probability = BranchProbability::getBranchProbability(Max, Sum); |
5875 | 14 | if (Probability > TLI->getPredictableBranchThreshold()) |
5876 | 9 | return true; |
5877 | 23.3k | } |
5878 | 15 | } |
5879 | 23.3k | |
5880 | 23.3k | CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition()); |
5881 | 23.3k | |
5882 | 23.3k | // If a branch is predictable, an out-of-order CPU can avoid blocking on its |
5883 | 23.3k | // comparison condition. If the compare has more than one use, there's |
5884 | 23.3k | // probably another cmov or setcc around, so it's not worth emitting a branch. |
5885 | 23.3k | if (!Cmp || !Cmp->hasOneUse())
5886 | 6.69k | return false; |
5887 | 16.6k | |
5888 | 16.6k | // If either operand of the select is expensive and only needed on one side |
5889 | 16.6k | // of the select, we should form a branch. |
5890 | 16.6k | if (sinkSelectOperand(TTI, SI->getTrueValue()) || |
5891 | 16.6k | sinkSelectOperand(TTI, SI->getFalseValue()))
5892 | 29 | return true; |
5893 | 16.6k | |
5894 | 16.6k | return false; |
5895 | 16.6k | } |
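 | | // For illustration, a select this heuristic converts on a target where even
 | | // predictable selects are expensive (the weights are hypothetical):
 | | //   %sel = select i1 %cmp, i32 %x, i32 %y, !prof !0
 | | //   !0 = !{!"branch_weights", i32 2000, i32 1}
 | | // Max/Sum = 2000/2001 exceeds a typical predictable-branch threshold, so
 | | // the select is considered profitable to expand into a branch.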
5896 | | |
5897 | | /// If \p isTrue is true, return the true value of \p SI, otherwise return the
5898 | | /// false value of \p SI. If the true/false value of \p SI is defined by any
5899 | | /// select instructions in \p Selects, look through the defining select |
5900 | | /// instruction until the true/false value is not defined in \p Selects. |
5901 | | static Value *getTrueOrFalseValue( |
5902 | | SelectInst *SI, bool isTrue, |
5903 | 302 | const SmallPtrSet<const Instruction *, 2> &Selects) { |
5904 | 302 | Value *V = nullptr; |
5905 | 302 | |
5906 | 607 | for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
5907 | 305 | DefSI = dyn_cast<SelectInst>(V)) { |
5908 | 305 | assert(DefSI->getCondition() == SI->getCondition() && |
5909 | 305 | "The condition of DefSI does not match with SI"); |
5910 | 305 | V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
5911 | 305 | } |
5912 | 302 | |
5913 | 302 | assert(V && "Failed to get select true/false value"); |
5914 | 302 | return V; |
5915 | 302 | } |
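 | | // Example of looking through a select chain (hypothetical names): with
 | | // Selects = {%s1, %s2} and
 | | //   %s1 = select i1 %c, i32 %a, i32 %b
 | | //   %s2 = select i1 %c, i32 %s1, i32 %d
 | | // getTrueOrFalseValue(%s2, /*isTrue=*/true, Selects) looks through %s1 and
 | | // returns %a; the corresponding false value of %s2 is %d.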
5916 | | |
5917 | 485k | bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) { |
5918 | 485k | assert(Shift->isShift() && "Expected a shift"); |
5919 | 485k | |
5920 | 485k | // If this is (1) a vector shift, (2) shifts by scalars are cheaper than |
5921 | 485k | // general vector shifts, and (3) the shift amount is a select-of-splatted |
5922 | 485k | // values, hoist the shifts before the select: |
5923 | 485k | // shift Op0, (select Cond, TVal, FVal) --> |
5924 | 485k | // select Cond, (shift Op0, TVal), (shift Op0, FVal) |
5925 | 485k | // |
5926 | 485k | // This is inverting a generic IR transform when we know that the cost of a |
5927 | 485k | // general vector shift is more than the cost of 2 shift-by-scalars. |
5928 | 485k | // We can't do this effectively in SDAG because we may not be able to |
5929 | 485k | // determine if the select operands are splats from within a basic block. |
5930 | 485k | Type *Ty = Shift->getType(); |
5931 | 485k | if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
5932 | 481k | return false; |
5933 | 3.61k | Value *Cond, *TVal, *FVal; |
5934 | 3.61k | if (!match(Shift->getOperand(1), |
5935 | 3.61k | m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal))))) |
5936 | 3.59k | return false; |
5937 | 18 | if (!isSplatValue(TVal) || !isSplatValue(FVal)) |
5938 | 0 | return false; |
5939 | 18 | |
5940 | 18 | IRBuilder<> Builder(Shift); |
5941 | 18 | BinaryOperator::BinaryOps Opcode = Shift->getOpcode(); |
5942 | 18 | Value *NewTVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), TVal); |
5943 | 18 | Value *NewFVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), FVal); |
5944 | 18 | Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal); |
5945 | 18 | Shift->replaceAllUsesWith(NewSel); |
5946 | 18 | Shift->eraseFromParent(); |
5947 | 18 | return true; |
5948 | 18 | } |
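 | | // Illustrative IR for the hoist above (hypothetical values), on a target
 | | // where shift-by-scalar is cheaper than a general vector shift:
 | | //   %amt = select i1 %cond, <4 x i32> <i32 2, i32 2, i32 2, i32 2>,
 | | //                           <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 | | //   %r = shl <4 x i32> %x, %amt
 | | // becomes
 | | //   %t = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
 | | //   %f = shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
 | | //   %r = select i1 %cond, <4 x i32> %t, <4 x i32> %f
 | | // so codegen sees two splat shift amounts instead of one variable amount.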
5949 | | |
5950 | | /// If we have a SelectInst that will likely profit from branch prediction, |
5951 | | /// turn it into a branch. |
5952 | 303k | bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { |
5953 | 303k | // If branch conversion isn't desirable, exit early. |
5954 | 303k | if (DisableSelectToBranch || OptSize || !TLI)
5955 | 1.60k | return false; |
5956 | 301k | |
5957 | 301k | // Find all consecutive select instructions that share the same condition. |
5958 | 301k | SmallVector<SelectInst *, 2> ASI; |
5959 | 301k | ASI.push_back(SI); |
5960 | 301k | for (BasicBlock::iterator It = ++BasicBlock::iterator(SI); |
5961 | 322k | It != SI->getParent()->end(); ++It) {
5962 | 322k | SelectInst *I = dyn_cast<SelectInst>(&*It); |
5963 | 322k | if (I && SI->getCondition() == I->getCondition()) {
5964 | 21.3k | ASI.push_back(I); |
5965 | 301k | } else { |
5966 | 301k | break; |
5967 | 301k | } |
5968 | 322k | } |
5969 | 301k | |
5970 | 301k | SelectInst *LastSI = ASI.back(); |
5971 | 301k | // Increment the current iterator to skip all the rest of select instructions |
5972 | 301k | // because they will be either "not lowered" or "all lowered" to branch. |
5973 | 301k | CurInstIterator = std::next(LastSI->getIterator()); |
5974 | 301k | |
5975 | 301k | bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); |
5976 | 301k | |
5977 | 301k | // Can we convert the 'select' to CF?
5978 | 301k | if (VectorCond || SI->getMetadata(LLVMContext::MD_unpredictable))
5979 | 32.5k | return false; |
5980 | 268k | |
5981 | 268k | TargetLowering::SelectSupportKind SelectKind; |
5982 | 268k | if (VectorCond) |
5983 | 0 | SelectKind = TargetLowering::VectorMaskSelect; |
5984 | 268k | else if (SI->getType()->isVectorTy()) |
5985 | 840 | SelectKind = TargetLowering::ScalarCondVectorVal; |
5986 | 268k | else |
5987 | 268k | SelectKind = TargetLowering::ScalarValSelect; |
5988 | 268k | |
5989 | 268k | if (TLI->isSelectSupported(SelectKind) && |
5990 | 268k | !isFormingBranchFromSelectProfitable(TTI, TLI, SI))
5991 | 268k | return false; |
5992 | 149 | |
5993 | 149 | // The DominatorTree needs to be rebuilt by any consumers after this |
5994 | 149 | // transformation. We simply reset here rather than setting the ModifiedDT |
5995 | 149 | // flag to avoid restarting the function walk in runOnFunction for each |
5996 | 149 | // select optimized. |
5997 | 149 | DT.reset(); |
5998 | 149 | |
5999 | 149 | // Transform a sequence like this: |
6000 | 149 | // start: |
6001 | 149 | // %cmp = cmp uge i32 %a, %b |
6002 | 149 | // %sel = select i1 %cmp, i32 %c, i32 %d |
6003 | 149 | // |
6004 | 149 | // Into: |
6005 | 149 | // start: |
6006 | 149 | // %cmp = cmp uge i32 %a, %b |
6007 | 149 | // br i1 %cmp, label %select.true, label %select.false |
6008 | 149 | // select.true: |
6009 | 149 | // br label %select.end |
6010 | 149 | // select.false: |
6011 | 149 | // br label %select.end |
6012 | 149 | // select.end: |
6013 | 149 | // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] |
6014 | 149 | // |
6015 | 149 | // In addition, we may sink instructions that produce %c or %d from |
6016 | 149 | // the entry block into the destination(s) of the new branch. |
6017 | 149 | // If the true or false blocks do not contain a sunken instruction, that |
6018 | 149 | // block and its branch may be optimized away. In that case, one side of the |
6019 | 149 | // first branch will point directly to select.end, and the corresponding PHI |
6020 | 149 | // predecessor block will be the start block. |
6021 | 149 | |
6022 | 149 | // First, we split the block containing the select into 2 blocks. |
6023 | 149 | BasicBlock *StartBlock = SI->getParent(); |
6024 | 149 | BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI)); |
6025 | 149 | BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); |
6026 | 149 | |
6027 | 149 | // Delete the unconditional branch that was just created by the split. |
6028 | 149 | StartBlock->getTerminator()->eraseFromParent(); |
6029 | 149 | |
6030 | 149 | // These are the new basic blocks for the conditional branch. |
6031 | 149 | // At least one will become an actual new basic block. |
6032 | 149 | BasicBlock *TrueBlock = nullptr; |
6033 | 149 | BasicBlock *FalseBlock = nullptr; |
6034 | 149 | BranchInst *TrueBranch = nullptr; |
6035 | 149 | BranchInst *FalseBranch = nullptr; |
6036 | 149 | |
6037 | 149 | // Sink expensive instructions into the conditional blocks to avoid executing |
6038 | 149 | // them speculatively. |
6039 | 151 | for (SelectInst *SI : ASI) { |
6040 | 151 | if (sinkSelectOperand(TTI, SI->getTrueValue())) { |
6041 | 26 | if (TrueBlock == nullptr) { |
6042 | 26 | TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink", |
6043 | 26 | EndBlock->getParent(), EndBlock); |
6044 | 26 | TrueBranch = BranchInst::Create(EndBlock, TrueBlock); |
6045 | 26 | TrueBranch->setDebugLoc(SI->getDebugLoc()); |
6046 | 26 | } |
6047 | 26 | auto *TrueInst = cast<Instruction>(SI->getTrueValue()); |
6048 | 26 | TrueInst->moveBefore(TrueBranch); |
6049 | 26 | } |
6050 | 151 | if (sinkSelectOperand(TTI, SI->getFalseValue())) { |
6051 | 5 | if (FalseBlock == nullptr) { |
6052 | 5 | FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink", |
6053 | 5 | EndBlock->getParent(), EndBlock); |
6054 | 5 | FalseBranch = BranchInst::Create(EndBlock, FalseBlock); |
6055 | 5 | FalseBranch->setDebugLoc(SI->getDebugLoc()); |
6056 | 5 | } |
6057 | 5 | auto *FalseInst = cast<Instruction>(SI->getFalseValue()); |
6058 | 5 | FalseInst->moveBefore(FalseBranch); |
6059 | 5 | } |
6060 | 151 | } |
6061 | 149 | |
6062 | 149 | // If there was nothing to sink, then arbitrarily choose the 'false' side |
6063 | 149 | // for a new input value to the PHI. |
6064 | 149 | if (TrueBlock == FalseBlock) { |
6065 | 120 | assert(TrueBlock == nullptr && |
6066 | 120 | "Unexpected basic block transform while optimizing select"); |
6067 | 120 | |
6068 | 120 | FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", |
6069 | 120 | EndBlock->getParent(), EndBlock); |
6070 | 120 | auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); |
6071 | 120 | FalseBranch->setDebugLoc(SI->getDebugLoc()); |
6072 | 120 | } |
6073 | 149 | |
6074 | 149 | // Insert the real conditional branch based on the original condition. |
6075 | 149 | // If we did not create a new block for one of the 'true' or 'false' paths |
6076 | 149 | // of the condition, it means that side of the branch goes to the end block |
6077 | 149 | // directly and the path originates from the start block from the point of |
6078 | 149 | // view of the new PHI. |
6079 | 149 | BasicBlock *TT, *FT; |
6080 | 149 | if (TrueBlock == nullptr) { |
6081 | 123 | TT = EndBlock; |
6082 | 123 | FT = FalseBlock; |
6083 | 123 | TrueBlock = StartBlock; |
6084 | 149 | } else if (FalseBlock == nullptr) {
6085 | 24 | TT = TrueBlock; |
6086 | 24 | FT = EndBlock; |
6087 | 24 | FalseBlock = StartBlock; |
6088 | 24 | } else { |
6089 | 2 | TT = TrueBlock; |
6090 | 2 | FT = FalseBlock; |
6091 | 2 | } |
6092 | 149 | IRBuilder<>(SI).CreateCondBr(SI->getCondition(), TT, FT, SI); |
6093 | 149 | |
6094 | 149 | SmallPtrSet<const Instruction *, 2> INS; |
6095 | 149 | INS.insert(ASI.begin(), ASI.end()); |
6096 | 149 | // Use a reverse iterator because a later select may use the value of an
6097 | 149 | // earlier select, and we need to propagate the value through the earlier
6098 | 149 | // select to get the PHI operand.
6099 | 300 | for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
6100 | 151 | SelectInst *SI = *It; |
6101 | 151 | // The select itself is replaced with a PHI Node. |
6102 | 151 | PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front()); |
6103 | 151 | PN->takeName(SI); |
6104 | 151 | PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock); |
6105 | 151 | PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock); |
6106 | 151 | PN->setDebugLoc(SI->getDebugLoc()); |
6107 | 151 | |
6108 | 151 | SI->replaceAllUsesWith(PN); |
6109 | 151 | SI->eraseFromParent(); |
6110 | 151 | INS.erase(SI); |
6111 | 151 | ++NumSelectsExpanded; |
6112 | 151 | } |
6113 | 149 | |
6114 | 149 | // Instruct OptimizeBlock to skip to the next block. |
6115 | 149 | CurInstIterator = StartBlock->end(); |
6116 | 149 | return true; |
6117 | 149 | } |
6118 | | |
6119 | 23.4k | static bool isBroadcastShuffle(ShuffleVectorInst *SVI) { |
6120 | 23.4k | SmallVector<int, 16> Mask(SVI->getShuffleMask()); |
6121 | 23.4k | int SplatElem = -1; |
6122 | 90.5k | for (unsigned i = 0; i < Mask.size(); ++i) {
6123 | 85.6k | if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem)
6124 | 18.6k | return false; |
6125 | 67.0k | SplatElem = Mask[i]; |
6126 | 67.0k | } |
6127 | 23.4k | |
6128 | 23.4k | return true;
6129 | 23.4k | } |
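 | | // Example masks (hypothetical): <0, 0, 0, 0> and <1, -1, 1, -1> are treated
 | | // as broadcasts (-1 is an undef lane), while <0, 1, 0, 1> is rejected
 | | // because two different source lanes appear.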
6130 | | |
6131 | | /// Some targets have expensive vector shifts if the lanes aren't all the same |
6132 | | /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases |
6133 | | /// it's often worth sinking a shufflevector splat down to its use so that |
6134 | | /// codegen can spot all lanes are identical. |
6135 | 230k | bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) { |
6136 | 230k | BasicBlock *DefBB = SVI->getParent(); |
6137 | 230k | |
6138 | 230k | // Only do this xform if variable vector shifts are particularly expensive. |
6139 | 230k | if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType())) |
6140 | 207k | return false; |
6141 | 23.4k | |
6142 | 23.4k | // We only expect better codegen by sinking a shuffle if we can recognise a |
6143 | 23.4k | // constant splat. |
6144 | 23.4k | if (!isBroadcastShuffle(SVI)) |
6145 | 18.6k | return false; |
6146 | 4.84k | |
6147 | 4.84k | // InsertedShuffles - Only insert a shuffle in each block once. |
6148 | 4.84k | DenseMap<BasicBlock*, Instruction*> InsertedShuffles; |
6149 | 4.84k | |
6150 | 4.84k | bool MadeChange = false; |
6151 | 5.72k | for (User *U : SVI->users()) { |
6152 | 5.72k | Instruction *UI = cast<Instruction>(U); |
6153 | 5.72k | |
6154 | 5.72k | // Figure out which BB this shuffle is used in.
6155 | 5.72k | BasicBlock *UserBB = UI->getParent(); |
6156 | 5.72k | if (UserBB == DefBB) continue;
6157 | 327 | |
6158 | 327 | // For now only apply this when the splat is used by a shift instruction. |
6159 | 327 | if (!UI->isShift()) continue;
6160 | 32 | |
6161 | 32 | // Everything checks out, sink the shuffle if the user's block doesn't |
6162 | 32 | // already have a copy. |
6163 | 32 | Instruction *&InsertedShuffle = InsertedShuffles[UserBB]; |
6164 | 32 | |
6165 | 32 | if (!InsertedShuffle) { |
6166 | 32 | BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); |
6167 | 32 | assert(InsertPt != UserBB->end()); |
6168 | 32 | InsertedShuffle = |
6169 | 32 | new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1), |
6170 | 32 | SVI->getOperand(2), "", &*InsertPt); |
6171 | 32 | InsertedShuffle->setDebugLoc(SVI->getDebugLoc()); |
6172 | 32 | } |
6173 | 32 | |
6174 | 32 | UI->replaceUsesOfWith(SVI, InsertedShuffle); |
6175 | 32 | MadeChange = true; |
6176 | 32 | } |
6177 | 4.84k | |
6178 | 4.84k | // If we removed all uses, nuke the shuffle. |
6179 | 4.84k | if (SVI->use_empty()) { |
6180 | 6 | SVI->eraseFromParent(); |
6181 | 6 | MadeChange = true; |
6182 | 6 | } |
6183 | 4.84k | |
6184 | 4.84k | return MadeChange; |
6185 | 4.84k | } |
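 | | // Sketch of the sinking (hypothetical blocks): a splat defined in %entry
 | | // but shifted in %loop,
 | | //   entry:
 | | //     %splat = shufflevector <4 x i32> %v, <4 x i32> undef,
 | | //                            <4 x i32> zeroinitializer
 | | //   loop:
 | | //     %r = shl <4 x i32> %x, %splat
 | | // gets a copy of the shuffle inserted in %loop, so instruction selection
 | | // sees the splat and the shift together and can use a scalar shift amount.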
6186 | | |
6187 | 18.4M | bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) { |
6188 | 18.4M | // If the operands of I can be folded into a target instruction together with |
6189 | 18.4M | // I, duplicate and sink them. |
6190 | 18.4M | SmallVector<Use *, 4> OpsToSink; |
6191 | 18.4M | if (!TLI || !TLI->shouldSinkOperands(I, OpsToSink))
6192 | 18.4M | return false; |
6193 | 2.42k | |
6194 | 2.42k | // OpsToSink can contain multiple uses in a use chain (e.g. |
6195 | 2.42k | // (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating |
6196 | 2.42k | // uses must come first, which means they are sunk first, temporarily creating |
6197 | 2.42k | // invalid IR. This will be fixed once their dominated users are sunk and |
6198 | 2.42k | // updated. |
6199 | 2.42k | BasicBlock *TargetBB = I->getParent(); |
6200 | 2.42k | bool Changed = false; |
6201 | 2.42k | SmallVector<Use *, 4> ToReplace; |
6202 | 4.83k | for (Use *U : OpsToSink) { |
6203 | 4.83k | auto *UI = cast<Instruction>(U->get()); |
6204 | 4.83k | if (UI->getParent() == TargetBB || isa<PHINode>(UI))
6205 | 4.81k | continue; |
6206 | 18 | ToReplace.push_back(U); |
6207 | 18 | } |
6208 | 2.42k | |
6209 | 2.42k | SmallPtrSet<Instruction *, 4> MaybeDead; |
6210 | 2.42k | for (Use *U : ToReplace) { |
6211 | 18 | auto *UI = cast<Instruction>(U->get()); |
6212 | 18 | Instruction *NI = UI->clone(); |
6213 | 18 | MaybeDead.insert(UI); |
6214 | 18 | LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n"); |
6215 | 18 | NI->insertBefore(I); |
6216 | 18 | InsertedInsts.insert(NI); |
6217 | 18 | U->set(NI); |
6218 | 18 | Changed = true; |
6219 | 18 | } |
6220 | 2.42k | |
6221 | 2.42k | // Remove instructions that are dead after sinking. |
6222 | 2.42k | for (auto *I : MaybeDead) |
6223 | 18 | if (!I->hasNUsesOrMore(1)) |
6224 | 12 | I->eraseFromParent(); |
6225 | 2.42k | |
6226 | 2.42k | return Changed; |
6227 | 2.42k | } |
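 | | // A target-dependent sketch (shouldSinkOperands picks the actual cases):
 | | // if extends can be folded into a widening multiply, then for
 | | //   bb1:
 | | //     %e = sext <4 x i16> %a to <4 x i32>
 | | //   bb2:
 | | //     %m = mul <4 x i32> %e, %f
 | | // the sext is duplicated into bb2 right before the mul, so isel can match
 | | // the extend and the multiply as a single instruction.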
6228 | | |
6229 | 39.3k | bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { |
6230 | 39.3k | if (!TLI || !DL)
6231 | 4 | return false; |
6232 | 39.3k | |
6233 | 39.3k | Value *Cond = SI->getCondition(); |
6234 | 39.3k | Type *OldType = Cond->getType(); |
6235 | 39.3k | LLVMContext &Context = Cond->getContext(); |
6236 | 39.3k | MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType)); |
6237 | 39.3k | unsigned RegWidth = RegType.getSizeInBits(); |
6238 | 39.3k | |
6239 | 39.3k | if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth()) |
6240 | 30.5k | return false; |
6241 | 8.78k | |
6242 | 8.78k | // If the register width is greater than the type width, expand the condition |
6243 | 8.78k | // of the switch instruction and each case constant to the width of the |
6244 | 8.78k | // register. By widening the type of the switch condition, subsequent |
6245 | 8.78k | // comparisons (for case comparisons) will not need to be extended to the |
6246 | 8.78k | // preferred register width, so we will potentially eliminate N-1 extends, |
6247 | 8.78k | // where N is the number of cases in the switch. |
6248 | 8.78k | auto *NewType = Type::getIntNTy(Context, RegWidth); |
6249 | 8.78k | |
6250 | 8.78k | // Zero-extend the switch condition and case constants unless the switch |
6251 | 8.78k | // condition is a function argument that is already being sign-extended. |
6252 | 8.78k | // In that case, we can avoid an unnecessary mask/extension by sign-extending |
6253 | 8.78k | // everything instead. |
6254 | 8.78k | Instruction::CastOps ExtType = Instruction::ZExt; |
6255 | 8.78k | if (auto *Arg = dyn_cast<Argument>(Cond)) |
6256 | 39 | if (Arg->hasSExtAttr()) |
6257 | 16 | ExtType = Instruction::SExt; |
6258 | 8.78k | |
6259 | 8.78k | auto *ExtInst = CastInst::Create(ExtType, Cond, NewType); |
6260 | 8.78k | ExtInst->insertBefore(SI); |
6261 | 8.78k | ExtInst->setDebugLoc(SI->getDebugLoc()); |
6262 | 8.78k | SI->setCondition(ExtInst); |
6263 | 25.2k | for (auto Case : SI->cases()) { |
6264 | 25.2k | APInt NarrowConst = Case.getCaseValue()->getValue(); |
6265 | 25.2k | APInt WideConst = (ExtType == Instruction::ZExt) ? |
6266 | 25.1k | NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
6267 | 25.2k | Case.setValue(ConstantInt::get(Context, WideConst)); |
6268 | 25.2k | } |
6269 | 8.78k | |
6270 | 8.78k | return true; |
6271 | 8.78k | } |
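 | | // Illustrative IR (assuming 32-bit registers): an i8 switch such as
 | | //   switch i8 %c, label %def [ i8 1, label %a
 | | //                              i8 2, label %b ]
 | | // is rewritten to
 | | //   %c.ext = zext i8 %c to i32
 | | //   switch i32 %c.ext, label %def [ i32 1, label %a
 | | //                                   i32 2, label %b ]
 | | // so each case comparison happens at register width with one extension.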
6272 | | |
6273 | | |
6274 | | namespace { |
6275 | | |
6276 | | /// Helper class to promote a scalar operation to a vector one. |
6277 | | /// This class is used to move an extractelement transition downward.
6278 | | /// E.g., |
6279 | | /// a = vector_op <2 x i32> |
6280 | | /// b = extractelement <2 x i32> a, i32 0 |
6281 | | /// c = scalar_op b |
6282 | | /// store c |
6283 | | /// |
6284 | | /// => |
6285 | | /// a = vector_op <2 x i32> |
6286 | | /// c = vector_op a (equivalent to scalar_op on the related lane) |
6287 | | /// * d = extractelement <2 x i32> c, i32 0 |
6288 | | /// * store d |
6289 | | /// Assuming both extractelement and store can be combined, we get rid of the
6290 | | /// transition. |
6291 | | class VectorPromoteHelper { |
6292 | | /// DataLayout associated with the current module. |
6293 | | const DataLayout &DL; |
6294 | | |
6295 | | /// Used to perform some checks on the legality of vector operations. |
6296 | | const TargetLowering &TLI; |
6297 | | |
6298 | | /// Used to estimate the cost of the promoted chain.
6299 | | const TargetTransformInfo &TTI; |
6300 | | |
6301 | | /// The transition being moved downwards. |
6302 | | Instruction *Transition; |
6303 | | |
6304 | | /// The sequence of instructions to be promoted. |
6305 | | SmallVector<Instruction *, 4> InstsToBePromoted; |
6306 | | |
6307 | | /// Cost of combining a store and an extract. |
6308 | | unsigned StoreExtractCombineCost; |
6309 | | |
6310 | | /// Instruction that will be combined with the transition. |
6311 | | Instruction *CombineInst = nullptr; |
6312 | | |
6313 | | /// The instruction that represents the current end of the transition. |
6314 | | /// Since we are faking the promotion until we reach the end of the chain |
6315 | | /// of computation, we need a way to get the current end of the transition. |
6316 | 117 | Instruction *getEndOfTransition() const { |
6317 | 117 | if (InstsToBePromoted.empty()) |
6318 | 81 | return Transition; |
6319 | 36 | return InstsToBePromoted.back(); |
6320 | 36 | } |
6321 | | |
6322 | | /// Return the index of the original value in the transition. |
6323 | | /// E.g., for "extractelement <2 x i32> c, i32 1" the original value, |
6324 | | /// c, is at index 0. |
6325 | 194 | unsigned getTransitionOriginalValueIdx() const { |
6326 | 194 | assert(isa<ExtractElementInst>(Transition) && |
6327 | 194 | "Other kind of transitions are not supported yet"); |
6328 | 194 | return 0; |
6329 | 194 | } |
6330 | | |
6331 | | /// Return the index of the index in the transition. |
6332 | | /// E.g., for "extractelement <2 x i32> c, i32 0" the index |
6333 | | /// is at index 1. |
6334 | 32 | unsigned getTransitionIdx() const { |
6335 | 32 | assert(isa<ExtractElementInst>(Transition) && |
6336 | 32 | "Other kind of transitions are not supported yet"); |
6337 | 32 | return 1; |
6338 | 32 | } |
6339 | | |
6340 | | /// Get the type of the transition. |
6341 | | /// This is the type of the original value. |
6342 | | /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the |
6343 | | /// transition is <2 x i32>. |
6344 | 112 | Type *getTransitionType() const { |
6345 | 112 | return Transition->getOperand(getTransitionOriginalValueIdx())->getType(); |
6346 | 112 | } |
6347 | | |
6348 | | /// Promote \p ToBePromoted by moving \p Def downward through it.
6349 | | /// I.e., we have the following sequence: |
6350 | | /// Def = Transition <ty1> a to <ty2> |
6351 | | /// b = ToBePromoted <ty2> Def, ... |
6352 | | /// => |
6353 | | /// b = ToBePromoted <ty1> a, ... |
6354 | | /// Def = Transition <ty1> ToBePromoted to <ty2> |
6355 | | void promoteImpl(Instruction *ToBePromoted); |
6356 | | |
6357 | | /// Check whether or not it is profitable to promote all the |
6358 | | /// instructions enqueued to be promoted. |
6359 | 6 | bool isProfitableToPromote() { |
6360 | 6 | Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx()); |
6361 | 6 | unsigned Index = isa<ConstantInt>(ValIdx) |
6362 | 6 | ? cast<ConstantInt>(ValIdx)->getZExtValue()
6363 | 6 | : -1; |
6364 | 6 | Type *PromotedType = getTransitionType(); |
6365 | 6 | |
6366 | 6 | StoreInst *ST = cast<StoreInst>(CombineInst); |
6367 | 6 | unsigned AS = ST->getPointerAddressSpace(); |
6368 | 6 | unsigned Align = ST->getAlignment(); |
6369 | 6 | // Check if this store is supported. |
6370 | 6 | if (!TLI.allowsMisalignedMemoryAccesses( |
6371 | 6 | TLI.getValueType(DL, ST->getValueOperand()->getType()), AS, |
6372 | 6 | Align)) { |
6373 | 0 | // If this is not supported, there is no way we can combine |
6374 | 0 | // the extract with the store. |
6375 | 0 | return false; |
6376 | 0 | } |
6377 | 6 | |
6378 | 6 | // The scalar chain of computation has to pay for the transition |
6379 | 6 | // scalar to vector. |
6380 | 6 | // The vector chain has to account for the combining cost. |
6381 | 6 | uint64_t ScalarCost = |
6382 | 6 | TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index); |
6383 | 6 | uint64_t VectorCost = StoreExtractCombineCost; |
6384 | 18 | for (const auto &Inst : InstsToBePromoted) { |
6385 | 18 | // Compute the cost. |
6386 | 18 | // By construction, all instructions being promoted are arithmetic ones. |
6387 | 18 | // Moreover, one argument is a constant that can be viewed as a splat |
6388 | 18 | // constant. |
6389 | 18 | Value *Arg0 = Inst->getOperand(0); |
6390 | 18 | bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) || |
6391 | 18 | isa<ConstantFP>(Arg0); |
6392 | 18 | TargetTransformInfo::OperandValueKind Arg0OVK = |
6393 | 18 | IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
6394 | 18 | : TargetTransformInfo::OK_AnyValue; |
6395 | 18 | TargetTransformInfo::OperandValueKind Arg1OVK = |
6396 | 18 | !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue |
6397 | 18 | : TargetTransformInfo::OK_AnyValue;
6398 | 18 | ScalarCost += TTI.getArithmeticInstrCost( |
6399 | 18 | Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK); |
6400 | 18 | VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType, |
6401 | 18 | Arg0OVK, Arg1OVK); |
6402 | 18 | } |
6403 | 6 | LLVM_DEBUG( |
6404 | 6 | dbgs() << "Estimated cost of computation to be promoted:\nScalar: " |
6405 | 6 | << ScalarCost << "\nVector: " << VectorCost << '\n'); |
6406 | 6 | return ScalarCost > VectorCost; |
6407 | 6 | } |
6408 | | |
6409 | | /// Generate a constant vector with \p Val with the same |
6410 | | /// number of elements as the transition. |
6411 | | /// \p UseSplat defines whether or not \p Val should be replicated |
6412 | | /// across the whole vector. |
6413 | | /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>, |
6414 | | /// otherwise we generate a vector with as many undef as possible: |
6415 | | /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only |
6416 | | /// used at the index of the extract. |
6417 | 38 | Value *getConstantVector(Constant *Val, bool UseSplat) const { |
6418 | 38 | unsigned ExtractIdx = std::numeric_limits<unsigned>::max(); |
6419 | 38 | if (!UseSplat) { |
6420 | 32 | // If we cannot determine where the constant must be, we have to |
6421 | 32 | // use a splat constant. |
6422 | 32 | Value *ValExtractIdx = Transition->getOperand(getTransitionIdx()); |
6423 | 32 | if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx)) |
6424 | 31 | ExtractIdx = CstVal->getSExtValue(); |
6425 | 1 | else |
6426 | 1 | UseSplat = true; |
6427 | 32 | } |
6428 | 38 | |
6429 | 38 | unsigned End = getTransitionType()->getVectorNumElements(); |
6430 | 38 | if (UseSplat) |
6431 | 7 | return ConstantVector::getSplat(End, Val); |
6432 | 31 | |
6433 | 31 | SmallVector<Constant *, 4> ConstVec; |
6434 | 31 | UndefValue *UndefVal = UndefValue::get(Val->getType()); |
6435 | 105 | for (unsigned Idx = 0; Idx != End; ++Idx74 ) { |
6436 | 74 | if (Idx == ExtractIdx) |
6437 | 31 | ConstVec.push_back(Val); |
6438 | 43 | else |
6439 | 43 | ConstVec.push_back(UndefVal); |
6440 | 74 | } |
6441 | 31 | return ConstantVector::get(ConstVec); |
6442 | 31 | } |
6443 | | |
6444 | | /// Check if promoting to a vector type an operand at \p OperandIdx |
6445 | | /// in \p Use can trigger undefined behavior. |
6446 | | static bool canCauseUndefinedBehavior(const Instruction *Use, |
6447 | 96 | unsigned OperandIdx) { |
6448 | 96 | // It is not safe to introduce undef when the operand is on
6449 | 96 | // the right hand side of a division-like instruction. |
6450 | 96 | if (OperandIdx != 1) |
6451 | 52 | return false; |
6452 | 44 | switch (Use->getOpcode()) { |
6453 | 44 | default: |
6454 | 30 | return false; |
6455 | 44 | case Instruction::SDiv: |
6456 | 10 | case Instruction::UDiv: |
6457 | 10 | case Instruction::SRem: |
6458 | 10 | case Instruction::URem: |
6459 | 10 | return true; |
6460 | 10 | case Instruction::FDiv: |
6461 | 4 | case Instruction::FRem: |
6462 | 4 | return !Use->hasNoNaNs(); |
6463 | 0 | } |
6464 | 0 | llvm_unreachable(nullptr); |
6465 | 0 | } |
6466 | | |
6467 | | public: |
6468 | | VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI, |
6469 | | const TargetTransformInfo &TTI, Instruction *Transition, |
6470 | | unsigned CombineCost) |
6471 | | : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition), |
6472 | 218 | StoreExtractCombineCost(CombineCost) { |
6473 | 218 | assert(Transition && "Do not know how to promote null"); |
6474 | 218 | } |
6475 | | |
6476 | | /// Check if we can promote \p ToBePromoted to \p Type. |
6477 | 95 | bool canPromote(const Instruction *ToBePromoted) const { |
6478 | 95 | // We could support CastInst too. |
6479 | 95 | return isa<BinaryOperator>(ToBePromoted); |
6480 | 95 | } |
6481 | | |
6482 | | /// Check if it is profitable to promote \p ToBePromoted |
6483 | | /// by moving downward the transition through. |
6484 | 59 | bool shouldPromote(const Instruction *ToBePromoted) const { |
6485 | 59 | // Promote only if all the operands can be statically expanded. |
6486 | 59 | // Indeed, we do not want to introduce any new kind of transitions. |
6487 | 117 | for (const Use &U : ToBePromoted->operands()) { |
6488 | 117 | const Value *Val = U.get(); |
6489 | 117 | if (Val == getEndOfTransition()) { |
6490 | 58 | // If the use is a division and the transition is on the rhs, |
6491 | 58 | // we cannot promote the operation, otherwise we may create a |
6492 | 58 | // division by zero. |
6493 | 58 | if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo())) |
6494 | 6 | return false; |
6495 | 52 | continue; |
6496 | 52 | } |
6497 | 59 | if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
6498 | 59 | !isa<ConstantFP>(Val))
6499 | 2 | return false; |
6500 | 59 | } |
6501 | 59 | // Check that the resulting operation is legal. |
6502 | 59 | int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode()); |
6503 | 51 | if (!ISDOpcode) |
6504 | 0 | return false; |
6505 | 51 | return StressStoreExtract || |
6506 | 51 | TLI.isOperationLegalOrCustom( |
6507 | 30 | ISDOpcode, TLI.getValueType(DL, getTransitionType(), true)); |
6508 | 51 | } |
6509 | | |
6510 | | /// Check whether or not \p Use can be combined |
6511 | | /// with the transition. |
6512 | | /// I.e., is it possible to do Use(Transition) => AnotherUse? |
6513 | 168 | bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); } |
6514 | | |
6515 | | /// Record \p ToBePromoted as part of the chain to be promoted. |
6516 | 41 | void enqueueForPromotion(Instruction *ToBePromoted) { |
6517 | 41 | InstsToBePromoted.push_back(ToBePromoted); |
6518 | 41 | } |
6519 | | |
6520 | | /// Set the instruction that will be combined with the transition. |
6521 | 73 | void recordCombineInstruction(Instruction *ToBeCombined) { |
6522 | 73 | assert(canCombine(ToBeCombined) && "Unsupported instruction to combine"); |
6523 | 73 | CombineInst = ToBeCombined; |
6524 | 73 | } |
6525 | | |
6526 | | /// Promote all the instructions enqueued for promotion if it is
6527 | | /// profitable.
6528 | | /// \return True if the promotion happened, false otherwise. |
6529 | 73 | bool promote() { |
6530 | 73 | // Check if there is something to promote. |
6531 | 73 | // Right now, if we do not have anything to combine with, |
6532 | 73 | // we assume the promotion is not profitable. |
6533 | 73 | if (InstsToBePromoted.empty() || !CombineInst)
6534 | 53 | return false; |
6535 | 20 | |
6536 | 20 | // Check cost. |
6537 | 20 | if (!StressStoreExtract && !isProfitableToPromote())
6538 | 0 | return false; |
6539 | 20 | |
6540 | 20 | // Promote. |
6541 | 20 | for (auto &ToBePromoted : InstsToBePromoted) |
6542 | 38 | promoteImpl(ToBePromoted); |
6543 | 20 | InstsToBePromoted.clear(); |
6544 | 20 | return true; |
6545 | 20 | } |
6546 | | }; |
6547 | | |
6548 | | } // end anonymous namespace |
6549 | | |
6550 | 38 | void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) { |
6551 | 38 | // At this point, we know that all the operands of ToBePromoted but Def |
6552 | 38 | // can be statically promoted. |
6553 | 38 | // For Def, we need to use its parameter in ToBePromoted: |
6554 | 38 | // b = ToBePromoted ty1 a |
6555 | 38 | // Def = Transition ty1 b to ty2 |
6556 | 38 | // Move the transition down. |
6557 | 38 | // 1. Replace all uses of the promoted operation by the transition. |
6558 | 38 | // = ... b => = ... Def. |
6559 | 38 | assert(ToBePromoted->getType() == Transition->getType() && |
6560 | 38 | "The type of the result of the transition does not match " |
6561 | 38 | "the final type"); |
6562 | 38 | ToBePromoted->replaceAllUsesWith(Transition); |
6563 | 38 | // 2. Update the type of the uses. |
6564 | 38 | // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def. |
6565 | 38 | Type *TransitionTy = getTransitionType(); |
6566 | 38 | ToBePromoted->mutateType(TransitionTy); |
6567 | 38 | // 3. Update all the operands of the promoted operation with promoted |
6568 | 38 | // operands. |
6569 | 38 | // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a. |
6570 | 76 | for (Use &U : ToBePromoted->operands()) { |
6571 | 76 | Value *Val = U.get(); |
6572 | 76 | Value *NewVal = nullptr; |
6573 | 76 | if (Val == Transition) |
6574 | 38 | NewVal = Transition->getOperand(getTransitionOriginalValueIdx()); |
6575 | 38 | else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) || |
6576 | 38 | isa<ConstantFP>(Val)) {
6577 | 38 | // Use a splat constant if it is not safe to use undef. |
6578 | 38 | NewVal = getConstantVector( |
6579 | 38 | cast<Constant>(Val), |
6580 | 38 | isa<UndefValue>(Val) || |
6581 | 38 | canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo())); |
6582 | 38 | } else |
6583 | 38 | llvm_unreachable("Did you modify shouldPromote and forget to update "
6584 | 76 | "this?"); |
6585 | 76 | ToBePromoted->setOperand(U.getOperandNo(), NewVal); |
6586 | 76 | } |
6587 | 38 | Transition->moveAfter(ToBePromoted); |
6588 | 38 | Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted); |
6589 | 38 | } |
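 | | // Sketch of a single promoteImpl step (hypothetical values):
 | | //   Def = extractelement <2 x i32> %a, i32 0
 | | //   %b = add i32 Def, 7
 | | // becomes
 | | //   %b = add <2 x i32> %a, <i32 7, i32 undef>
 | | //   Def = extractelement <2 x i32> %b, i32 0
 | | // where the splat-or-undef vector for the constant 7 comes from
 | | // getConstantVector (undef in the lanes that are never extracted).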
6590 | | |
6591 | | /// Some targets can do store(extractelement) with one instruction. |
6592 | | /// Try to push the extractelement towards the stores when the target |
6593 | | /// has this feature and this is profitable. |
6594 | 67.4k | bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) { |
6595 | 67.4k | unsigned CombineCost = std::numeric_limits<unsigned>::max(); |
6596 | 67.4k | if (DisableStoreExtract || !TLI || |
6597 | 67.4k | (!StressStoreExtract && |
6598 | 67.4k | !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(), |
6599 | 67.3k | Inst->getOperand(1), CombineCost))) |
6600 | 67.2k | return false; |
6601 | 218 | |
6602 | 218 | // At this point we know that Inst is a vector to scalar transition. |
6603 | 218 | // Try to move it down the def-use chain, until: |
6604 | 218 | // - We can combine the transition with its single use |
6605 | 218 | // => we got rid of the transition. |
6606 | 218 | // - We escape the current basic block |
6607 | 218 | // => we would need to check that we are moving it at a cheaper place and |
6608 | 218 | // we do not do that for now. |
6609 | 218 | BasicBlock *Parent = Inst->getParent(); |
6610 | 218 | LLVM_DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n'); |
6611 | 218 | VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost); |
6612 | 218 | // If the transition has more than one use, assume this is not going to be |
6613 | 218 | // beneficial. |
6614 | 259 | while (Inst->hasOneUse()) { |
6615 | 171 | Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin()); |
6616 | 171 | LLVM_DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n'); |
6617 | 171 | |
6618 | 171 | if (ToBePromoted->getParent() != Parent) { |
6619 | 3 | LLVM_DEBUG(dbgs() << "Instruction to promote is in a different block (" |
6620 | 3 | << ToBePromoted->getParent()->getName() |
6621 | 3 | << ") than the transition (" << Parent->getName() |
6622 | 3 | << ").\n"); |
6623 | 3 | return false; |
6624 | 3 | } |
6625 | 168 | |
6626 | 168 | if (VPH.canCombine(ToBePromoted)) { |
6627 | 73 | LLVM_DEBUG(dbgs() << "Assume " << *Inst << '\n' |
6628 | 73 | << "will be combined with: " << *ToBePromoted << '\n'); |
6629 | 73 | VPH.recordCombineInstruction(ToBePromoted); |
6630 | 73 | bool Changed = VPH.promote(); |
6631 | 73 | NumStoreExtractExposed += Changed; |
6632 | 73 | return Changed; |
6633 | 73 | } |
6634 | 95 | |
6635 | 95 | LLVM_DEBUG(dbgs() << "Try promoting.\n"); |
6636 | 95 | if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
6637 | 54 | return false; |
6638 | 41 | |
6639 | 41 | LLVM_DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n"); |
6640 | 41 | |
6641 | 41 | VPH.enqueueForPromotion(ToBePromoted); |
6642 | 41 | Inst = ToBePromoted; |
6643 | 41 | } |
6644 | 218 | return false;
6645 | 218 | } |
6646 | | |
6647 | | /// In the store instruction sequence below, the F and I values
6648 | | /// are bundled together as an i64 value before being stored into memory. |
6649 | | /// Sometimes it is more efficient to generate separate stores for F and I, |
6650 | | /// which can remove the bitwise instructions or sink them to colder places. |
6651 | | /// |
6652 | | /// (store (or (zext (bitcast F to i32) to i64), |
6653 | | /// (shl (zext I to i64), 32)), addr) --> |
6654 | | /// (store F, addr) and (store I, addr+4) |
6655 | | /// |
6656 | | /// Similarly, splitting other merged stores can also be beneficial, like:
6657 | | /// For pair of {i32, i32}, i64 store --> two i32 stores. |
6658 | | /// For pair of {i32, i16}, i64 store --> two i32 stores. |
6659 | | /// For pair of {i16, i16}, i32 store --> two i16 stores. |
6660 | | /// For pair of {i16, i8}, i32 store --> two i16 stores. |
6661 | | /// For pair of {i8, i8}, i16 store --> two i8 stores. |
6662 | | /// |
6663 | | /// We allow each target to determine specifically which kind of splitting is |
6664 | | /// supported. |
6665 | | /// |
6666 | | /// The store patterns are commonly seen from the simple code snippet below |
6667 | | /// if only std::make_pair(...) is SROA-transformed before being inlined into hoo.
6668 | | /// void goo(const std::pair<int, float> &); |
6669 | | /// hoo() { |
6670 | | /// ... |
6671 | | /// goo(std::make_pair(tmp, ftmp)); |
6672 | | /// ... |
6673 | | /// } |
6674 | | /// |
6675 | | /// Although we already have similar splitting in DAG Combine, we duplicate |
6676 | | /// it in CodeGenPrepare to catch the case in which pattern is across |
6677 | | /// multiple BBs. The logic in DAG Combine is kept to catch cases generated
6678 | | /// during code expansion. |
6679 | | static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL, |
6680 | 4.05M | const TargetLowering &TLI) { |
6681 | 4.05M | // Handle simple but common cases only. |
6682 | 4.05M | Type *StoreType = SI.getValueOperand()->getType(); |
6683 | 4.05M | if (!DL.typeSizeEqualsStoreSize(StoreType) || |
6684 | 4.05M | DL.getTypeSizeInBits(StoreType) == 0)
6685 | 27.2k | return false; |
6686 | 4.02M | |
6687 | 4.02M | unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2; |
6688 | 4.02M | Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize); |
6689 | 4.02M | if (!DL.typeSizeEqualsStoreSize(SplitStoreType)) |
6690 | 360k | return false; |
6691 | 3.66M | |
6692 | 3.66M | // Don't split the store if it is volatile. |
6693 | 3.66M | if (SI.isVolatile()) |
6694 | 22.1k | return false; |
6695 | 3.64M | |
6696 | 3.64M | // Match the following patterns: |
6697 | 3.64M | // (store (or (zext LValue to i64), |
6698 | 3.64M | // (shl (zext HValue to i64), 32)), HalfValBitSize) |
6699 | 3.64M | // or |
6700 | 3.64M | // (store (or (shl (zext HValue to i64), 32)), HalfValBitSize) |
6701 | 3.64M | // (zext LValue to i64), |
6702 | 3.64M | // Expect both operands of OR and the first operand of SHL have only |
6703 | 3.64M | // one use. |
6704 | 3.64M | Value *LValue, *HValue; |
6705 | 3.64M | if (!match(SI.getValueOperand(), |
6706 | 3.64M | m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))), |
6707 | 3.64M | m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))), |
6708 | 3.64M | m_SpecificInt(HalfValBitSize)))))) |
6709 | 3.64M | return false; |
6710 | 802 | |
6711 | 802 | // Check that LValue and HValue are integers no wider than HalfValBitSize.
6712 | 802 | if (!LValue->getType()->isIntegerTy() || |
6713 | 802 | DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize || |
6714 | 802 | !HValue->getType()->isIntegerTy() || |
6715 | 802 | DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize) |
6716 | 0 | return false; |
6717 | 802 | |
6718 | 802 | // If LValue/HValue is a bitcast instruction, use the EVT before bitcast |
6719 | 802 | // as the input of target query. |
6720 | 802 | auto *LBC = dyn_cast<BitCastInst>(LValue); |
6721 | 802 | auto *HBC = dyn_cast<BitCastInst>(HValue); |
6722 | 802 | EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
6723 | 802 | : EVT::getEVT(LValue->getType());
6724 | 802 | EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
6725 | 802 | : EVT::getEVT(HValue->getType());
6726 | 802 | if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
6727 | 787 | return false; |
6728 | 15 | |
6729 | 15 | // Start to split store. |
6730 | 15 | IRBuilder<> Builder(SI.getContext()); |
6731 | 15 | Builder.SetInsertPoint(&SI); |
6732 | 15 | |
6733 | 15 | // If LValue/HValue is a bitcast in another BB, create a new one in current |
6734 | 15 | // BB so it may be merged with the split stores by the DAG combiner.
6735 | 15 | if (LBC && LBC->getParent() != SI.getParent())
6736 | 0 | LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType()); |
6737 | 15 | if (HBC && HBC->getParent() != SI.getParent())
6738 | 3 | HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType()); |
6739 | 15 | |
6740 | 15 | bool IsLE = SI.getModule()->getDataLayout().isLittleEndian(); |
6741 | 30 | auto CreateSplitStore = [&](Value *V, bool Upper) { |
6742 | 30 | V = Builder.CreateZExtOrBitCast(V, SplitStoreType); |
6743 | 30 | Value *Addr = Builder.CreateBitCast( |
6744 | 30 | SI.getOperand(1), |
6745 | 30 | SplitStoreType->getPointerTo(SI.getPointerAddressSpace())); |
6746 | 30 | if ((IsLE && Upper) || (!IsLE && !Upper))
6747 | 15 | Addr = Builder.CreateGEP( |
6748 | 15 | SplitStoreType, Addr, |
6749 | 15 | ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1)); |
6750 | 30 | Builder.CreateAlignedStore( |
6751 | 30 | V, Addr, Upper ? SI.getAlignment() / 2 : SI.getAlignment());
6752 | 30 | }; |
6753 | 15 | |
6754 | 15 | CreateSplitStore(LValue, false); |
6755 | 15 | CreateSplitStore(HValue, true); |
6756 | 15 | |
6757 | 15 | // Delete the old store. |
6758 | 15 | SI.eraseFromParent(); |
6759 | 15 | return true; |
6760 | 15 | } |
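 | | // Illustrative IR on a little-endian target (hypothetical names):
 | | //   %zl = zext i32 %lo to i64
 | | //   %zh = zext i32 %hi to i64
 | | //   %sh = shl i64 %zh, 32
 | | //   %or = or i64 %zl, %sh
 | | //   store i64 %or, i64* %p
 | | // is split into a store of %lo at offset 0 and a store of %hi at offset 4,
 | | // provided the target reports that two narrow stores beat the bit-merging.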
6761 | | |
6762 | | // Return true if the GEP has two operands, the first operand is of a sequential |
6763 | | // type, and the second operand is a constant. |
6764 | 91 | static bool GEPSequentialConstIndexed(GetElementPtrInst *GEP) { |
6765 | 91 | gep_type_iterator I = gep_type_begin(*GEP); |
6766 | 91 | return GEP->getNumOperands() == 2 && |
6767 | 91 | I.isSequential()24 && |
6768 | 91 | isa<ConstantInt>(GEP->getOperand(1))24 ; |
6769 | 91 | } |
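 | | // For example (hypothetical IR), "getelementptr i32, i32* %p, i64 4"
 | | // qualifies (one index over a sequential type, constant index), while
 | | // "getelementptr %struct.S, %struct.S* %p, i64 0, i32 1" does not because
 | | // it has more than two operands and indexes into a struct.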
6770 | | |
6771 | | // Try unmerging GEPs to reduce liveness interference (register pressure) across |
6772 | | // IndirectBr edges. Since IndirectBr edges tend to touch on many blocks, |
6773 | | // reducing liveness interference across those edges benefits global register |
6774 | | // allocation. Currently handles only certain cases. |
6775 | | // |
6776 | | // For example, unmerge %GEPI and %UGEPI as below. |
6777 | | // |
6778 | | // ---------- BEFORE ---------- |
6779 | | // SrcBlock: |
6780 | | // ... |
6781 | | // %GEPIOp = ... |
6782 | | // ... |
6783 | | // %GEPI = gep %GEPIOp, Idx |
6784 | | // ... |
6785 | | // indirectbr ... [ label %DstB0, label %DstB1, ... label %DstBi ... ] |
6786 | | // (* %GEPI is alive on the indirectbr edges due to other uses ahead) |
6787 | | // (* %GEPIOp is alive on the indirectbr edges only because it's used by
6788 | | // %UGEPI) |
6789 | | // |
6790 | | // DstB0: ... (there may be a gep similar to %UGEPI to be unmerged) |
6791 | | // DstB1: ... (there may be a gep similar to %UGEPI to be unmerged) |
6792 | | // ... |
6793 | | // |
6794 | | // DstBi: |
6795 | | // ... |
6796 | | // %UGEPI = gep %GEPIOp, UIdx |
6797 | | // ... |
6798 | | // --------------------------- |
6799 | | // |
6800 | | // ---------- AFTER ---------- |
6801 | | // SrcBlock: |
6802 | | // ... (same as above) |
6803 | | // (* %GEPI is still alive on the indirectbr edges) |
6804 | | // (* %GEPIOp is no longer alive on the indirectbr edges as a result of the |
6805 | | // unmerging) |
6806 | | // ... |
6807 | | // |
6808 | | // DstBi: |
6809 | | // ... |
6810 | | // %UGEPI = gep %GEPI, (UIdx-Idx) |
6811 | | // ... |
6812 | | // --------------------------- |
6813 | | // |
6814 | | // The register pressure on the IndirectBr edges is reduced because %GEPIOp is |
6815 | | // no longer alive on them. |
6816 | | // |
6817 | | // We try to unmerge GEPs here in CodeGenPrepare, as opposed to limiting merging
6818 | | // of GEPs in the first place in InstCombiner::visitGetElementPtrInst() so as |
6819 | | // not to disable further simplifications and optimizations as a result of GEP
6820 | | // merging. |
6821 | | // |
6822 | | // Note this unmerging may increase the length of the data flow critical path |
6823 | | // (the path from %GEPIOp to %UGEPI would go through %GEPI), which is a tradeoff |
6824 | | // between the register pressure and the length of data-flow critical |
6825 | | // path. Restricting this to the uncommon IndirectBr case would minimize the |
6826 | | // impact of potentially longer critical path, if any, and the impact on compile |
6827 | | // time. |
6828 | | static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI, |
6829 | 6.33M | const TargetTransformInfo *TTI) { |
6830 | 6.33M | BasicBlock *SrcBlock = GEPI->getParent(); |
6831 | 6.33M | // Check that SrcBlock ends with an IndirectBr. If not, give up. The common |
6832 | 6.33M | // (non-IndirectBr) cases exit early here. |
6833 | 6.33M | if (!isa<IndirectBrInst>(SrcBlock->getTerminator())) |
6834 | 6.33M | return false; |
6835 | 88 | // Check that GEPI is a simple gep with a single constant index. |
6836 | 88 | if (!GEPSequentialConstIndexed(GEPI)) |
6837 | 77 | return false; |
6838 | 11 | ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1)); |
6839 | 11 | // Check that GEPI is a cheap one. |
6840 | 11 | if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType()) |
6841 | 11 | > TargetTransformInfo::TCC_Basic) |
6842 | 0 | return false; |
6843 | 11 | Value *GEPIOp = GEPI->getOperand(0); |
6844 | 11 | // Check that GEPIOp is an instruction that's also defined in SrcBlock. |
6845 | 11 | if (!isa<Instruction>(GEPIOp)) |
6846 | 1 | return false; |
6847 | 10 | auto *GEPIOpI = cast<Instruction>(GEPIOp); |
6848 | 10 | if (GEPIOpI->getParent() != SrcBlock) |
6849 | 0 | return false; |
6850 | 10 | // Check that GEPI is used outside the block, meaning it's alive on the
6851 | 10 | // IndirectBr edge(s). |
6852 | 12 | if (find_if(GEPI->users(), [&](User *Usr) {
6853 | 12 | if (auto *I = dyn_cast<Instruction>(Usr)) { |
6854 | 12 | if (I->getParent() != SrcBlock) { |
6855 | 8 | return true; |
6856 | 8 | } |
6857 | 4 | } |
6858 | 4 | return false; |
6859 | 4 | }) == GEPI->users().end()) |
6860 | 2 | return false; |
6861 | 8 | // The second elements of the GEP chains to be unmerged. |
6862 | 8 | std::vector<GetElementPtrInst *> UGEPIs; |
6863 | 8 | // Check each user of GEPIOp to see if unmerging would make GEPIOp not alive
6864 | 8 | // on IndirectBr edges. |
6865 | 19 | for (User *Usr : GEPIOp->users()) { |
6866 | 19 | if (Usr == GEPI) continue;
6867 | 11 | // Check if Usr is an Instruction. If not, give up. |
6868 | 11 | if (!isa<Instruction>(Usr)) |
6869 | 0 | return false; |
6870 | 11 | auto *UI = cast<Instruction>(Usr); |
6871 | 11 | // If Usr is in the same block as GEPIOp, that's fine; skip it.
6872 | 11 | if (UI->getParent() == SrcBlock) |
6873 | 8 | continue; |
6874 | 3 | // Check if Usr is a GEP. If not, give up. |
6875 | 3 | if (!isa<GetElementPtrInst>(Usr)) |
6876 | 0 | return false; |
6877 | 3 | auto *UGEPI = cast<GetElementPtrInst>(Usr); |
6878 | 3 | // Check if UGEPI is a simple gep with a single constant index and GEPIOp is |
6879 | 3 | // the pointer operand to it. If so, record it in the vector. If not, give |
6880 | 3 | // up. |
6881 | 3 | if (!GEPSequentialConstIndexed(UGEPI)) |
6882 | 0 | return false; |
6883 | 3 | if (UGEPI->getOperand(0) != GEPIOp) |
6884 | 0 | return false; |
6885 | 3 | if (GEPIIdx->getType() != |
6886 | 3 | cast<ConstantInt>(UGEPI->getOperand(1))->getType()) |
6887 | 0 | return false; |
6888 | 3 | ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1)); |
6889 | 3 | if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType()) |
6890 | 3 | > TargetTransformInfo::TCC_Basic) |
6891 | 0 | return false; |
6892 | 3 | UGEPIs.push_back(UGEPI); |
6893 | 3 | } |
6894 | 8 | if (UGEPIs.size() == 0) |
6895 | 7 | return false; |
6896 | 1 | // Check the materialization cost of (UIdx-Idx).
6897 | 3 | for (GetElementPtrInst *UGEPI : UGEPIs) {
6898 | 3 | ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1)); |
6899 | 3 | APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue(); |
6900 | 3 | unsigned ImmCost = TTI->getIntImmCost(NewIdx, GEPIIdx->getType()); |
6901 | 3 | if (ImmCost > TargetTransformInfo::TCC_Basic) |
6902 | 0 | return false; |
6903 | 3 | } |
6904 | 1 | // Now unmerge between GEPI and UGEPIs. |
6905 | 3 | for (GetElementPtrInst *UGEPI : UGEPIs) {
6906 | 3 | UGEPI->setOperand(0, GEPI); |
6907 | 3 | ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1)); |
6908 | 3 | Constant *NewUGEPIIdx = |
6909 | 3 | ConstantInt::get(GEPIIdx->getType(), |
6910 | 3 | UGEPIIdx->getValue() - GEPIIdx->getValue()); |
6911 | 3 | UGEPI->setOperand(1, NewUGEPIIdx); |
6912 | 3 | // If GEPI is not inbounds but UGEPI is inbounds, change UGEPI to not |
6913 | 3 | // inbounds to avoid UB. |
6914 | 3 | if (!GEPI->isInBounds()) { |
6915 | 3 | UGEPI->setIsInBounds(false); |
6916 | 3 | } |
6917 | 3 | } |
6918 | 1 | // After unmerging, verify that GEPIOp is actually only used in SrcBlock (not |
6919 | 1 | // alive on IndirectBr edges). |
6920 | 1 | assert(find_if(GEPIOp->users(), [&](User *Usr) { |
6921 | 1 | return cast<Instruction>(Usr)->getParent() != SrcBlock; |
6922 | 1 | }) == GEPIOp->users().end() && "GEPIOp is used outside SrcBlock"); |
6923 | 1 | return true; |
6924 | 1 | } |
6925 | | |
6926 | 40.9M | bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { |
6927 | 40.9M | // Bail out if we inserted the instruction to prevent optimizations from |
6928 | 40.9M | // stepping on each other's toes. |
6929 | 40.9M | if (InsertedInsts.count(I)) |
6930 | 278k | return false; |
6931 | 40.7M | |
6932 | 40.7M | // TODO: Move into the switch on opcode below here. |
6933 | 40.7M | if (PHINode *P = dyn_cast<PHINode>(I)) { |
6934 | 1.76M | // It is possible for very late stage optimizations (such as SimplifyCFG) |
6935 | 1.76M | // to introduce PHI nodes too late to be cleaned up. If we detect such a |
6936 | 1.76M | // trivial PHI, go ahead and zap it here. |
6937 | 1.76M | if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) { |
6938 | 625 | LargeOffsetGEPMap.erase(P); |
6939 | 625 | P->replaceAllUsesWith(V); |
6940 | 625 | P->eraseFromParent(); |
6941 | 625 | ++NumPHIsElim; |
6942 | 625 | return true; |
6943 | 625 | } |
6944 | 1.75M | return false; |
6945 | 1.75M | } |
6946 | 38.9M | |
6947 | 38.9M | if (CastInst *CI = dyn_cast<CastInst>(I)) { |
6948 | 5.70M | // If the source of the cast is a constant, then this should have |
6949 | 5.70M | // already been constant folded. The only reason NOT to constant fold |
6950 | 5.70M | // it is if something (e.g. LSR) was careful to place the constant |
6951 | 5.70M | // evaluation in a block other than then one that uses it (e.g. to hoist |
6952 | 5.70M | // the address of globals out of a loop). If this is the case, we don't |
6953 | 5.70M | // want to forward-subst the cast. |
6954 | 5.70M | if (isa<Constant>(CI->getOperand(0))) |
6955 | 60.8k | return false; |
6956 | 5.64M | |
6957 | 5.64M | if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL)5.64M ) |
6958 | 172k | return true; |
6959 | 5.47M | |
6960 | 5.47M | if (isa<ZExtInst>(I) || isa<SExtInst>(I)5.13M ) { |
6961 | 826k | // Sink a zext or sext into its user blocks if the target type doesn't
6962 | 826k | // fit in one register.
6963 | 826k | if (TLI && |
6964 | 826k | TLI->getTypeAction(CI->getContext(), |
6965 | 826k | TLI->getValueType(*DL, CI->getType())) == |
6966 | 826k | TargetLowering::TypeExpandInteger) { |
6967 | 18.6k | return SinkCast(CI); |
6968 | 808k | } else { |
6969 | 808k | bool MadeChange = optimizeExt(I); |
6970 | 808k | return MadeChange | optimizeExtUses(I); |
6971 | 808k | } |
6972 | 4.64M | } |
6973 | 4.64M | return false; |
6974 | 4.64M | } |
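 | | // Illustrative example (not part of the original source): on a typical
 | | // 64-bit target an i128 value is TypeExpandInteger (it does not fit in one
 | | // register), so a cast such as
 | | //
 | | //   %w = zext i64 %v to i128
 | | //
 | | // defined in one block and used in others is duplicated into each user
 | | // block by SinkCast instead of staying live across blocks as a
 | | // multi-register value.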
6975 | 33.2M | |
6976 | 33.2M | if (auto *Cmp = dyn_cast<CmpInst>(I)) |
6977 | 3.64M | if (TLI && optimizeCmp(Cmp, ModifiedDT)3.64M ) |
6978 | 36.9k | return true; |
6979 | 33.2M | |
6980 | 33.2M | if (LoadInst *LI = dyn_cast<LoadInst>(I)) { |
6981 | 3.66M | LI->setMetadata(LLVMContext::MD_invariant_group, nullptr); |
6982 | 3.66M | if (TLI) { |
6983 | 3.66M | bool Modified = optimizeLoadExt(LI); |
6984 | 3.66M | unsigned AS = LI->getPointerAddressSpace(); |
6985 | 3.66M | Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS); |
6986 | 3.66M | return Modified; |
6987 | 3.66M | } |
6988 | 28 | return false; |
6989 | 28 | } |
6990 | 29.5M | |
6991 | 29.5M | if (StoreInst *SI = dyn_cast<StoreInst>(I)) { |
6992 | 4.05M | if (TLI && splitMergedValStore(*SI, *DL, *TLI)4.05M ) |
6993 | 15 | return true; |
6994 | 4.05M | SI->setMetadata(LLVMContext::MD_invariant_group, nullptr); |
6995 | 4.05M | if (TLI) { |
6996 | 4.05M | unsigned AS = SI->getPointerAddressSpace(); |
6997 | 4.05M | return optimizeMemoryInst(I, SI->getOperand(1), |
6998 | 4.05M | SI->getOperand(0)->getType(), AS); |
6999 | 4.05M | } |
7000 | 20 | return false; |
7001 | 20 | } |
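 | | // Illustrative sketch (not part of the original source): for both loads
 | | // and stores, optimizeMemoryInst tries to sink the address computation
 | | // next to the memory access so ISel can fold it into an addressing mode:
 | | //
 | | //   bb1:
 | | //     %a = getelementptr i8, i8* %p, i64 8
 | | //     br label %bb2
 | | //   bb2:
 | | //     %v = load i8, i8* %a
 | | //
 | | // The GEP is re-created in bb2 immediately before the load, so the +8
 | | // offset can be selected as part of the load instruction.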
7002 | 25.4M | |
7003 | 25.4M | if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { |
7004 | 9.14k | unsigned AS = RMW->getPointerAddressSpace(); |
7005 | 9.14k | return optimizeMemoryInst(I, RMW->getPointerOperand(), |
7006 | 9.14k | RMW->getType(), AS); |
7007 | 9.14k | } |
7008 | 25.4M | |
7009 | 25.4M | if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) { |
7010 | 3.87k | unsigned AS = CmpX->getPointerAddressSpace(); |
7011 | 3.87k | return optimizeMemoryInst(I, CmpX->getPointerOperand(), |
7012 | 3.87k | CmpX->getCompareOperand()->getType(), AS); |
7013 | 3.87k | } |
7014 | 25.4M | |
7015 | 25.4M | BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I); |
7016 | 25.4M | |
7017 | 25.4M | if (BinOp && (BinOp->getOpcode() == Instruction::And)3.94M && |
7018 | 25.4M | EnableAndCmpSinking484k && TLI484k ) |
7019 | 484k | return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts); |
7020 | 24.9M | |
7021 | 24.9M | // TODO: Move this into the switch on opcode - it handles shifts already. |
7022 | 24.9M | if (BinOp && (3.45M BinOp->getOpcode() == Instruction::AShr3.45M || |
7023 | 3.45M | BinOp->getOpcode() == Instruction::LShr3.40M )) { |
7024 | 229k | ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)); |
7025 | 229k | if (TLI && CI229k && TLI->hasExtractBitsInsn()193k ) |
7026 | 138k | if (OptimizeExtractBits(BinOp, CI, *TLI, *DL)) |
7027 | 1.59k | return true; |
7028 | 24.9M | } |
7029 | 24.9M | |
7030 | 24.9M | if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { |
7031 | 6.51M | if (GEPI->hasAllZeroIndices()) { |
7032 | 174k | // The GEP operand must be a pointer, and so must its result -> BitCast.
7033 | 174k | Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(), |
7034 | 174k | GEPI->getName(), GEPI); |
7035 | 174k | NC->setDebugLoc(GEPI->getDebugLoc()); |
7036 | 174k | GEPI->replaceAllUsesWith(NC); |
7037 | 174k | GEPI->eraseFromParent(); |
7038 | 174k | ++NumGEPsElim; |
7039 | 174k | optimizeInst(NC, ModifiedDT); |
7040 | 174k | return true; |
7041 | 174k | } |
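 | | // Illustrative example (not part of the original source): with all-zero
 | | // indices the GEP adds no offset, so
 | | //
 | | //   %q = getelementptr [4 x i32], [4 x i32]* %p, i64 0, i64 0
 | | //
 | | // is replaced by the equivalent
 | | //
 | | //   %q = bitcast [4 x i32]* %p to i32*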
7042 | 6.33M | if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) { |
7043 | 1 | return true; |
7044 | 1 | } |
7045 | 6.33M | return false; |
7046 | 6.33M | } |
7047 | 18.4M | |
7048 | 18.4M | if (tryToSinkFreeOperands(I)) |
7049 | 18 | return true; |
7050 | 18.4M | |
7051 | 18.4M | switch (I->getOpcode()) { |
7052 | 18.4M | case Instruction::Shl: |
7053 | 485k | case Instruction::LShr: |
7054 | 485k | case Instruction::AShr: |
7055 | 485k | return optimizeShiftInst(cast<BinaryOperator>(I)); |
7056 | 4.03M | case Instruction::Call: |
7057 | 4.03M | return optimizeCallInst(cast<CallInst>(I), ModifiedDT); |
7058 | 485k | case Instruction::Select: |
7059 | 303k | return optimizeSelectInst(cast<SelectInst>(I)); |
7060 | 485k | case Instruction::ShuffleVector: |
7061 | 230k | return optimizeShuffleVectorInst(cast<ShuffleVectorInst>(I)); |
7062 | 485k | case Instruction::Switch: |
7063 | 39.3k | return optimizeSwitchInst(cast<SwitchInst>(I)); |
7064 | 485k | case Instruction::ExtractElement: |
7065 | 67.4k | return optimizeExtractElementInst(cast<ExtractElementInst>(I)); |
7066 | 13.3M | } |
7067 | 13.3M | |
7068 | 13.3M | return false; |
7069 | 13.3M | } |
7070 | | |
7071 | | /// Check whether the given instruction is the root of a bitreverse
7072 | | /// idiom. If so, insert the new intrinsic and return true.
7073 | | static bool makeBitReverse(Instruction &I, const DataLayout &DL, |
7074 | 41.2M | const TargetLowering &TLI) { |
7075 | 41.2M | if (!I.getType()->isIntegerTy() || |
7076 | 41.2M | !TLI.isOperationLegalOrCustom(ISD::BITREVERSE, |
7077 | 13.4M | TLI.getValueType(DL, I.getType(), true))) |
7078 | 33.5M | return false; |
7079 | 7.74M | |
7080 | 7.74M | SmallVector<Instruction*, 4> Insts; |
7081 | 7.74M | if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts)) |
7082 | 7.74M | return false; |
7083 | 8 | Instruction *LastInst = Insts.back(); |
7084 | 8 | I.replaceAllUsesWith(LastInst); |
7085 | 8 | RecursivelyDeleteTriviallyDeadInstructions(&I); |
7086 | 8 | return true; |
7087 | 8 | } |
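 | | // Illustrative sketch (not part of the original source): a hand-rolled bit
 | | // reversal built from shifts, masks, and ors, e.g.
 | | //
 | | //   %t0 = shl i8 %a, 7
 | | //   %t7 = lshr i8 %a, 7
 | | //   ...               ; shift/and/or steps for the remaining bits
 | | //   %r  = or i8 %hi, %lo
 | | //
 | | // is collapsed into a single call to llvm.bitreverse.i8 when the target
 | | // reports BITREVERSE as legal or custom for that type.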
7088 | | |
7089 | | // In this pass we look for GEP and cast instructions that are used |
7090 | | // across basic blocks and rewrite them to improve basic-block-at-a-time |
7091 | | // selection. |
7092 | 6.33M | bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) { |
7093 | 6.33M | SunkAddrs.clear(); |
7094 | 6.33M | bool MadeChange = false; |
7095 | 6.33M | |
7096 | 6.33M | CurInstIterator = BB.begin(); |
7097 | 47.1M | while (CurInstIterator != BB.end()) { |
7098 | 40.8M | MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT); |
7099 | 40.8M | if (ModifiedDT) |
7100 | 3.02k | return true; |
7101 | 40.8M | } |
7102 | 6.33M | |
7103 | 6.33M | bool MadeBitReverse = true; |
7104 | 12.6M | while (TLI && MadeBitReverse12.6M ) { |
7105 | 6.33M | MadeBitReverse = false; |
7106 | 41.2M | for (auto &I : reverse(BB)) { |
7107 | 41.2M | if (makeBitReverse(I, *DL, *TLI)) { |
7108 | 9 | MadeBitReverse = MadeChange = true; |
7109 | 9 | ModifiedDT = true; |
7110 | 9 | break; |
7111 | 9 | } |
7112 | 41.2M | } |
7113 | 6.33M | } |
7114 | 6.33M | MadeChange |= dupRetToEnableTailCallOpts(&BB, ModifiedDT); |
7115 | 6.33M | |
7116 | 6.33M | return MadeChange; |
7117 | 6.33M | } |
7118 | | |
7119 | | // If llvm.dbg.value is far away from the value, then ISel may not be
7120 | | // able to handle it properly. ISel will drop llvm.dbg.value if it cannot
7121 | | // find a node corresponding to the value.
7122 | 490k | bool CodeGenPrepare::placeDbgValues(Function &F) { |
7123 | 490k | bool MadeChange = false; |
7124 | 2.55M | for (BasicBlock &BB : F) { |
7125 | 2.55M | Instruction *PrevNonDbgInst = nullptr; |
7126 | 17.9M | for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { |
7127 | 15.3M | Instruction *Insn = &*BI++; |
7128 | 15.3M | DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn); |
7129 | 15.3M | // Leave dbg.values that refer to an alloca alone. These |
7130 | 15.3M | // intrinsics describe the address of a variable (= the alloca) |
7131 | 15.3M | // being taken. They should not be moved next to the alloca |
7132 | 15.3M | // (and to the beginning of the scope), but rather stay close to |
7133 | 15.3M | // where said address is used. |
7134 | 15.3M | if (!DVI || (4.98k DVI->getValue()4.98k && isa<AllocaInst>(DVI->getValue())4.98k )) { |
7135 | 15.3M | PrevNonDbgInst = Insn; |
7136 | 15.3M | continue; |
7137 | 15.3M | } |
7138 | 4.92k | |
7139 | 4.92k | Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue()); |
7140 | 4.92k | if (VI && VI != PrevNonDbgInst467 && !VI->isTerminator()91 ) { |
7141 | 91 | // If VI is a phi in a block with an EHPad terminator, we can't insert |
7142 | 91 | // after it. |
7143 | 91 | if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad()16 ) |
7144 | 1 | continue; |
7145 | 90 | LLVM_DEBUG(dbgs() << "Moving Debug Value before :\n" |
7146 | 90 | << *DVI << ' ' << *VI); |
7147 | 90 | DVI->removeFromParent(); |
7148 | 90 | if (isa<PHINode>(VI)) |
7149 | 15 | DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt()); |
7150 | 75 | else |
7151 | 75 | DVI->insertAfter(VI); |
7152 | 90 | MadeChange = true; |
7153 | 90 | ++NumDbgValueMoved; |
7154 | 90 | } |
7155 | 4.92k | } |
7156 | 2.55M | } |
7157 | 490k | return MadeChange; |
7158 | 490k | } |
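 | | // Illustrative example (not part of the original source): a dbg.value that
 | | // has drifted away from the instruction it describes, e.g.
 | | //
 | | //   %v = add i32 %a, %b
 | | //   ...               ; many unrelated instructions
 | | //   call void @llvm.dbg.value(metadata i32 %v, ...)
 | | //
 | | // is moved to just after the definition of %v so ISel can still find a
 | | // node for it.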
7159 | | |
7160 | | /// Scale down both weights to fit into uint32_t. |
7161 | 4 | static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) { |
7162 | 4 | uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue2 : NewFalse2 ; |
7163 | 4 | uint32_t Scale = (NewMax / std::numeric_limits<uint32_t>::max()) + 1; |
7164 | 4 | NewTrue = NewTrue / Scale; |
7165 | 4 | NewFalse = NewFalse / Scale; |
7166 | 4 | } |
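 | | // Worked example (not part of the original source): with NewTrue = 2^33
 | | // and NewFalse = 2^31, NewMax = 2^33 and Scale = 2^33 / (2^32 - 1) + 1 = 3,
 | | // so the weights become 2^33 / 3 and 2^31 / 3. Both now fit in uint32_t
 | | // and their 4:1 ratio is preserved.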
7167 | | |
7168 | | /// Some targets prefer to split a conditional branch like: |
7169 | | /// \code |
7170 | | /// %0 = icmp ne i32 %a, 0 |
7171 | | /// %1 = icmp ne i32 %b, 0 |
7172 | | /// %or.cond = or i1 %0, %1 |
7173 | | /// br i1 %or.cond, label %TrueBB, label %FalseBB |
7174 | | /// \endcode |
7175 | | /// into multiple branch instructions like: |
7176 | | /// \code |
7177 | | /// bb1: |
7178 | | /// %0 = icmp ne i32 %a, 0 |
7179 | | /// br i1 %0, label %TrueBB, label %bb2 |
7180 | | /// bb2: |
7181 | | /// %1 = icmp ne i32 %b, 0 |
7182 | | /// br i1 %1, label %TrueBB, label %FalseBB |
7183 | | /// \endcode |
7184 | | /// This usually allows instruction selection to do even further optimizations |
7185 | | /// and combine the compare with the branch instruction. Currently this is |
7186 | | /// applied for targets which have "cheap" jump instructions. |
7187 | | /// |
7188 | | /// FIXME: Remove the (equivalent?) implementation in SelectionDAG. |
7189 | | /// |
7190 | 490k | bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) { |
7191 | 490k | if (!TM || !TM->Options.EnableFastISel490k || !TLI8.24k || TLI->isJumpExpensive()8.24k ) |
7192 | 482k | return false; |
7193 | 8.22k | |
7194 | 8.22k | bool MadeChange = false; |
7195 | 8.64k | for (auto &BB : F) { |
7196 | 8.64k | // Does this BB end with the following? |
7197 | 8.64k | // %cond1 = icmp|fcmp|binary instruction ... |
7198 | 8.64k | // %cond2 = icmp|fcmp|binary instruction ... |
7199 | 8.64k | // %cond.or = or|and i1 %cond1, %cond2
7200 | 8.64k | // br i1 %cond.or, label %dest1, label %dest2
7201 | 8.64k | BinaryOperator *LogicOp; |
7202 | 8.64k | BasicBlock *TBB, *FBB; |
7203 | 8.64k | if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB))) |
7204 | 8.63k | continue; |
7205 | 6 | |
7206 | 6 | auto *Br1 = cast<BranchInst>(BB.getTerminator()); |
7207 | 6 | if (Br1->getMetadata(LLVMContext::MD_unpredictable)) |
7208 | 2 | continue; |
7209 | 4 | |
7210 | 4 | unsigned Opc; |
7211 | 4 | Value *Cond1, *Cond2; |
7212 | 4 | if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)), |
7213 | 4 | m_OneUse(m_Value(Cond2))))) |
7214 | 2 | Opc = Instruction::And; |
7215 | 2 | else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)), |
7216 | 2 | m_OneUse(m_Value(Cond2))))) |
7217 | 1 | Opc = Instruction::Or; |
7218 | 1 | else |
7219 | 1 | continue; |
7220 | 3 | |
7221 | 3 | if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) || |
7222 | 3 | !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp())) ) |
7223 | 0 | continue; |
7224 | 3 | |
7225 | 3 | LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump()); |
7226 | 3 | |
7227 | 3 | // Create a new BB. |
7228 | 3 | auto TmpBB = |
7229 | 3 | BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split", |
7230 | 3 | BB.getParent(), BB.getNextNode()); |
7231 | 3 | |
7232 | 3 | // Update the original basic block: use the first condition directly in the
7233 | 3 | // branch instruction and remove the no-longer-needed and/or instruction.
7234 | 3 | Br1->setCondition(Cond1); |
7235 | 3 | LogicOp->eraseFromParent(); |
7236 | 3 | |
7237 | 3 | // Depending on the condition we have to either replace the true or the |
7238 | 3 | // false successor of the original branch instruction. |
7239 | 3 | if (Opc == Instruction::And) |
7240 | 2 | Br1->setSuccessor(0, TmpBB); |
7241 | 1 | else |
7242 | 1 | Br1->setSuccessor(1, TmpBB); |
7243 | 3 | |
7244 | 3 | // Fill in the new basic block. |
7245 | 3 | auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB); |
7246 | 3 | if (auto *I = dyn_cast<Instruction>(Cond2)) { |
7247 | 3 | I->removeFromParent(); |
7248 | 3 | I->insertBefore(Br2); |
7249 | 3 | } |
7250 | 3 | |
7251 | 3 | // Update PHI nodes in both successors. The original BB needs to be |
7252 | 3 | // replaced in one successor's PHI nodes, because the branch now comes from
7253 | 3 | // the newly generated BB (TmpBB). In the other successor we need to add one
7254 | 3 | // incoming edge to the PHI nodes, because both branch instructions now
7255 | 3 | // target the same successor. Depending on the original branch condition
7256 | 3 | // (and/or) we have to swap the successors (TrueDest, FalseDest), so that |
7257 | 3 | // we perform the correct update for the PHI nodes. |
7258 | 3 | // This doesn't change the successor order of the just created branch |
7259 | 3 | // instruction (or any other instruction). |
7260 | 3 | if (Opc == Instruction::Or) |
7261 | 1 | std::swap(TBB, FBB); |
7262 | 3 | |
7263 | 3 | // Replace the old BB with the new BB. |
7264 | 3 | TBB->replacePhiUsesWith(&BB, TmpBB); |
7265 | 3 | |
7266 | 3 | // Add another incoming edge from the new BB.
7267 | 3 | for (PHINode &PN : FBB->phis()) { |
7268 | 0 | auto *Val = PN.getIncomingValueForBlock(&BB); |
7269 | 0 | PN.addIncoming(Val, TmpBB); |
7270 | 0 | } |
7271 | 3 | |
7272 | 3 | // Update the branch weights (from SelectionDAGBuilder:: |
7273 | 3 | // FindMergedConditions). |
7274 | 3 | if (Opc == Instruction::Or) { |
7275 | 1 | // Codegen X | Y as: |
7276 | 1 | // BB1: |
7277 | 1 | // jmp_if_X TBB |
7278 | 1 | // jmp TmpBB |
7279 | 1 | // TmpBB: |
7280 | 1 | // jmp_if_Y TBB |
7281 | 1 | // jmp FBB |
7282 | 1 | // |
7283 | 1 | |
7284 | 1 | // We have flexibility in setting Prob for BB1 and Prob for NewBB. |
7285 | 1 | // The requirement is that |
7286 | 1 | // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB) |
7287 | 1 | // = TrueProb for original BB. |
7288 | 1 | // Assuming the original weights are A and B, one choice is to set BB1's |
7289 | 1 | // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice |
7290 | 1 | // assumes that |
7291 | 1 | // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB. |
7292 | 1 | // Another choice is to assume TrueProb for BB1 equals to TrueProb for |
7293 | 1 | // TmpBB, but the math is more complicated. |
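 | | // Worked example (not part of the original source): with original weights
 | | // A = 3 (true) and B = 1 (false), BB1 gets weights 3 and 3 + 2*1 = 5 and
 | | // TmpBB gets 3 and 2*1 = 2. Then TrueProb(BB1) + FalseProb(BB1) *
 | | // TrueProb(TmpBB) = 3/8 + (5/8)*(3/5) = 6/8 = A/(A+B), as required.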
7294 | 1 | uint64_t TrueWeight, FalseWeight; |
7295 | 1 | if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) { |
7296 | 1 | uint64_t NewTrueWeight = TrueWeight; |
7297 | 1 | uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight; |
7298 | 1 | scaleWeights(NewTrueWeight, NewFalseWeight); |
7299 | 1 | Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext()) |
7300 | 1 | .createBranchWeights(NewTrueWeight, NewFalseWeight));
7301 | 1 | |
7302 | 1 | NewTrueWeight = TrueWeight; |
7303 | 1 | NewFalseWeight = 2 * FalseWeight; |
7304 | 1 | scaleWeights(NewTrueWeight, NewFalseWeight); |
7305 | 1 | Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext()) |
7306 | 1 | .createBranchWeights(NewTrueWeight, NewFalseWeight));
7307 | 1 | } |
7308 | 2 | } else { |
7309 | 2 | // Codegen X & Y as: |
7310 | 2 | // BB1: |
7311 | 2 | // jmp_if_X TmpBB |
7312 | 2 | // jmp FBB |
7313 | 2 | // TmpBB: |
7314 | 2 | // jmp_if_Y TBB |
7315 | 2 | // jmp FBB |
7316 | 2 | // |
7317 | 2 | // This requires creation of TmpBB after CurBB. |
7318 | 2 | |
7319 | 2 | // We have flexibility in setting Prob for BB1 and Prob for TmpBB. |
7320 | 2 | // The requirement is that |
7321 | 2 | // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB) |
7322 | 2 | // = FalseProb for original BB. |
7323 | 2 | // Assuming the original weights are A and B, one choice is to set BB1's |
7324 | 2 | // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice |
7325 | 2 | // assumes that |
7326 | 2 | // FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB. |
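 | | // Worked example (not part of the original source): with original weights
 | | // A = 3 (true) and B = 1 (false), BB1 gets weights 2*3 + 1 = 7 and 1 and
 | | // TmpBB gets 2*3 = 6 and 1. Then FalseProb(BB1) + TrueProb(BB1) *
 | | // FalseProb(TmpBB) = 1/8 + (7/8)*(1/7) = 2/8 = B/(A+B), as required.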
7327 | 2 | uint64_t TrueWeight, FalseWeight; |
7328 | 2 | if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) { |
7329 | 1 | uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight; |
7330 | 1 | uint64_t NewFalseWeight = FalseWeight; |
7331 | 1 | scaleWeights(NewTrueWeight, NewFalseWeight); |
7332 | 1 | Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext()) |
7333 | 1 | .createBranchWeights(NewTrueWeight, NewFalseWeight));
7334 | 1 | |
7335 | 1 | NewTrueWeight = 2 * TrueWeight; |
7336 | 1 | NewFalseWeight = FalseWeight; |
7337 | 1 | scaleWeights(NewTrueWeight, NewFalseWeight); |
7338 | 1 | Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext()) |
7339 | 1 | .createBranchWeights(NewTrueWeight, NewFalseWeight));
7340 | 1 | } |
7341 | 2 | } |
7342 | 3 | |
7343 | 3 | ModifiedDT = true; |
7344 | 3 | MadeChange = true; |
7345 | 3 | |
7346 | 3 | LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump(); |
7347 | 3 | TmpBB->dump()); |
7348 | 3 | } |
7349 | 8.22k | return MadeChange; |
7350 | 8.22k | } |