/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops |
10 | | // and generates target-independent LLVM-IR. |
11 | | // The vectorizer uses the TargetTransformInfo analysis to estimate the costs |
12 | | // of instructions in order to estimate the profitability of vectorization. |
13 | | // |
14 | | // The loop vectorizer combines consecutive loop iterations into a single |
15 | | // 'wide' iteration. After this transformation the index is incremented |
16 | | // by the SIMD vector width, and not by one. |
17 | | // |
18 | | // This pass has three parts: |
19 | | // 1. The main loop pass that drives the different parts. |
20 | | // 2. LoopVectorizationLegality - A unit that checks for the legality |
21 | | // of the vectorization. |
22 | | // 3. InnerLoopVectorizer - A unit that performs the actual |
23 | | // widening of instructions. |
24 | | // 4. LoopVectorizationCostModel - A unit that checks for the profitability |
25 | | // of vectorization. It decides on the optimal vector width, which |
26 | | // can be one, if vectorization is not profitable. |
27 | | // |
28 | | // There is a development effort going on to migrate loop vectorizer to the |
29 | | // VPlan infrastructure and to introduce outer loop vectorization support (see |
30 | | // docs/Proposal/VectorizationPlan.rst and |
31 | | // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this |
32 | | // purpose, we temporarily introduced the VPlan-native vectorization path: an |
33 | | // alternative vectorization path that is natively implemented on top of the |
34 | | // VPlan infrastructure. See EnableVPlanNativePath for enabling. |
35 | | // |
36 | | //===----------------------------------------------------------------------===// |
37 | | // |
38 | | // The reduction-variable vectorization is based on the paper: |
39 | | // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. |
40 | | // |
41 | | // Variable uniformity checks are inspired by: |
42 | | // Karrenberg, R. and Hack, S. Whole Function Vectorization. |
43 | | // |
44 | | // The interleaved access vectorization is based on the paper: |
45 | | // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved |
46 | | // Data for SIMD |
47 | | // |
48 | | // Other ideas/concepts are from: |
49 | | // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. |
50 | | // |
51 | | // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of |
52 | | // Vectorizing Compilers. |
53 | | // |
54 | | //===----------------------------------------------------------------------===// |
55 | | |
56 | | #include "llvm/Transforms/Vectorize/LoopVectorize.h" |
57 | | #include "LoopVectorizationPlanner.h" |
58 | | #include "VPRecipeBuilder.h" |
59 | | #include "VPlan.h" |
60 | | #include "VPlanHCFGBuilder.h" |
61 | | #include "VPlanHCFGTransforms.h" |
62 | | #include "VPlanPredicator.h" |
63 | | #include "llvm/ADT/APInt.h" |
64 | | #include "llvm/ADT/ArrayRef.h" |
65 | | #include "llvm/ADT/DenseMap.h" |
66 | | #include "llvm/ADT/DenseMapInfo.h" |
67 | | #include "llvm/ADT/Hashing.h" |
68 | | #include "llvm/ADT/MapVector.h" |
69 | | #include "llvm/ADT/None.h" |
70 | | #include "llvm/ADT/Optional.h" |
71 | | #include "llvm/ADT/STLExtras.h" |
72 | | #include "llvm/ADT/SetVector.h" |
73 | | #include "llvm/ADT/SmallPtrSet.h" |
74 | | #include "llvm/ADT/SmallVector.h" |
75 | | #include "llvm/ADT/Statistic.h" |
76 | | #include "llvm/ADT/StringRef.h" |
77 | | #include "llvm/ADT/Twine.h" |
78 | | #include "llvm/ADT/iterator_range.h" |
79 | | #include "llvm/Analysis/AssumptionCache.h" |
80 | | #include "llvm/Analysis/BasicAliasAnalysis.h" |
81 | | #include "llvm/Analysis/BlockFrequencyInfo.h" |
82 | | #include "llvm/Analysis/CFG.h" |
83 | | #include "llvm/Analysis/CodeMetrics.h" |
84 | | #include "llvm/Analysis/DemandedBits.h" |
85 | | #include "llvm/Analysis/GlobalsModRef.h" |
86 | | #include "llvm/Analysis/LoopAccessAnalysis.h" |
87 | | #include "llvm/Analysis/LoopAnalysisManager.h" |
88 | | #include "llvm/Analysis/LoopInfo.h" |
89 | | #include "llvm/Analysis/LoopIterator.h" |
90 | | #include "llvm/Analysis/MemorySSA.h" |
91 | | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
92 | | #include "llvm/Analysis/ProfileSummaryInfo.h" |
93 | | #include "llvm/Analysis/ScalarEvolution.h" |
94 | | #include "llvm/Analysis/ScalarEvolutionExpander.h" |
95 | | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
96 | | #include "llvm/Analysis/TargetLibraryInfo.h" |
97 | | #include "llvm/Analysis/TargetTransformInfo.h" |
98 | | #include "llvm/Analysis/VectorUtils.h" |
99 | | #include "llvm/IR/Attributes.h" |
100 | | #include "llvm/IR/BasicBlock.h" |
101 | | #include "llvm/IR/CFG.h" |
102 | | #include "llvm/IR/Constant.h" |
103 | | #include "llvm/IR/Constants.h" |
104 | | #include "llvm/IR/DataLayout.h" |
105 | | #include "llvm/IR/DebugInfoMetadata.h" |
106 | | #include "llvm/IR/DebugLoc.h" |
107 | | #include "llvm/IR/DerivedTypes.h" |
108 | | #include "llvm/IR/DiagnosticInfo.h" |
109 | | #include "llvm/IR/Dominators.h" |
110 | | #include "llvm/IR/Function.h" |
111 | | #include "llvm/IR/IRBuilder.h" |
112 | | #include "llvm/IR/InstrTypes.h" |
113 | | #include "llvm/IR/Instruction.h" |
114 | | #include "llvm/IR/Instructions.h" |
115 | | #include "llvm/IR/IntrinsicInst.h" |
116 | | #include "llvm/IR/Intrinsics.h" |
117 | | #include "llvm/IR/LLVMContext.h" |
118 | | #include "llvm/IR/Metadata.h" |
119 | | #include "llvm/IR/Module.h" |
120 | | #include "llvm/IR/Operator.h" |
121 | | #include "llvm/IR/Type.h" |
122 | | #include "llvm/IR/Use.h" |
123 | | #include "llvm/IR/User.h" |
124 | | #include "llvm/IR/Value.h" |
125 | | #include "llvm/IR/ValueHandle.h" |
126 | | #include "llvm/IR/Verifier.h" |
127 | | #include "llvm/Pass.h" |
128 | | #include "llvm/Support/Casting.h" |
129 | | #include "llvm/Support/CommandLine.h" |
130 | | #include "llvm/Support/Compiler.h" |
131 | | #include "llvm/Support/Debug.h" |
132 | | #include "llvm/Support/ErrorHandling.h" |
133 | | #include "llvm/Support/MathExtras.h" |
134 | | #include "llvm/Support/raw_ostream.h" |
135 | | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
136 | | #include "llvm/Transforms/Utils/LoopSimplify.h" |
137 | | #include "llvm/Transforms/Utils/LoopUtils.h" |
138 | | #include "llvm/Transforms/Utils/LoopVersioning.h" |
139 | | #include "llvm/Transforms/Utils/SizeOpts.h" |
140 | | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
141 | | #include <algorithm> |
142 | | #include <cassert> |
143 | | #include <cstdint> |
144 | | #include <cstdlib> |
145 | | #include <functional> |
146 | | #include <iterator> |
147 | | #include <limits> |
148 | | #include <memory> |
149 | | #include <string> |
150 | | #include <tuple> |
151 | | #include <utility> |
152 | | #include <vector> |
153 | | |
154 | | using namespace llvm; |
155 | | |
156 | 24 | #define LV_NAME "loop-vectorize" |
157 | | #define DEBUG_TYPE LV_NAME |
158 | | |
159 | | /// @{ |
160 | | /// Metadata attribute names |
161 | | static const char *const LLVMLoopVectorizeFollowupAll = |
162 | | "llvm.loop.vectorize.followup_all"; |
163 | | static const char *const LLVMLoopVectorizeFollowupVectorized = |
164 | | "llvm.loop.vectorize.followup_vectorized"; |
165 | | static const char *const LLVMLoopVectorizeFollowupEpilogue = |
166 | | "llvm.loop.vectorize.followup_epilogue"; |
167 | | /// @} |
168 | | |
169 | | STATISTIC(LoopsVectorized, "Number of loops vectorized"); |
170 | | STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); |
171 | | |
172 | | /// Loops with a known constant trip count below this number are vectorized only |
173 | | /// if no scalar iteration overheads are incurred. |
174 | | static cl::opt<unsigned> TinyTripCountVectorThreshold( |
175 | | "vectorizer-min-trip-count", cl::init(16), cl::Hidden, |
176 | | cl::desc("Loops with a constant trip count that is smaller than this " |
177 | | "value are vectorized only if no scalar iteration overheads " |
178 | | "are incurred.")); |
179 | | |
180 | | static cl::opt<bool> MaximizeBandwidth( |
181 | | "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, |
182 | | cl::desc("Maximize bandwidth when selecting vectorization factor which " |
183 | | "will be determined by the smallest type in loop.")); |
184 | | |
185 | | static cl::opt<bool> EnableInterleavedMemAccesses( |
186 | | "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, |
187 | | cl::desc("Enable vectorization on interleaved memory accesses in a loop")); |
188 | | |
189 | | /// An interleave-group may need masking if it resides in a block that needs |
190 | | /// predication, or in order to mask away gaps. |
191 | | static cl::opt<bool> EnableMaskedInterleavedMemAccesses( |
192 | | "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, |
193 | | cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); |
194 | | |
195 | | /// We don't interleave loops with a known constant trip count below this |
196 | | /// number. |
197 | | static const unsigned TinyTripCountInterleaveThreshold = 128; |
198 | | |
199 | | static cl::opt<unsigned> ForceTargetNumScalarRegs( |
200 | | "force-target-num-scalar-regs", cl::init(0), cl::Hidden, |
201 | | cl::desc("A flag that overrides the target's number of scalar registers.")); |
202 | | |
203 | | static cl::opt<unsigned> ForceTargetNumVectorRegs( |
204 | | "force-target-num-vector-regs", cl::init(0), cl::Hidden, |
205 | | cl::desc("A flag that overrides the target's number of vector registers.")); |
206 | | |
207 | | static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( |
208 | | "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, |
209 | | cl::desc("A flag that overrides the target's max interleave factor for " |
210 | | "scalar loops.")); |
211 | | |
212 | | static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( |
213 | | "force-target-max-vector-interleave", cl::init(0), cl::Hidden, |
214 | | cl::desc("A flag that overrides the target's max interleave factor for " |
215 | | "vectorized loops.")); |
216 | | |
217 | | static cl::opt<unsigned> ForceTargetInstructionCost( |
218 | | "force-target-instruction-cost", cl::init(0), cl::Hidden, |
219 | | cl::desc("A flag that overrides the target's expected cost for " |
220 | | "an instruction to a single constant value. Mostly " |
221 | | "useful for getting consistent testing.")); |
222 | | |
223 | | static cl::opt<unsigned> SmallLoopCost( |
224 | | "small-loop-cost", cl::init(20), cl::Hidden, |
225 | | cl::desc( |
226 | | "The cost of a loop that is considered 'small' by the interleaver.")); |
227 | | |
228 | | static cl::opt<bool> LoopVectorizeWithBlockFrequency( |
229 | | "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, |
230 | | cl::desc("Enable the use of the block frequency analysis to access PGO " |
231 | | "heuristics minimizing code growth in cold regions and being more " |
232 | | "aggressive in hot regions.")); |
233 | | |
234 | | // Runtime interleave loops for load/store throughput. |
235 | | static cl::opt<bool> EnableLoadStoreRuntimeInterleave( |
236 | | "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, |
237 | | cl::desc( |
238 | | "Enable runtime interleaving until load/store ports are saturated")); |
239 | | |
240 | | /// The number of stores in a loop that are allowed to need predication. |
241 | | static cl::opt<unsigned> NumberOfStoresToPredicate( |
242 | | "vectorize-num-stores-pred", cl::init(1), cl::Hidden, |
243 | | cl::desc("Max number of stores to be predicated behind an if.")); |
244 | | |
245 | | static cl::opt<bool> EnableIndVarRegisterHeur( |
246 | | "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, |
247 | | cl::desc("Count the induction variable only once when interleaving")); |
248 | | |
249 | | static cl::opt<bool> EnableCondStoresVectorization( |
250 | | "enable-cond-stores-vec", cl::init(true), cl::Hidden, |
251 | | cl::desc("Enable if predication of stores during vectorization.")); |
252 | | |
253 | | static cl::opt<unsigned> MaxNestedScalarReductionIC( |
254 | | "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, |
255 | | cl::desc("The maximum interleave count to use when interleaving a scalar " |
256 | | "reduction in a nested loop.")); |
257 | | |
258 | | cl::opt<bool> EnableVPlanNativePath( |
259 | | "enable-vplan-native-path", cl::init(false), cl::Hidden, |
260 | | cl::desc("Enable VPlan-native vectorization path with " |
261 | | "support for outer loop vectorization.")); |
262 | | |
263 | | // FIXME: Remove this switch once we have divergence analysis. Currently we |
264 | | // assume divergent non-backedge branches when this switch is true. |
265 | | cl::opt<bool> EnableVPlanPredication( |
266 | | "enable-vplan-predication", cl::init(false), cl::Hidden, |
267 | | cl::desc("Enable VPlan-native vectorization path predicator with " |
268 | | "support for outer loop vectorization.")); |
269 | | |
270 | | // This flag enables the stress testing of the VPlan H-CFG construction in the |
271 | | // VPlan-native vectorization path. It must be used in conjuction with |
272 | | // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the |
273 | | // verification of the H-CFGs built. |
274 | | static cl::opt<bool> VPlanBuildStressTest( |
275 | | "vplan-build-stress-test", cl::init(false), cl::Hidden, |
276 | | cl::desc( |
277 | | "Build VPlan for every supported loop nest in the function and bail " |
278 | | "out right after the build (stress test the VPlan H-CFG construction " |
279 | | "in the VPlan-native vectorization path).")); |
280 | | |
281 | | cl::opt<bool> llvm::EnableLoopInterleaving( |
282 | | "interleave-loops", cl::init(true), cl::Hidden, |
283 | | cl::desc("Enable loop interleaving in Loop vectorization passes")); |
284 | | cl::opt<bool> llvm::EnableLoopVectorization( |
285 | | "vectorize-loops", cl::init(true), cl::Hidden, |
286 | | cl::desc("Run the Loop vectorization passes")); |
287 | | |
288 | | /// A helper function for converting Scalar types to vector types. |
289 | | /// If the incoming type is void, we return void. If the VF is 1, we return |
290 | | /// the scalar type. |
291 | 633k | static Type *ToVectorTy(Type *Scalar, unsigned VF) { |
292 | 633k | if (Scalar->isVoidTy() || VF == 1558k ) |
293 | 226k | return Scalar; |
294 | 406k | return VectorType::get(Scalar, VF); |
295 | 406k | } |
296 | | |
297 | | /// A helper function that returns the type of loaded or stored value. |
298 | 260k | static Type *getMemInstValueType(Value *I) { |
299 | 260k | assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && |
300 | 260k | "Expected Load or Store instruction"); |
301 | 260k | if (auto *LI = dyn_cast<LoadInst>(I)) |
302 | 125k | return LI->getType(); |
303 | 134k | return cast<StoreInst>(I)->getValueOperand()->getType(); |
304 | 134k | } |
305 | | |
306 | | /// A helper function that returns true if the given type is irregular. The |
307 | | /// type is irregular if its allocated size doesn't equal the store size of an |
308 | | /// element of the corresponding vector type at the given vectorization factor. |
309 | 45.6k | static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { |
310 | 45.6k | // Determine if an array of VF elements of type Ty is "bitcast compatible" |
311 | 45.6k | // with a <VF x Ty> vector. |
312 | 45.6k | if (VF > 1) { |
313 | 45.6k | auto *VectorTy = VectorType::get(Ty, VF); |
314 | 45.6k | return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); |
315 | 45.6k | } |
316 | 0 | |
317 | 0 | // If the vectorization factor is one, we just check if an array of type Ty |
318 | 0 | // requires padding between elements. |
319 | 0 | return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); |
320 | 0 | } |
321 | | |
322 | | /// A helper function that returns the reciprocal of the block probability of |
323 | | /// predicated blocks. If we return X, we are assuming the predicated block |
324 | | /// will execute once for every X iterations of the loop header. |
325 | | /// |
326 | | /// TODO: We should use actual block probability here, if available. Currently, |
327 | | /// we always assume predicated blocks have a 50% chance of executing. |
328 | 4.29k | static unsigned getReciprocalPredBlockProb() { return 2; } |
329 | | |
330 | | /// A helper function that adds a 'fast' flag to floating-point operations. |
331 | 151k | static Value *addFastMathFlag(Value *V) { |
332 | 151k | if (isa<FPMathOperator>(V)) |
333 | 68 | cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast()); |
334 | 151k | return V; |
335 | 151k | } |
336 | | |
337 | 1.19k | static Value *addFastMathFlag(Value *V, FastMathFlags FMF) { |
338 | 1.19k | if (isa<FPMathOperator>(V)) |
339 | 47 | cast<Instruction>(V)->setFastMathFlags(FMF); |
340 | 1.19k | return V; |
341 | 1.19k | } |
342 | | |
343 | | /// A helper function that returns an integer or floating-point constant with |
344 | | /// value C. |
345 | 70.4k | static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { |
346 | 70.4k | return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)70.4k |
347 | 70.4k | : ConstantFP::get(Ty, C)34 ; |
348 | 70.4k | } |
349 | | |
350 | | namespace llvm { |
351 | | |
352 | | /// InnerLoopVectorizer vectorizes loops which contain only one basic |
353 | | /// block to a specified vectorization factor (VF). |
354 | | /// This class performs the widening of scalars into vectors, or multiple |
355 | | /// scalars. This class also implements the following features: |
356 | | /// * It inserts an epilogue loop for handling loops that don't have iteration |
357 | | /// counts that are known to be a multiple of the vectorization factor. |
358 | | /// * It handles the code generation for reduction variables. |
359 | | /// * Scalarization (implementation using scalars) of un-vectorizable |
360 | | /// instructions. |
361 | | /// InnerLoopVectorizer does not perform any vectorization-legality |
362 | | /// checks, and relies on the caller to check for the different legality |
363 | | /// aspects. The InnerLoopVectorizer relies on the |
364 | | /// LoopVectorizationLegality class to provide information about the induction |
365 | | /// and reduction variables that were found to a given vectorization factor. |
366 | | class InnerLoopVectorizer { |
367 | | public: |
368 | | InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
369 | | LoopInfo *LI, DominatorTree *DT, |
370 | | const TargetLibraryInfo *TLI, |
371 | | const TargetTransformInfo *TTI, AssumptionCache *AC, |
372 | | OptimizationRemarkEmitter *ORE, unsigned VecWidth, |
373 | | unsigned UnrollFactor, LoopVectorizationLegality *LVL, |
374 | | LoopVectorizationCostModel *CM) |
375 | | : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), |
376 | | AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), |
377 | | Builder(PSE.getSE()->getContext()), |
378 | 17.0k | VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {} |
379 | 17.0k | virtual ~InnerLoopVectorizer() = default; |
380 | | |
381 | | /// Create a new empty loop. Unlink the old loop and connect the new one. |
382 | | /// Return the pre-header block of the new loop. |
383 | | BasicBlock *createVectorizedLoopSkeleton(); |
384 | | |
385 | | /// Widen a single instruction within the innermost loop. |
386 | | void widenInstruction(Instruction &I); |
387 | | |
388 | | /// Fix the vectorized code, taking care of header phi's, live-outs, and more. |
389 | | void fixVectorizedLoop(); |
390 | | |
391 | | // Return true if any runtime check is added. |
392 | 15.3k | bool areSafetyChecksAdded() { return AddedSafetyChecks; } |
393 | | |
394 | | /// A type for vectorized values in the new loop. Each value from the |
395 | | /// original loop, when vectorized, is represented by UF vector values in the |
396 | | /// new unrolled loop, where UF is the unroll factor. |
397 | | using VectorParts = SmallVector<Value *, 2>; |
398 | | |
399 | | /// Vectorize a single PHINode in a block. This method handles the induction |
400 | | /// variable canonicalization. It supports both VF = 1 for unrolled loops and |
401 | | /// arbitrary length vectors. |
402 | | void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); |
403 | | |
404 | | /// A helper function to scalarize a single Instruction in the innermost loop. |
405 | | /// Generates a sequence of scalar instances for each lane between \p MinLane |
406 | | /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, |
407 | | /// inclusive.. |
408 | | void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance, |
409 | | bool IfPredicateInstr); |
410 | | |
411 | | /// Widen an integer or floating-point induction variable \p IV. If \p Trunc |
412 | | /// is provided, the integer induction variable will first be truncated to |
413 | | /// the corresponding type. |
414 | | void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); |
415 | | |
416 | | /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a |
417 | | /// vector or scalar value on-demand if one is not yet available. When |
418 | | /// vectorizing a loop, we visit the definition of an instruction before its |
419 | | /// uses. When visiting the definition, we either vectorize or scalarize the |
420 | | /// instruction, creating an entry for it in the corresponding map. (In some |
421 | | /// cases, such as induction variables, we will create both vector and scalar |
422 | | /// entries.) Then, as we encounter uses of the definition, we derive values |
423 | | /// for each scalar or vector use unless such a value is already available. |
424 | | /// For example, if we scalarize a definition and one of its uses is vector, |
425 | | /// we build the required vector on-demand with an insertelement sequence |
426 | | /// when visiting the use. Otherwise, if the use is scalar, we can use the |
427 | | /// existing scalar definition. |
428 | | /// |
429 | | /// Return a value in the new loop corresponding to \p V from the original |
430 | | /// loop at unroll index \p Part. If the value has already been vectorized, |
431 | | /// the corresponding vector entry in VectorLoopValueMap is returned. If, |
432 | | /// however, the value has a scalar entry in VectorLoopValueMap, we construct |
433 | | /// a new vector value on-demand by inserting the scalar values into a vector |
434 | | /// with an insertelement sequence. If the value has been neither vectorized |
435 | | /// nor scalarized, it must be loop invariant, so we simply broadcast the |
436 | | /// value into a vector. |
437 | | Value *getOrCreateVectorValue(Value *V, unsigned Part); |
438 | | |
439 | | /// Return a value in the new loop corresponding to \p V from the original |
440 | | /// loop at unroll and vector indices \p Instance. If the value has been |
441 | | /// vectorized but not scalarized, the necessary extractelement instruction |
442 | | /// will be generated. |
443 | | Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); |
444 | | |
445 | | /// Construct the vector value of a scalarized value \p V one lane at a time. |
446 | | void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); |
447 | | |
448 | | /// Try to vectorize the interleaved access group that \p Instr belongs to, |
449 | | /// optionally masking the vector operations if \p BlockInMask is non-null. |
450 | | void vectorizeInterleaveGroup(Instruction *Instr, |
451 | | VectorParts *BlockInMask = nullptr); |
452 | | |
453 | | /// Vectorize Load and Store instructions, optionally masking the vector |
454 | | /// operations if \p BlockInMask is non-null. |
455 | | void vectorizeMemoryInstruction(Instruction *Instr, |
456 | | VectorParts *BlockInMask = nullptr); |
457 | | |
458 | | /// Set the debug location in the builder using the debug location in |
459 | | /// the instruction. |
460 | | void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); |
461 | | |
462 | | /// Fix the non-induction PHIs in the OrigPHIsToFix vector. |
463 | | void fixNonInductionPHIs(void); |
464 | | |
465 | | protected: |
466 | | friend class LoopVectorizationPlanner; |
467 | | |
468 | | /// A small list of PHINodes. |
469 | | using PhiVector = SmallVector<PHINode *, 4>; |
470 | | |
471 | | /// A type for scalarized values in the new loop. Each value from the |
472 | | /// original loop, when scalarized, is represented by UF x VF scalar values |
473 | | /// in the new unrolled loop, where UF is the unroll factor and VF is the |
474 | | /// vectorization factor. |
475 | | using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; |
476 | | |
477 | | /// Set up the values of the IVs correctly when exiting the vector loop. |
478 | | void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, |
479 | | Value *CountRoundDown, Value *EndValue, |
480 | | BasicBlock *MiddleBlock); |
481 | | |
482 | | /// Create a new induction variable inside L. |
483 | | PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, |
484 | | Value *Step, Instruction *DL); |
485 | | |
486 | | /// Handle all cross-iteration phis in the header. |
487 | | void fixCrossIterationPHIs(); |
488 | | |
489 | | /// Fix a first-order recurrence. This is the second phase of vectorizing |
490 | | /// this phi node. |
491 | | void fixFirstOrderRecurrence(PHINode *Phi); |
492 | | |
493 | | /// Fix a reduction cross-iteration phi. This is the second phase of |
494 | | /// vectorizing this phi node. |
495 | | void fixReduction(PHINode *Phi); |
496 | | |
497 | | /// The Loop exit block may have single value PHI nodes with some |
498 | | /// incoming value. While vectorizing we only handled real values |
499 | | /// that were defined inside the loop and we should have one value for |
500 | | /// each predecessor of its parent basic block. See PR14725. |
501 | | void fixLCSSAPHIs(); |
502 | | |
503 | | /// Iteratively sink the scalarized operands of a predicated instruction into |
504 | | /// the block that was created for it. |
505 | | void sinkScalarOperands(Instruction *PredInst); |
506 | | |
507 | | /// Shrinks vector element sizes to the smallest bitwidth they can be legally |
508 | | /// represented as. |
509 | | void truncateToMinimalBitwidths(); |
510 | | |
511 | | /// Insert the new loop to the loop hierarchy and pass manager |
512 | | /// and update the analysis passes. |
513 | | void updateAnalysis(); |
514 | | |
515 | | /// Create a broadcast instruction. This method generates a broadcast |
516 | | /// instruction (shuffle) for loop invariant values and for the induction |
517 | | /// value. If this is the induction variable then we extend it to N, N+1, ... |
518 | | /// this is needed because each iteration in the loop corresponds to a SIMD |
519 | | /// element. |
520 | | virtual Value *getBroadcastInstrs(Value *V); |
521 | | |
522 | | /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) |
523 | | /// to each vector element of Val. The sequence starts at StartIndex. |
524 | | /// \p Opcode is relevant for FP induction variable. |
525 | | virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, |
526 | | Instruction::BinaryOps Opcode = |
527 | | Instruction::BinaryOpsEnd); |
528 | | |
529 | | /// Compute scalar induction steps. \p ScalarIV is the scalar induction |
530 | | /// variable on which to base the steps, \p Step is the size of the step, and |
531 | | /// \p EntryVal is the value from the original loop that maps to the steps. |
532 | | /// Note that \p EntryVal doesn't have to be an induction variable - it |
533 | | /// can also be a truncate instruction. |
534 | | void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, |
535 | | const InductionDescriptor &ID); |
536 | | |
537 | | /// Create a vector induction phi node based on an existing scalar one. \p |
538 | | /// EntryVal is the value from the original loop that maps to the vector phi |
539 | | /// node, and \p Step is the loop-invariant step. If \p EntryVal is a |
540 | | /// truncate instruction, instead of widening the original IV, we widen a |
541 | | /// version of the IV truncated to \p EntryVal's type. |
542 | | void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, |
543 | | Value *Step, Instruction *EntryVal); |
544 | | |
545 | | /// Returns true if an instruction \p I should be scalarized instead of |
546 | | /// vectorized for the chosen vectorization factor. |
547 | | bool shouldScalarizeInstruction(Instruction *I) const; |
548 | | |
549 | | /// Returns true if we should generate a scalar version of \p IV. |
550 | | bool needsScalarInduction(Instruction *IV) const; |
551 | | |
552 | | /// If there is a cast involved in the induction variable \p ID, which should |
553 | | /// be ignored in the vectorized loop body, this function records the |
554 | | /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the |
555 | | /// cast. We had already proved that the casted Phi is equal to the uncasted |
556 | | /// Phi in the vectorized loop (under a runtime guard), and therefore |
557 | | /// there is no need to vectorize the cast - the same value can be used in the |
558 | | /// vector loop for both the Phi and the cast. |
559 | | /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, |
560 | | /// Otherwise, \p VectorLoopValue is a widened/vectorized value. |
561 | | /// |
562 | | /// \p EntryVal is the value from the original loop that maps to the vector |
563 | | /// phi node and is used to distinguish what is the IV currently being |
564 | | /// processed - original one (if \p EntryVal is a phi corresponding to the |
565 | | /// original IV) or the "newly-created" one based on the proof mentioned above |
566 | | /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the |
567 | | /// latter case \p EntryVal is a TruncInst and we must not record anything for |
568 | | /// that IV, but it's error-prone to expect callers of this routine to care |
569 | | /// about that, hence this explicit parameter. |
570 | | void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID, |
571 | | const Instruction *EntryVal, |
572 | | Value *VectorLoopValue, |
573 | | unsigned Part, |
574 | | unsigned Lane = UINT_MAX); |
575 | | |
576 | | /// Generate a shuffle sequence that will reverse the vector Vec. |
577 | | virtual Value *reverseVector(Value *Vec); |
578 | | |
579 | | /// Returns (and creates if needed) the original loop trip count. |
580 | | Value *getOrCreateTripCount(Loop *NewLoop); |
581 | | |
582 | | /// Returns (and creates if needed) the trip count of the widened loop. |
583 | | Value *getOrCreateVectorTripCount(Loop *NewLoop); |
584 | | |
585 | | /// Returns a bitcasted value to the requested vector type. |
586 | | /// Also handles bitcasts of vector<float> <-> vector<pointer> types. |
587 | | Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, |
588 | | const DataLayout &DL); |
589 | | |
590 | | /// Emit a bypass check to see if the vector trip count is zero, including if |
591 | | /// it overflows. |
592 | | void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); |
593 | | |
594 | | /// Emit a bypass check to see if all of the SCEV assumptions we've |
595 | | /// had to make are correct. |
596 | | void emitSCEVChecks(Loop *L, BasicBlock *Bypass); |
597 | | |
598 | | /// Emit bypass checks to check any memory assumptions we may have made. |
599 | | void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); |
600 | | |
601 | | /// Compute the transformed value of Index at offset StartValue using step |
602 | | /// StepValue. |
603 | | /// For integer induction, returns StartValue + Index * StepValue. |
604 | | /// For pointer induction, returns StartValue[Index * StepValue]. |
605 | | /// FIXME: The newly created binary instructions should contain nsw/nuw |
606 | | /// flags, which can be found from the original scalar operations. |
607 | | Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, |
608 | | const DataLayout &DL, |
609 | | const InductionDescriptor &ID) const; |
610 | | |
611 | | /// Add additional metadata to \p To that was not present on \p Orig. |
612 | | /// |
613 | | /// Currently this is used to add the noalias annotations based on the |
614 | | /// inserted memchecks. Use this for instructions that are *cloned* into the |
615 | | /// vector loop. |
616 | | void addNewMetadata(Instruction *To, const Instruction *Orig); |
617 | | |
618 | | /// Add metadata from one instruction to another. |
619 | | /// |
620 | | /// This includes both the original MDs from \p From and additional ones (\see |
621 | | /// addNewMetadata). Use this for *newly created* instructions in the vector |
622 | | /// loop. |
623 | | void addMetadata(Instruction *To, Instruction *From); |
624 | | |
625 | | /// Similar to the previous function but it adds the metadata to a |
626 | | /// vector of instructions. |
627 | | void addMetadata(ArrayRef<Value *> To, Instruction *From); |
628 | | |
629 | | /// The original loop. |
630 | | Loop *OrigLoop; |
631 | | |
632 | | /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies |
633 | | /// dynamic knowledge to simplify SCEV expressions and converts them to a |
634 | | /// more usable form. |
635 | | PredicatedScalarEvolution &PSE; |
636 | | |
637 | | /// Loop Info. |
638 | | LoopInfo *LI; |
639 | | |
640 | | /// Dominator Tree. |
641 | | DominatorTree *DT; |
642 | | |
643 | | /// Alias Analysis. |
644 | | AliasAnalysis *AA; |
645 | | |
646 | | /// Target Library Info. |
647 | | const TargetLibraryInfo *TLI; |
648 | | |
649 | | /// Target Transform Info. |
650 | | const TargetTransformInfo *TTI; |
651 | | |
652 | | /// Assumption Cache. |
653 | | AssumptionCache *AC; |
654 | | |
655 | | /// Interface to emit optimization remarks. |
656 | | OptimizationRemarkEmitter *ORE; |
657 | | |
658 | | /// LoopVersioning. It's only set up (non-null) if memchecks were |
659 | | /// used. |
660 | | /// |
661 | | /// This is currently only used to add no-alias metadata based on the |
662 | | /// memchecks. The actually versioning is performed manually. |
663 | | std::unique_ptr<LoopVersioning> LVer; |
664 | | |
665 | | /// The vectorization SIMD factor to use. Each vector will have this many |
666 | | /// vector elements. |
667 | | unsigned VF; |
668 | | |
669 | | /// The vectorization unroll factor to use. Each scalar is vectorized to this |
670 | | /// many different vector instructions. |
671 | | unsigned UF; |
672 | | |
673 | | /// The builder that we use |
674 | | IRBuilder<> Builder; |
675 | | |
676 | | // --- Vectorization state --- |
677 | | |
678 | | /// The vector-loop preheader. |
679 | | BasicBlock *LoopVectorPreHeader; |
680 | | |
681 | | /// The scalar-loop preheader. |
682 | | BasicBlock *LoopScalarPreHeader; |
683 | | |
684 | | /// Middle Block between the vector and the scalar. |
685 | | BasicBlock *LoopMiddleBlock; |
686 | | |
687 | | /// The ExitBlock of the scalar loop. |
688 | | BasicBlock *LoopExitBlock; |
689 | | |
690 | | /// The vector loop body. |
691 | | BasicBlock *LoopVectorBody; |
692 | | |
693 | | /// The scalar loop body. |
694 | | BasicBlock *LoopScalarBody; |
695 | | |
696 | | /// A list of all bypass blocks. The first block is the entry of the loop. |
697 | | SmallVector<BasicBlock *, 4> LoopBypassBlocks; |
698 | | |
699 | | /// The new Induction variable which was added to the new block. |
700 | | PHINode *Induction = nullptr; |
701 | | |
702 | | /// The induction variable of the old basic block. |
703 | | PHINode *OldInduction = nullptr; |
704 | | |
705 | | /// Maps values from the original loop to their corresponding values in the |
706 | | /// vectorized loop. A key value can map to either vector values, scalar |
707 | | /// values or both kinds of values, depending on whether the key was |
708 | | /// vectorized and scalarized. |
709 | | VectorizerValueMap VectorLoopValueMap; |
710 | | |
711 | | /// Store instructions that were predicated. |
712 | | SmallVector<Instruction *, 4> PredicatedInstructions; |
713 | | |
714 | | /// Trip count of the original loop. |
715 | | Value *TripCount = nullptr; |
716 | | |
717 | | /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) |
718 | | Value *VectorTripCount = nullptr; |
719 | | |
720 | | /// The legality analysis. |
721 | | LoopVectorizationLegality *Legal; |
722 | | |
723 | | /// The profitablity analysis. |
724 | | LoopVectorizationCostModel *Cost; |
725 | | |
726 | | // Record whether runtime checks are added. |
727 | | bool AddedSafetyChecks = false; |
728 | | |
729 | | // Holds the end values for each induction variable. We save the end values |
730 | | // so we can later fix-up the external users of the induction variables. |
731 | | DenseMap<PHINode *, Value *> IVEndValues; |
732 | | |
733 | | // Vector of original scalar PHIs whose corresponding widened PHIs need to be |
734 | | // fixed up at the end of vector code generation. |
735 | | SmallVector<PHINode *, 8> OrigPHIsToFix; |
736 | | }; |
737 | | |
738 | | class InnerLoopUnroller : public InnerLoopVectorizer { |
739 | | public: |
740 | | InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
741 | | LoopInfo *LI, DominatorTree *DT, |
742 | | const TargetLibraryInfo *TLI, |
743 | | const TargetTransformInfo *TTI, AssumptionCache *AC, |
744 | | OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, |
745 | | LoopVectorizationLegality *LVL, |
746 | | LoopVectorizationCostModel *CM) |
747 | | : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, |
748 | 1.74k | UnrollFactor, LVL, CM) {} |
749 | | |
750 | | private: |
751 | | Value *getBroadcastInstrs(Value *V) override; |
752 | | Value *getStepVector(Value *Val, int StartIdx, Value *Step, |
753 | | Instruction::BinaryOps Opcode = |
754 | | Instruction::BinaryOpsEnd) override; |
755 | | Value *reverseVector(Value *Vec) override; |
756 | | }; |
757 | | |
758 | | } // end namespace llvm |
759 | | |
760 | | /// Look for a meaningful debug location on the instruction or it's |
761 | | /// operands. |
762 | 34.1k | static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { |
763 | 34.1k | if (!I) |
764 | 7.56k | return I; |
765 | 26.5k | |
766 | 26.5k | DebugLoc Empty; |
767 | 26.5k | if (I->getDebugLoc() != Empty) |
768 | 4 | return I; |
769 | 26.5k | |
770 | 78.0k | for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); 26.5k OI != OE; ++OI51.4k ) { |
771 | 52.3k | if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) |
772 | 26.5k | if (OpInst->getDebugLoc() != Empty) |
773 | 928 | return OpInst; |
774 | 52.3k | } |
775 | 26.5k | |
776 | 26.5k | return I25.6k ; |
777 | 26.5k | } |
778 | | |
779 | 186k | void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { |
780 | 186k | if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { |
781 | 178k | const DILocation *DIL = Inst->getDebugLoc(); |
782 | 178k | if (DIL && Inst->getFunction()->isDebugInfoForProfiling()8.93k && |
783 | 178k | !isa<DbgInfoIntrinsic>(Inst)34 ) { |
784 | 34 | auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF); |
785 | 34 | if (NewDIL) |
786 | 34 | B.SetCurrentDebugLocation(NewDIL.getValue()); |
787 | 34 | else |
788 | 34 | LLVM_DEBUG(dbgs() |
789 | 34 | << "Failed to create new discriminator: " |
790 | 34 | << DIL->getFilename() << " Line: " << DIL->getLine()); |
791 | 34 | } |
792 | 178k | else |
793 | 178k | B.SetCurrentDebugLocation(DIL); |
794 | 178k | } else |
795 | 8.50k | B.SetCurrentDebugLocation(DebugLoc()); |
796 | 186k | } |
797 | | |
798 | | #ifndef NDEBUG |
799 | | /// \return string containing a file name and a line # for the given loop. |
800 | | static std::string getDebugLocString(const Loop *L) { |
801 | | std::string Result; |
802 | | if (L) { |
803 | | raw_string_ostream OS(Result); |
804 | | if (const DebugLoc LoopDbgLoc = L->getStartLoc()) |
805 | | LoopDbgLoc.print(OS); |
806 | | else |
807 | | // Just print the module name. |
808 | | OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); |
809 | | OS.flush(); |
810 | | } |
811 | | return Result; |
812 | | } |
813 | | #endif |
814 | | |
815 | | void InnerLoopVectorizer::addNewMetadata(Instruction *To, |
816 | 210k | const Instruction *Orig) { |
817 | 210k | // If the loop was versioned with memchecks, add the corresponding no-alias |
818 | 210k | // metadata. |
819 | 210k | if (LVer && (30.8k isa<LoadInst>(Orig)30.8k || isa<StoreInst>(Orig)23.9k )) |
820 | 11.9k | LVer->annotateInstWithNoAlias(To, Orig); |
821 | 210k | } |
822 | | |
823 | | void InnerLoopVectorizer::addMetadata(Instruction *To, |
824 | 138k | Instruction *From) { |
825 | 138k | propagateMetadata(To, From); |
826 | 138k | addNewMetadata(To, From); |
827 | 138k | } |
828 | | |
829 | | void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, |
830 | 105k | Instruction *From) { |
831 | 105k | for (Value *V : To) { |
832 | 105k | if (Instruction *I = dyn_cast<Instruction>(V)) |
833 | 105k | addMetadata(I, From); |
834 | 105k | } |
835 | 105k | } |
836 | | |
837 | | namespace llvm { |
838 | | |
839 | | /// LoopVectorizationCostModel - estimates the expected speedups due to |
840 | | /// vectorization. |
841 | | /// In many cases vectorization is not profitable. This can happen because of |
842 | | /// a number of reasons. In this class we mainly attempt to predict the |
843 | | /// expected speedup/slowdowns due to the supported instruction set. We use the |
844 | | /// TargetTransformInfo to query the different backends for the cost of |
845 | | /// different operations. |
846 | | class LoopVectorizationCostModel { |
847 | | public: |
848 | | LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, |
849 | | LoopInfo *LI, LoopVectorizationLegality *Legal, |
850 | | const TargetTransformInfo &TTI, |
851 | | const TargetLibraryInfo *TLI, DemandedBits *DB, |
852 | | AssumptionCache *AC, |
853 | | OptimizationRemarkEmitter *ORE, const Function *F, |
854 | | const LoopVectorizeHints *Hints, |
855 | | InterleavedAccessInfo &IAI) |
856 | | : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), |
857 | 19.9k | AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} |
858 | | |
859 | | /// \return An upper bound for the vectorization factor, or None if |
860 | | /// vectorization and interleaving should be avoided up front. |
861 | | Optional<unsigned> computeMaxVF(bool OptForSize); |
862 | | |
863 | | /// \return The most profitable vectorization factor and the cost of that VF. |
864 | | /// This method checks every power of two up to MaxVF. If UserVF is not ZERO |
865 | | /// then this vectorization factor will be selected if vectorization is |
866 | | /// possible. |
867 | | VectorizationFactor selectVectorizationFactor(unsigned MaxVF); |
868 | | |
869 | | /// Setup cost-based decisions for user vectorization factor. |
870 | 649 | void selectUserVectorizationFactor(unsigned UserVF) { |
871 | 649 | collectUniformsAndScalars(UserVF); |
872 | 649 | collectInstsToScalarize(UserVF); |
873 | 649 | } |
874 | | |
875 | | /// \return The size (in bits) of the smallest and widest types in the code |
876 | | /// that needs to be vectorized. We ignore values that remain scalar such as |
877 | | /// 64 bit loop indices. |
878 | | std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); |
879 | | |
880 | | /// \return The desired interleave count. |
881 | | /// If interleave count has been specified by metadata it will be returned. |
882 | | /// Otherwise, the interleave count is computed and returned. VF and LoopCost |
883 | | /// are the selected vectorization factor and the cost of the selected VF. |
884 | | unsigned selectInterleaveCount(bool OptForSize, unsigned VF, |
885 | | unsigned LoopCost); |
886 | | |
887 | | /// Memory access instruction may be vectorized in more than one way. |
888 | | /// Form of instruction after vectorization depends on cost. |
889 | | /// This function takes cost-based decisions for Load/Store instructions |
890 | | /// and collects them in a map. This decisions map is used for building |
891 | | /// the lists of loop-uniform and loop-scalar instructions. |
892 | | /// The calculated cost is saved with widening decision in order to |
893 | | /// avoid redundant calculations. |
894 | | void setCostBasedWideningDecision(unsigned VF); |
895 | | |
896 | | /// A struct that represents some properties of the register usage |
897 | | /// of a loop. |
898 | | struct RegisterUsage { |
899 | | /// Holds the number of loop invariant values that are used in the loop. |
900 | | unsigned LoopInvariantRegs; |
901 | | |
902 | | /// Holds the maximum number of concurrent live intervals in the loop. |
903 | | unsigned MaxLocalUsers; |
904 | | }; |
905 | | |
906 | | /// \return Returns information about the register usages of the loop for the |
907 | | /// given vectorization factors. |
908 | | SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs); |
909 | | |
910 | | /// Collect values we want to ignore in the cost model. |
911 | | void collectValuesToIgnore(); |
912 | | |
913 | | /// \returns The smallest bitwidth each instruction can be represented with. |
914 | | /// The vector equivalents of these instructions should be truncated to this |
915 | | /// type. |
916 | 30.6k | const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { |
917 | 30.6k | return MinBWs; |
918 | 30.6k | } |
919 | | |
920 | | /// \returns True if it is more profitable to scalarize instruction \p I for |
921 | | /// vectorization factor \p VF. |
922 | 492k | bool isProfitableToScalarize(Instruction *I, unsigned VF) const { |
923 | 492k | assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); |
924 | 492k | |
925 | 492k | // Cost model is not run in the VPlan-native path - return conservative |
926 | 492k | // result until this changes. |
927 | 492k | if (EnableVPlanNativePath) |
928 | 47 | return false; |
929 | 492k | |
930 | 492k | auto Scalars = InstsToScalarize.find(VF); |
931 | 492k | assert(Scalars != InstsToScalarize.end() && |
932 | 492k | "VF not yet analyzed for scalarization profitability"); |
933 | 492k | return Scalars->second.find(I) != Scalars->second.end(); |
934 | 492k | } |
935 | | |
936 | | /// Returns true if \p I is known to be uniform after vectorization. |
937 | 917k | bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { |
938 | 917k | if (VF == 1) |
939 | 404k | return true; |
940 | 512k | |
941 | 512k | // Cost model is not run in the VPlan-native path - return conservative |
942 | 512k | // result until this changes. |
943 | 512k | if (EnableVPlanNativePath) |
944 | 0 | return false; |
945 | 512k | |
946 | 512k | auto UniformsPerVF = Uniforms.find(VF); |
947 | 512k | assert(UniformsPerVF != Uniforms.end() && |
948 | 512k | "VF not yet analyzed for uniformity"); |
949 | 512k | return UniformsPerVF->second.find(I) != UniformsPerVF->second.end(); |
950 | 512k | } |
951 | | |
952 | | /// Returns true if \p I is known to be scalar after vectorization. |
953 | 1.72M | bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { |
954 | 1.72M | if (VF == 1) |
955 | 650k | return true; |
956 | 1.07M | |
957 | 1.07M | // Cost model is not run in the VPlan-native path - return conservative |
958 | 1.07M | // result until this changes. |
959 | 1.07M | if (EnableVPlanNativePath) |
960 | 47 | return false; |
961 | 1.07M | |
962 | 1.07M | auto ScalarsPerVF = Scalars.find(VF); |
963 | 1.07M | assert(ScalarsPerVF != Scalars.end() && |
964 | 1.07M | "Scalar values are not calculated for VF"); |
965 | 1.07M | return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end(); |
966 | 1.07M | } |
967 | | |
968 | | /// \returns True if instruction \p I can be truncated to a smaller bitwidth |
969 | | /// for vectorization factor \p VF. |
970 | 757k | bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { |
971 | 757k | return VF > 1 && MinBWs.find(I) != MinBWs.end()296k && |
972 | 757k | !isProfitableToScalarize(I, VF)893 && |
973 | 757k | !isScalarAfterVectorization(I, VF)893 ; |
974 | 757k | } |
975 | | |
976 | | /// Decision that was taken during cost calculation for memory instruction. |
977 | | enum InstWidening { |
978 | | CM_Unknown, |
979 | | CM_Widen, // For consecutive accesses with stride +1. |
980 | | CM_Widen_Reverse, // For consecutive accesses with stride -1. |
981 | | CM_Interleave, |
982 | | CM_GatherScatter, |
983 | | CM_Scalarize |
984 | | }; |
985 | | |
986 | | /// Save vectorization decision \p W and \p Cost taken by the cost model for |
987 | | /// instruction \p I and vector width \p VF. |
988 | | void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, |
989 | 58.3k | unsigned Cost) { |
990 | 58.3k | assert(VF >= 2 && "Expected VF >=2"); |
991 | 58.3k | WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); |
992 | 58.3k | } |
993 | | |
994 | | /// Save vectorization decision \p W and \p Cost taken by the cost model for |
995 | | /// interleaving group \p Grp and vector width \p VF. |
996 | | void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF, |
997 | 2.26k | InstWidening W, unsigned Cost) { |
998 | 2.26k | assert(VF >= 2 && "Expected VF >=2"); |
999 | 2.26k | /// Broadcast this decicion to all instructions inside the group. |
1000 | 2.26k | /// But the cost will be assigned to one instruction only. |
1001 | 9.05k | for (unsigned i = 0; i < Grp->getFactor(); ++i6.78k ) { |
1002 | 6.78k | if (auto *I = Grp->getMember(i)) { |
1003 | 5.90k | if (Grp->getInsertPos() == I) |
1004 | 2.26k | WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); |
1005 | 3.64k | else |
1006 | 3.64k | WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); |
1007 | 5.90k | } |
1008 | 6.78k | } |
1009 | 2.26k | } |
1010 | | |
1011 | | /// Return the cost model decision for the given instruction \p I and vector |
1012 | | /// width \p VF. Return CM_Unknown if this instruction did not pass |
1013 | | /// through the cost modeling. |
1014 | 248k | InstWidening getWideningDecision(Instruction *I, unsigned VF) { |
1015 | 248k | assert(VF >= 2 && "Expected VF >=2"); |
1016 | 248k | |
1017 | 248k | // Cost model is not run in the VPlan-native path - return conservative |
1018 | 248k | // result until this changes. |
1019 | 248k | if (EnableVPlanNativePath) |
1020 | 16 | return CM_GatherScatter; |
1021 | 248k | |
1022 | 248k | std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); |
1023 | 248k | auto Itr = WideningDecisions.find(InstOnVF); |
1024 | 248k | if (Itr == WideningDecisions.end()) |
1025 | 2.26k | return CM_Unknown; |
1026 | 245k | return Itr->second.first; |
1027 | 245k | } |
1028 | | |
1029 | | /// Return the vectorization cost for the given instruction \p I and vector |
1030 | | /// width \p VF. |
1031 | 64.1k | unsigned getWideningCost(Instruction *I, unsigned VF) { |
1032 | 64.1k | assert(VF >= 2 && "Expected VF >=2"); |
1033 | 64.1k | std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); |
1034 | 64.1k | assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && |
1035 | 64.1k | "The cost is not calculated"); |
1036 | 64.1k | return WideningDecisions[InstOnVF].second; |
1037 | 64.1k | } |
1038 | | |
1039 | | /// Return True if instruction \p I is an optimizable truncate whose operand |
1040 | | /// is an induction variable. Such a truncate will be removed by adding a new |
1041 | | /// induction variable with the destination type. |
1042 | 99.7k | bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { |
1043 | 99.7k | // If the instruction is not a truncate, return false. |
1044 | 99.7k | auto *Trunc = dyn_cast<TruncInst>(I); |
1045 | 99.7k | if (!Trunc) |
1046 | 45.2k | return false; |
1047 | 54.5k | |
1048 | 54.5k | // Get the source and destination types of the truncate. |
1049 | 54.5k | Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); |
1050 | 54.5k | Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); |
1051 | 54.5k | |
1052 | 54.5k | // If the truncate is free for the given types, return false. Replacing a |
1053 | 54.5k | // free truncate with an induction variable would add an induction variable |
1054 | 54.5k | // update instruction to each iteration of the loop. We exclude from this |
1055 | 54.5k | // check the primary induction variable since it will need an update |
1056 | 54.5k | // instruction regardless. |
1057 | 54.5k | Value *Op = Trunc->getOperand(0); |
1058 | 54.5k | if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)42.0k ) |
1059 | 16.7k | return false; |
1060 | 37.8k | |
1061 | 37.8k | // If the truncated value is not an induction variable, return false. |
1062 | 37.8k | return Legal->isInductionPhi(Op); |
1063 | 37.8k | } |
1064 | | |
1065 | | /// Collects the instructions to scalarize for each predicated instruction in |
1066 | | /// the loop. |
1067 | | void collectInstsToScalarize(unsigned VF); |
1068 | | |
1069 | | /// Collect Uniform and Scalar values for the given \p VF. |
1070 | | /// The sets depend on CM decision for Load/Store instructions |
1071 | | /// that may be vectorized as interleave, gather-scatter or scalarized. |
1072 | 197k | void collectUniformsAndScalars(unsigned VF) { |
1073 | 197k | // Do the analysis once. |
1074 | 197k | if (VF == 1 || Uniforms.find(VF) != Uniforms.end()177k ) |
1075 | 164k | return; |
1076 | 32.3k | setCostBasedWideningDecision(VF); |
1077 | 32.3k | collectLoopUniforms(VF); |
1078 | 32.3k | collectLoopScalars(VF); |
1079 | 32.3k | } |
1080 | | |
1081 | | /// Returns true if the target machine supports masked store operation |
1082 | | /// for the given \p DataType and kind of access to \p Ptr. |
1083 | 4.52k | bool isLegalMaskedStore(Type *DataType, Value *Ptr) { |
1084 | 4.52k | return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType)3.02k ; |
1085 | 4.52k | } |
1086 | | |
1087 | | /// Returns true if the target machine supports masked load operation |
1088 | | /// for the given \p DataType and kind of access to \p Ptr. |
1089 | 3.64k | bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { |
1090 | 3.64k | return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType)1.89k ; |
1091 | 3.64k | } |
1092 | | |
1093 | | /// Returns true if the target machine supports masked scatter operation |
1094 | | /// for the given \p DataType. |
1095 | 10.6k | bool isLegalMaskedScatter(Type *DataType) { |
1096 | 10.6k | return TTI.isLegalMaskedScatter(DataType); |
1097 | 10.6k | } |
1098 | | |
1099 | | /// Returns true if the target machine supports masked gather operation |
1100 | | /// for the given \p DataType. |
1101 | 13.7k | bool isLegalMaskedGather(Type *DataType) { |
1102 | 13.7k | return TTI.isLegalMaskedGather(DataType); |
1103 | 13.7k | } |
1104 | | |
1105 | | /// Returns true if the target machine can represent \p V as a masked gather |
1106 | | /// or scatter operation. |
1107 | 16.8k | bool isLegalGatherOrScatter(Value *V) { |
1108 | 16.8k | bool LI = isa<LoadInst>(V); |
1109 | 16.8k | bool SI = isa<StoreInst>(V); |
1110 | 16.8k | if (!LI && !SI6.49k ) |
1111 | 0 | return false; |
1112 | 16.8k | auto *Ty = getMemInstValueType(V); |
1113 | 16.8k | return (LI && isLegalMaskedGather(Ty)10.3k ) || (16.7k SI16.7k && isLegalMaskedScatter(Ty)6.49k ); |
1114 | 16.8k | } |
1115 | | |
1116 | | /// Returns true if \p I is an instruction that will be scalarized with |
1117 | | /// predication. Such instructions include conditional stores and |
1118 | | /// instructions that may divide by zero. |
1119 | | /// If a non-zero VF has been calculated, we check if I will be scalarized |
1120 | | /// predication for that VF. |
1121 | | bool isScalarWithPredication(Instruction *I, unsigned VF = 1); |
1122 | | |
1123 | | // Returns true if \p I is an instruction that will be predicated either |
1124 | | // through scalar predication or masked load/store or masked gather/scatter. |
1125 | | // Superset of instructions that return true for isScalarWithPredication. |
1126 | 16.5k | bool isPredicatedInst(Instruction *I) { |
1127 | 16.5k | if (!blockNeedsPredication(I->getParent())) |
1128 | 14.2k | return false; |
1129 | 2.34k | // Loads and stores that need some form of masked operation are predicated |
1130 | 2.34k | // instructions. |
1131 | 2.34k | if (isa<LoadInst>(I) || isa<StoreInst>(I)1.08k ) |
1132 | 2.34k | return Legal->isMaskRequired(I); |
1133 | 0 | return isScalarWithPredication(I); |
1134 | 0 | } |
1135 | | |
1136 | | /// Returns true if \p I is a memory instruction with consecutive memory |
1137 | | /// access that can be widened. |
1138 | | bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); |
1139 | | |
1140 | | /// Returns true if \p I is a memory instruction in an interleaved-group |
1141 | | /// of memory accesses that can be vectorized with wide vector loads/stores |
1142 | | /// and shuffles. |
1143 | | bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); |
1144 | | |
1145 | | /// Check if \p Instr belongs to any interleaved access group. |
1146 | 20.4k | bool isAccessInterleaved(Instruction *Instr) { |
1147 | 20.4k | return InterleaveInfo.isInterleaved(Instr); |
1148 | 20.4k | } |
1149 | | |
1150 | | /// Get the interleaved access group that \p Instr belongs to. |
1151 | | const InterleaveGroup<Instruction> * |
1152 | 790k | getInterleavedAccessGroup(Instruction *Instr) { |
1153 | 790k | return InterleaveInfo.getInterleaveGroup(Instr); |
1154 | 790k | } |
1155 | | |
1156 | | /// Returns true if an interleaved group requires a scalar iteration |
1157 | | /// to handle accesses with gaps, and there is nothing preventing us from |
1158 | | /// creating a scalar epilogue. |
1159 | 32.3k | bool requiresScalarEpilogue() const { |
1160 | 32.3k | return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue()32.2k ; |
1161 | 32.3k | } |
1162 | | |
1163 | | /// Returns true if a scalar epilogue is not allowed due to optsize. |
1164 | 68 | bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; } |
1165 | | |
1166 | | /// Returns true if all loop blocks should be masked to fold tail loop. |
1167 | 915k | bool foldTailByMasking() const { return FoldTailByMasking; } |
1168 | | |
1169 | 844k | bool blockNeedsPredication(BasicBlock *BB) { |
1170 | 844k | return foldTailByMasking() || Legal->blockNeedsPredication(BB)840k ; |
1171 | 844k | } |
1172 | | |
1173 | | /// Estimate cost of an intrinsic call instruction CI if it were vectorized |
1174 | | /// with factor VF. Return the cost of the instruction, including |
1175 | | /// scalarization overhead if it's needed. |
1176 | | unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); |
1177 | | |
1178 | | /// Estimate cost of a call instruction CI if it were vectorized with factor |
1179 | | /// VF. Return the cost of the instruction, including scalarization overhead |
1180 | | /// if it's needed. The flag NeedToScalarize shows if the call needs to be |
1181 | | /// scalarized - |
1182 | | /// i.e. either vector version isn't available, or is too expensive. |
1183 | | unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); |
1184 | | |
1185 | | private: |
1186 | | unsigned NumPredStores = 0; |
1187 | | |
1188 | | /// \return An upper bound for the vectorization factor, larger than zero. |
1189 | | /// One is returned if vectorization should best be avoided due to cost. |
1190 | | unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); |
1191 | | |
1192 | | /// The vectorization cost is a combination of the cost itself and a boolean |
1193 | | /// indicating whether any of the contributing operations will actually |
1194 | | /// operate on |
1195 | | /// vector values after type legalization in the backend. If this latter value |
1196 | | /// is |
1197 | | /// false, then all operations will be scalarized (i.e. no vectorization has |
1198 | | /// actually taken place). |
1199 | | using VectorizationCostTy = std::pair<unsigned, bool>; |
1200 | | |
1201 | | /// Returns the expected execution cost. The unit of the cost does |
1202 | | /// not matter because we use the 'cost' units to compare different |
1203 | | /// vector widths. The cost that is returned is *not* normalized by |
1204 | | /// the factor width. |
1205 | | VectorizationCostTy expectedCost(unsigned VF); |
1206 | | |
1207 | | /// Returns the execution time cost of an instruction for a given vector |
1208 | | /// width. Vector width of one means scalar. |
1209 | | VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF); |
1210 | | |
1211 | | /// The cost-computation logic from getInstructionCost which provides |
1212 | | /// the vector type as an output parameter. |
1213 | | unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy); |
1214 | | |
1215 | | /// Calculate vectorization cost of memory instruction \p I. |
1216 | | unsigned getMemoryInstructionCost(Instruction *I, unsigned VF); |
1217 | | |
1218 | | /// The cost computation for scalarized memory instruction. |
1219 | | unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF); |
1220 | | |
1221 | | /// The cost computation for interleaving group of memory instructions. |
1222 | | unsigned getInterleaveGroupCost(Instruction *I, unsigned VF); |
1223 | | |
1224 | | /// The cost computation for Gather/Scatter instruction. |
1225 | | unsigned getGatherScatterCost(Instruction *I, unsigned VF); |
1226 | | |
1227 | | /// The cost computation for widening instruction \p I with consecutive |
1228 | | /// memory access. |
1229 | | unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF); |
1230 | | |
1231 | | /// The cost calculation for Load/Store instruction \p I with uniform pointer - |
1232 | | /// Load: scalar load + broadcast. |
1233 | | /// Store: scalar store + (loop invariant value stored? 0 : extract of last |
1234 | | /// element) |
1235 | | unsigned getUniformMemOpCost(Instruction *I, unsigned VF); |
1236 | | |
1237 | | /// Estimate the overhead of scalarizing an instruction. This is a |
1238 | | /// convenience wrapper for the type-based getScalarizationOverhead API. |
1239 | | unsigned getScalarizationOverhead(Instruction *I, unsigned VF); |
1240 | | |
1241 | | /// Returns whether the instruction is a load or store and will be a emitted |
1242 | | /// as a vector operation. |
1243 | | bool isConsecutiveLoadOrStore(Instruction *I); |
1244 | | |
1245 | | /// Returns true if an artificially high cost for emulated masked memrefs |
1246 | | /// should be used. |
1247 | | bool useEmulatedMaskMemRefHack(Instruction *I); |
1248 | | |
1249 | | /// Create an analysis remark that explains why vectorization failed |
1250 | | /// |
1251 | | /// \p RemarkName is the identifier for the remark. \return the remark object |
1252 | | /// that can be streamed to. |
1253 | 103 | OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) { |
1254 | 103 | return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), |
1255 | 103 | RemarkName, TheLoop); |
1256 | 103 | } |
1257 | | |
1258 | | /// Map of scalar integer values to the smallest bitwidth they can be legally |
1259 | | /// represented as. The vector equivalents of these values should be truncated |
1260 | | /// to this type. |
1261 | | MapVector<Instruction *, uint64_t> MinBWs; |
1262 | | |
1263 | | /// A type representing the costs for instructions if they were to be |
1264 | | /// scalarized rather than vectorized. The entries are Instruction-Cost |
1265 | | /// pairs. |
1266 | | using ScalarCostsTy = DenseMap<Instruction *, unsigned>; |
1267 | | |
1268 | | /// A set containing all BasicBlocks that are known to present after |
1269 | | /// vectorization as a predicated block. |
1270 | | SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; |
1271 | | |
1272 | | /// Records whether it is allowed to have the original scalar loop execute at |
1273 | | /// least once. This may be needed as a fallback loop in case runtime |
1274 | | /// aliasing/dependence checks fail, or to handle the tail/remainder |
1275 | | /// iterations when the trip count is unknown or doesn't divide by the VF, |
1276 | | /// or as a peel-loop to handle gaps in interleave-groups. |
1277 | | /// Under optsize and when the trip count is very small we don't allow any |
1278 | | /// iterations to execute in the scalar loop. |
1279 | | bool IsScalarEpilogueAllowed = true; |
1280 | | |
1281 | | /// All blocks of loop are to be masked to fold tail of scalar iterations. |
1282 | | bool FoldTailByMasking = false; |
1283 | | |
1284 | | /// A map holding scalar costs for different vectorization factors. The |
1285 | | /// presence of a cost for an instruction in the mapping indicates that the |
1286 | | /// instruction will be scalarized when vectorizing with the associated |
1287 | | /// vectorization factor. The entries are VF-ScalarCostTy pairs. |
1288 | | DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; |
1289 | | |
1290 | | /// Holds the instructions known to be uniform after vectorization. |
1291 | | /// The data is collected per VF. |
1292 | | DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms; |
1293 | | |
1294 | | /// Holds the instructions known to be scalar after vectorization. |
1295 | | /// The data is collected per VF. |
1296 | | DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; |
1297 | | |
1298 | | /// Holds the instructions (address computations) that are forced to be |
1299 | | /// scalarized. |
1300 | | DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; |
1301 | | |
1302 | | /// Returns the expected difference in cost from scalarizing the expression |
1303 | | /// feeding a predicated instruction \p PredInst. The instructions to |
1304 | | /// scalarize and their scalar costs are collected in \p ScalarCosts. A |
1305 | | /// non-negative return value implies the expression will be scalarized. |
1306 | | /// Currently, only single-use chains are considered for scalarization. |
1307 | | int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, |
1308 | | unsigned VF); |
1309 | | |
1310 | | /// Collect the instructions that are uniform after vectorization. An |
1311 | | /// instruction is uniform if we represent it with a single scalar value in |
1312 | | /// the vectorized loop corresponding to each vector iteration. Examples of |
1313 | | /// uniform instructions include pointer operands of consecutive or |
1314 | | /// interleaved memory accesses. Note that although uniformity implies an |
1315 | | /// instruction will be scalar, the reverse is not true. In general, a |
1316 | | /// scalarized instruction will be represented by VF scalar values in the |
1317 | | /// vectorized loop, each corresponding to an iteration of the original |
1318 | | /// scalar loop. |
1319 | | void collectLoopUniforms(unsigned VF); |
1320 | | |
1321 | | /// Collect the instructions that are scalar after vectorization. An |
1322 | | /// instruction is scalar if it is known to be uniform or will be scalarized |
1323 | | /// during vectorization. Non-uniform scalarized instructions will be |
1324 | | /// represented by VF values in the vectorized loop, each corresponding to an |
1325 | | /// iteration of the original scalar loop. |
1326 | | void collectLoopScalars(unsigned VF); |
1327 | | |
1328 | | /// Keeps cost model vectorization decision and cost for instructions. |
1329 | | /// Right now it is used for memory instructions only. |
1330 | | using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, |
1331 | | std::pair<InstWidening, unsigned>>; |
1332 | | |
1333 | | DecisionList WideningDecisions; |
1334 | | |
1335 | | /// Returns true if \p V is expected to be vectorized and it needs to be |
1336 | | /// extracted. |
1337 | 30.3k | bool needsExtract(Value *V, unsigned VF) const { |
1338 | 30.3k | Instruction *I = dyn_cast<Instruction>(V); |
1339 | 30.3k | if (VF == 1 || !I || !TheLoop->contains(I)28.4k || TheLoop->isLoopInvariant(I)27.8k ) |
1340 | 2.51k | return false; |
1341 | 27.8k | |
1342 | 27.8k | // Assume we can vectorize V (and hence we need extraction) if the |
1343 | 27.8k | // scalars are not computed yet. This can happen, because it is called |
1344 | 27.8k | // via getScalarizationOverhead from setCostBasedWideningDecision, before |
1345 | 27.8k | // the scalars are collected. That should be a safe assumption in most |
1346 | 27.8k | // cases, because we check if the operands have vectorizable types |
1347 | 27.8k | // beforehand in LoopVectorizationLegality. |
1348 | 27.8k | return Scalars.find(VF) == Scalars.end() || |
1349 | 27.8k | !isScalarAfterVectorization(I, VF)2.59k ; |
1350 | 27.8k | }; |
1351 | | |
1352 | | /// Returns a range containing only operands needing to be extracted. |
1353 | | SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, |
1354 | 18.1k | unsigned VF) { |
1355 | 18.1k | return SmallVector<Value *, 4>(make_filter_range( |
1356 | 29.6k | Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); |
1357 | 18.1k | } |
1358 | | |
1359 | | public: |
1360 | | /// The loop that we evaluate. |
1361 | | Loop *TheLoop; |
1362 | | |
1363 | | /// Predicated scalar evolution analysis. |
1364 | | PredicatedScalarEvolution &PSE; |
1365 | | |
1366 | | /// Loop Info analysis. |
1367 | | LoopInfo *LI; |
1368 | | |
1369 | | /// Vectorization legality. |
1370 | | LoopVectorizationLegality *Legal; |
1371 | | |
1372 | | /// Vector target information. |
1373 | | const TargetTransformInfo &TTI; |
1374 | | |
1375 | | /// Target Library Info. |
1376 | | const TargetLibraryInfo *TLI; |
1377 | | |
1378 | | /// Demanded bits analysis. |
1379 | | DemandedBits *DB; |
1380 | | |
1381 | | /// Assumption cache. |
1382 | | AssumptionCache *AC; |
1383 | | |
1384 | | /// Interface to emit optimization remarks. |
1385 | | OptimizationRemarkEmitter *ORE; |
1386 | | |
1387 | | const Function *TheFunction; |
1388 | | |
1389 | | /// Loop Vectorize Hint. |
1390 | | const LoopVectorizeHints *Hints; |
1391 | | |
1392 | | /// The interleave access information contains groups of interleaved accesses |
1393 | | /// with the same stride and close to each other. |
1394 | | InterleavedAccessInfo &InterleaveInfo; |
1395 | | |
1396 | | /// Values to ignore in the cost model. |
1397 | | SmallPtrSet<const Value *, 16> ValuesToIgnore; |
1398 | | |
1399 | | /// Values to ignore in the cost model when VF > 1. |
1400 | | SmallPtrSet<const Value *, 16> VecValuesToIgnore; |
1401 | | }; |
1402 | | |
1403 | | } // end namespace llvm |
1404 | | |
1405 | | // Return true if \p OuterLp is an outer loop annotated with hints for explicit |
1406 | | // vectorization. The loop needs to be annotated with #pragma omp simd |
1407 | | // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the |
1408 | | // vector length information is not provided, vectorization is not considered |
1409 | | // explicit. Interleave hints are not allowed either. These limitations will be |
1410 | | // relaxed in the future. |
1411 | | // Please, note that we are currently forced to abuse the pragma 'clang |
1412 | | // vectorize' semantics. This pragma provides *auto-vectorization hints* |
1413 | | // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' |
1414 | | // provides *explicit vectorization hints* (LV can bypass legal checks and |
1415 | | // assume that vectorization is legal). However, both hints are implemented |
1416 | | // using the same metadata (llvm.loop.vectorize, processed by |
1417 | | // LoopVectorizeHints). This will be fixed in the future when the native IR |
1418 | | // representation for pragma 'omp simd' is introduced. |
1419 | | static bool isExplicitVecOuterLoop(Loop *OuterLp, |
1420 | 7 | OptimizationRemarkEmitter *ORE) { |
1421 | 7 | assert(!OuterLp->empty() && "This is not an outer loop"); |
1422 | 7 | LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); |
1423 | 7 | |
1424 | 7 | // Only outer loops with an explicit vectorization hint are supported. |
1425 | 7 | // Unannotated outer loops are ignored. |
1426 | 7 | if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) |
1427 | 0 | return false; |
1428 | 7 | |
1429 | 7 | Function *Fn = OuterLp->getHeader()->getParent(); |
1430 | 7 | if (!Hints.allowVectorization(Fn, OuterLp, |
1431 | 7 | true /*VectorizeOnlyWhenForced*/)) { |
1432 | 0 | LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); |
1433 | 0 | return false; |
1434 | 0 | } |
1435 | 7 | |
1436 | 7 | if (Hints.getInterleave() > 1) { |
1437 | 0 | // TODO: Interleave support is future work. |
1438 | 0 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " |
1439 | 0 | "outer loops.\n"); |
1440 | 0 | Hints.emitRemarkWithHints(); |
1441 | 0 | return false; |
1442 | 0 | } |
1443 | 7 | |
1444 | 7 | return true; |
1445 | 7 | } |
1446 | | |
1447 | | static void collectSupportedLoops(Loop &L, LoopInfo *LI, |
1448 | | OptimizationRemarkEmitter *ORE, |
1449 | 179k | SmallVectorImpl<Loop *> &V) { |
1450 | 179k | // Collect inner loops and outer loops without irreducible control flow. For |
1451 | 179k | // now, only collect outer loops that have explicit vectorization hints. If we |
1452 | 179k | // are stress testing the VPlan H-CFG construction, we collect the outermost |
1453 | 179k | // loop of every loop nest. |
1454 | 179k | if (L.empty() || VPlanBuildStressTest32.7k || |
1455 | 179k | (32.7k EnableVPlanNativePath32.7k && isExplicitVecOuterLoop(&L, ORE)7 )) { |
1456 | 146k | LoopBlocksRPO RPOT(&L); |
1457 | 146k | RPOT.perform(LI); |
1458 | 146k | if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { |
1459 | 146k | V.push_back(&L); |
1460 | 146k | // TODO: Collect inner loops inside marked outer loops in case |
1461 | 146k | // vectorization fails for the outer loop. Do not invoke |
1462 | 146k | // 'containsIrreducibleCFG' again for inner loops when the outer loop is |
1463 | 146k | // already known to be reducible. We can use an inherited attribute for |
1464 | 146k | // that. |
1465 | 146k | return; |
1466 | 146k | } |
1467 | 32.7k | } |
1468 | 32.7k | for (Loop *InnerL : L) |
1469 | 49.7k | collectSupportedLoops(*InnerL, LI, ORE, V); |
1470 | 32.7k | } |
1471 | | |
1472 | | namespace { |
1473 | | |
1474 | | /// The LoopVectorize Pass. |
1475 | | struct LoopVectorize : public FunctionPass { |
1476 | | /// Pass identification, replacement for typeid |
1477 | | static char ID; |
1478 | | |
1479 | | LoopVectorizePass Impl; |
1480 | | |
1481 | | explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, |
1482 | | bool VectorizeOnlyWhenForced = false) |
1483 | 13.7k | : FunctionPass(ID) { |
1484 | 13.7k | Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; |
1485 | 13.7k | Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; |
1486 | 13.7k | initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); |
1487 | 13.7k | } |
1488 | | |
1489 | 279k | bool runOnFunction(Function &F) override { |
1490 | 279k | if (skipFunction(F)) |
1491 | 44 | return false; |
1492 | 279k | |
1493 | 279k | auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
1494 | 279k | auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
1495 | 279k | auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
1496 | 279k | auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
1497 | 279k | auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); |
1498 | 279k | auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); |
1499 | 279k | auto *TLI = TLIP ? &TLIP->getTLI() : nullptr0 ; |
1500 | 279k | auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
1501 | 279k | auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); |
1502 | 279k | auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); |
1503 | 279k | auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); |
1504 | 279k | auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
1505 | 279k | auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); |
1506 | 279k | |
1507 | 279k | std::function<const LoopAccessInfo &(Loop &)> GetLAA = |
1508 | 279k | [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }27.3k ; |
1509 | 279k | |
1510 | 279k | return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, |
1511 | 279k | GetLAA, *ORE, PSI); |
1512 | 279k | } |
1513 | | |
1514 | 13.7k | void getAnalysisUsage(AnalysisUsage &AU) const override { |
1515 | 13.7k | AU.addRequired<AssumptionCacheTracker>(); |
1516 | 13.7k | AU.addRequired<BlockFrequencyInfoWrapperPass>(); |
1517 | 13.7k | AU.addRequired<DominatorTreeWrapperPass>(); |
1518 | 13.7k | AU.addRequired<LoopInfoWrapperPass>(); |
1519 | 13.7k | AU.addRequired<ScalarEvolutionWrapperPass>(); |
1520 | 13.7k | AU.addRequired<TargetTransformInfoWrapperPass>(); |
1521 | 13.7k | AU.addRequired<AAResultsWrapperPass>(); |
1522 | 13.7k | AU.addRequired<LoopAccessLegacyAnalysis>(); |
1523 | 13.7k | AU.addRequired<DemandedBitsWrapperPass>(); |
1524 | 13.7k | AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
1525 | 13.7k | |
1526 | 13.7k | // We currently do not preserve loopinfo/dominator analyses with outer loop |
1527 | 13.7k | // vectorization. Until this is addressed, mark these analyses as preserved |
1528 | 13.7k | // only for non-VPlan-native path. |
1529 | 13.7k | // TODO: Preserve Loop and Dominator analyses for VPlan-native path. |
1530 | 13.7k | if (!EnableVPlanNativePath) { |
1531 | 13.7k | AU.addPreserved<LoopInfoWrapperPass>(); |
1532 | 13.7k | AU.addPreserved<DominatorTreeWrapperPass>(); |
1533 | 13.7k | } |
1534 | 13.7k | |
1535 | 13.7k | AU.addPreserved<BasicAAWrapperPass>(); |
1536 | 13.7k | AU.addPreserved<GlobalsAAWrapperPass>(); |
1537 | 13.7k | AU.addRequired<ProfileSummaryInfoWrapperPass>(); |
1538 | 13.7k | } |
1539 | | }; |
1540 | | |
1541 | | } // end anonymous namespace |
1542 | | |
1543 | | //===----------------------------------------------------------------------===// |
1544 | | // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and |
1545 | | // LoopVectorizationCostModel and LoopVectorizationPlanner. |
1546 | | //===----------------------------------------------------------------------===// |
1547 | | |
1548 | 52.8k | Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { |
1549 | 52.8k | // We need to place the broadcast of invariant variables outside the loop, |
1550 | 52.8k | // but only if it's proven safe to do so. Else, broadcast will be inside |
1551 | 52.8k | // vector loop body. |
1552 | 52.8k | Instruction *Instr = dyn_cast<Instruction>(V); |
1553 | 52.8k | bool SafeToHoist = OrigLoop->isLoopInvariant(V) && |
1554 | 52.8k | (!Instr || |
1555 | 52.8k | DT->dominates(Instr->getParent(), LoopVectorPreHeader)14.1k ); |
1556 | 52.8k | // Place the code for broadcasting invariant variables in the new preheader. |
1557 | 52.8k | IRBuilder<>::InsertPointGuard Guard(Builder); |
1558 | 52.8k | if (SafeToHoist) |
1559 | 46.6k | Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
1560 | 52.8k | |
1561 | 52.8k | // Broadcast the scalar into all locations in the vector. |
1562 | 52.8k | Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); |
1563 | 52.8k | |
1564 | 52.8k | return Shuf; |
1565 | 52.8k | } |
1566 | | |
1567 | | void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( |
1568 | 11.2k | const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { |
1569 | 11.2k | assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && |
1570 | 11.2k | "Expected either an induction phi-node or a truncate of it!"); |
1571 | 11.2k | Value *Start = II.getStartValue(); |
1572 | 11.2k | |
1573 | 11.2k | // Construct the initial value of the vector IV in the vector loop preheader |
1574 | 11.2k | auto CurrIP = Builder.saveIP(); |
1575 | 11.2k | Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
1576 | 11.2k | if (isa<TruncInst>(EntryVal)) { |
1577 | 2.39k | assert(Start->getType()->isIntegerTy() && |
1578 | 2.39k | "Truncation requires an integer type"); |
1579 | 2.39k | auto *TruncType = cast<IntegerType>(EntryVal->getType()); |
1580 | 2.39k | Step = Builder.CreateTrunc(Step, TruncType); |
1581 | 2.39k | Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); |
1582 | 2.39k | } |
1583 | 11.2k | Value *SplatStart = Builder.CreateVectorSplat(VF, Start); |
1584 | 11.2k | Value *SteppedStart = |
1585 | 11.2k | getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); |
1586 | 11.2k | |
1587 | 11.2k | // We create vector phi nodes for both integer and floating-point induction |
1588 | 11.2k | // variables. Here, we determine the kind of arithmetic we will perform. |
1589 | 11.2k | Instruction::BinaryOps AddOp; |
1590 | 11.2k | Instruction::BinaryOps MulOp; |
1591 | 11.2k | if (Step->getType()->isIntegerTy()) { |
1592 | 11.1k | AddOp = Instruction::Add; |
1593 | 11.1k | MulOp = Instruction::Mul; |
1594 | 11.1k | } else { |
1595 | 20 | AddOp = II.getInductionOpcode(); |
1596 | 20 | MulOp = Instruction::FMul; |
1597 | 20 | } |
1598 | 11.2k | |
1599 | 11.2k | // Multiply the vectorization factor by the step using integer or |
1600 | 11.2k | // floating-point arithmetic as appropriate. |
1601 | 11.2k | Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); |
1602 | 11.2k | Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); |
1603 | 11.2k | |
1604 | 11.2k | // Create a vector splat to use in the induction update. |
1605 | 11.2k | // |
1606 | 11.2k | // FIXME: If the step is non-constant, we create the vector splat with |
1607 | 11.2k | // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't |
1608 | 11.2k | // handle a constant vector splat. |
1609 | 11.2k | Value *SplatVF = isa<Constant>(Mul) |
1610 | 11.2k | ? ConstantVector::getSplat(VF, cast<Constant>(Mul))11.1k |
1611 | 11.2k | : Builder.CreateVectorSplat(VF, Mul)32 ; |
1612 | 11.2k | Builder.restoreIP(CurrIP); |
1613 | 11.2k | |
1614 | 11.2k | // We may need to add the step a number of times, depending on the unroll |
1615 | 11.2k | // factor. The last of those goes into the PHI. |
1616 | 11.2k | PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", |
1617 | 11.2k | &*LoopVectorBody->getFirstInsertionPt()); |
1618 | 11.2k | VecInd->setDebugLoc(EntryVal->getDebugLoc()); |
1619 | 11.2k | Instruction *LastInduction = VecInd; |
1620 | 32.9k | for (unsigned Part = 0; Part < UF; ++Part21.7k ) { |
1621 | 21.7k | VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); |
1622 | 21.7k | |
1623 | 21.7k | if (isa<TruncInst>(EntryVal)) |
1624 | 4.46k | addMetadata(LastInduction, EntryVal); |
1625 | 21.7k | recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); |
1626 | 21.7k | |
1627 | 21.7k | LastInduction = cast<Instruction>(addFastMathFlag( |
1628 | 21.7k | Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); |
1629 | 21.7k | LastInduction->setDebugLoc(EntryVal->getDebugLoc()); |
1630 | 21.7k | } |
1631 | 11.2k | |
1632 | 11.2k | // Move the last step to the end of the latch block. This ensures consistent |
1633 | 11.2k | // placement of all induction updates. |
1634 | 11.2k | auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); |
1635 | 11.2k | auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); |
1636 | 11.2k | auto *ICmp = cast<Instruction>(Br->getCondition()); |
1637 | 11.2k | LastInduction->moveBefore(ICmp); |
1638 | 11.2k | LastInduction->setName("vec.ind.next"); |
1639 | 11.2k | |
1640 | 11.2k | VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); |
1641 | 11.2k | VecInd->addIncoming(LastInduction, LoopVectorLatch); |
1642 | 11.2k | } |
1643 | | |
1644 | 48.8k | bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { |
1645 | 48.8k | return Cost->isScalarAfterVectorization(I, VF) || |
1646 | 48.8k | Cost->isProfitableToScalarize(I, VF)27.8k ; |
1647 | 48.8k | } |
1648 | | |
1649 | 17.4k | bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { |
1650 | 17.4k | if (shouldScalarizeInstruction(IV)) |
1651 | 6.20k | return true; |
1652 | 13.9k | auto isScalarInst = [&](User *U) -> bool 11.2k { |
1653 | 13.9k | auto *I = cast<Instruction>(U); |
1654 | 13.9k | return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)13.9k ); |
1655 | 13.9k | }; |
1656 | 11.2k | return llvm::any_of(IV->users(), isScalarInst); |
1657 | 11.2k | } |
1658 | | |
1659 | | void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( |
1660 | | const InductionDescriptor &ID, const Instruction *EntryVal, |
1661 | 96.1k | Value *VectorLoopVal, unsigned Part, unsigned Lane) { |
1662 | 96.1k | assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && |
1663 | 96.1k | "Expected either an induction phi-node or a truncate of it!"); |
1664 | 96.1k | |
1665 | 96.1k | // This induction variable is not the phi from the original loop but the |
1666 | 96.1k | // newly-created IV based on the proof that casted Phi is equal to the |
1667 | 96.1k | // uncasted Phi in the vectorized loop (under a runtime guard possibly). It |
1668 | 96.1k | // re-uses the same InductionDescriptor that original IV uses but we don't |
1669 | 96.1k | // have to do any recording in this case - that is done when original IV is |
1670 | 96.1k | // processed. |
1671 | 96.1k | if (isa<TruncInst>(EntryVal)) |
1672 | 4.81k | return; |
1673 | 91.3k | |
1674 | 91.3k | const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); |
1675 | 91.3k | if (Casts.empty()) |
1676 | 91.3k | return; |
1677 | 24 | // Only the first Cast instruction in the Casts vector is of interest. |
1678 | 24 | // The rest of the Casts (if exist) have no uses outside the |
1679 | 24 | // induction update chain itself. |
1680 | 24 | Instruction *CastInst = *Casts.begin(); |
1681 | 24 | if (Lane < UINT_MAX) |
1682 | 24 | VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal)4 ; |
1683 | 20 | else |
1684 | 20 | VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); |
1685 | 24 | } |
1686 | | |
1687 | 19.2k | void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { |
1688 | 19.2k | assert((IV->getType()->isIntegerTy() || IV != OldInduction) && |
1689 | 19.2k | "Primary induction variable must have an integer type"); |
1690 | 19.2k | |
1691 | 19.2k | auto II = Legal->getInductionVars()->find(IV); |
1692 | 19.2k | assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); |
1693 | 19.2k | |
1694 | 19.2k | auto ID = II->second; |
1695 | 19.2k | assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); |
1696 | 19.2k | |
1697 | 19.2k | // The scalar value to broadcast. This will be derived from the canonical |
1698 | 19.2k | // induction variable. |
1699 | 19.2k | Value *ScalarIV = nullptr; |
1700 | 19.2k | |
1701 | 19.2k | // The value from the original loop to which we are mapping the new induction |
1702 | 19.2k | // variable. |
1703 | 19.2k | Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc)2.50k : IV16.7k ; |
1704 | 19.2k | |
1705 | 19.2k | // True if we have vectorized the induction variable. |
1706 | 19.2k | auto VectorizedIV = false; |
1707 | 19.2k | |
1708 | 19.2k | // Determine if we want a scalar version of the induction variable. This is |
1709 | 19.2k | // true if the induction variable itself is not widened, or if it has at |
1710 | 19.2k | // least one user in the loop that is not widened. |
1711 | 19.2k | auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal)17.4k ; |
1712 | 19.2k | |
1713 | 19.2k | // Generate code for the induction step. Note that induction steps are |
1714 | 19.2k | // required to be loop-invariant |
1715 | 19.2k | assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && |
1716 | 19.2k | "Induction step should be loop invariant"); |
1717 | 19.2k | auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); |
1718 | 19.2k | Value *Step = nullptr; |
1719 | 19.2k | if (PSE.getSE()->isSCEVable(IV->getType())) { |
1720 | 19.2k | SCEVExpander Exp(*PSE.getSE(), DL, "induction"); |
1721 | 19.2k | Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), |
1722 | 19.2k | LoopVectorPreHeader->getTerminator()); |
1723 | 19.2k | } else { |
1724 | 26 | Step = cast<SCEVUnknown>(ID.getStep())->getValue(); |
1725 | 26 | } |
1726 | 19.2k | |
1727 | 19.2k | // Try to create a new independent vector induction variable. If we can't |
1728 | 19.2k | // create the phi node, we will splat the scalar induction variable in each |
1729 | 19.2k | // loop iteration. |
1730 | 19.2k | if (VF > 1 && !shouldScalarizeInstruction(EntryVal)17.4k ) { |
1731 | 11.2k | createVectorIntOrFpInductionPHI(ID, Step, EntryVal); |
1732 | 11.2k | VectorizedIV = true; |
1733 | 11.2k | } |
1734 | 19.2k | |
1735 | 19.2k | // If we haven't yet vectorized the induction variable, or if we will create |
1736 | 19.2k | // a scalar one, we need to define the scalar induction variable and step |
1737 | 19.2k | // values. If we were given a truncation type, truncate the canonical |
1738 | 19.2k | // induction variable and step. Otherwise, derive these values from the |
1739 | 19.2k | // induction descriptor. |
1740 | 19.2k | if (!VectorizedIV || NeedsScalarIV11.2k ) { |
1741 | 16.6k | ScalarIV = Induction; |
1742 | 16.6k | if (IV != OldInduction) { |
1743 | 3.31k | ScalarIV = IV->getType()->isIntegerTy() |
1744 | 3.31k | ? Builder.CreateSExtOrTrunc(Induction, IV->getType())3.30k |
1745 | 3.31k | : Builder.CreateCast(Instruction::SIToFP, Induction, |
1746 | 9 | IV->getType()); |
1747 | 3.31k | ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); |
1748 | 3.31k | ScalarIV->setName("offset.idx"); |
1749 | 3.31k | } |
1750 | 16.6k | if (Trunc) { |
1751 | 118 | auto *TruncType = cast<IntegerType>(Trunc->getType()); |
1752 | 118 | assert(Step->getType()->isIntegerTy() && |
1753 | 118 | "Truncation requires an integer step"); |
1754 | 118 | ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); |
1755 | 118 | Step = Builder.CreateTrunc(Step, TruncType); |
1756 | 118 | } |
1757 | 16.6k | } |
1758 | 19.2k | |
1759 | 19.2k | // If we haven't yet vectorized the induction variable, splat the scalar |
1760 | 19.2k | // induction variable, and build the necessary step vectors. |
1761 | 19.2k | // TODO: Don't do it unless the vectorized IV is really required. |
1762 | 19.2k | if (!VectorizedIV) { |
1763 | 8.01k | Value *Broadcasted = getBroadcastInstrs(ScalarIV); |
1764 | 23.1k | for (unsigned Part = 0; Part < UF; ++Part15.1k ) { |
1765 | 15.1k | Value *EntryPart = |
1766 | 15.1k | getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); |
1767 | 15.1k | VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); |
1768 | 15.1k | if (Trunc) |
1769 | 199 | addMetadata(EntryPart, Trunc); |
1770 | 15.1k | recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); |
1771 | 15.1k | } |
1772 | 8.01k | } |
1773 | 19.2k | |
1774 | 19.2k | // If an induction variable is only used for counting loop iterations or |
1775 | 19.2k | // calculating addresses, it doesn't need to be widened. Create scalar steps |
1776 | 19.2k | // that can be used by instructions we will later scalarize. Note that the |
1777 | 19.2k | // addition of the scalar steps will not increase the number of instructions |
1778 | 19.2k | // in the loop in the common case prior to InstCombine. We will be trading |
1779 | 19.2k | // one vector extract for each scalar step. |
1780 | 19.2k | if (NeedsScalarIV) |
1781 | 14.8k | buildScalarSteps(ScalarIV, Step, EntryVal, ID); |
1782 | 19.2k | } |
1783 | | |
1784 | | Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, |
1785 | 22.7k | Instruction::BinaryOps BinOp) { |
1786 | 22.7k | // Create and check the types. |
1787 | 22.7k | assert(Val->getType()->isVectorTy() && "Must be a vector"); |
1788 | 22.7k | int VLen = Val->getType()->getVectorNumElements(); |
1789 | 22.7k | |
1790 | 22.7k | Type *STy = Val->getType()->getScalarType(); |
1791 | 22.7k | assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && |
1792 | 22.7k | "Induction Step must be an integer or FP"); |
1793 | 22.7k | assert(Step->getType() == STy && "Step has wrong type"); |
1794 | 22.7k | |
1795 | 22.7k | SmallVector<Constant *, 8> Indices; |
1796 | 22.7k | |
1797 | 22.7k | if (STy->isIntegerTy()) { |
1798 | 22.7k | // Create a vector of consecutive numbers from zero to VF. |
1799 | 111k | for (int i = 0; i < VLen; ++i89.0k ) |
1800 | 89.0k | Indices.push_back(ConstantInt::get(STy, StartIdx + i)); |
1801 | 22.7k | |
1802 | 22.7k | // Add the consecutive indices to the vector value. |
1803 | 22.7k | Constant *Cv = ConstantVector::get(Indices); |
1804 | 22.7k | assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); |
1805 | 22.7k | Step = Builder.CreateVectorSplat(VLen, Step); |
1806 | 22.7k | assert(Step->getType() == Val->getType() && "Invalid step vec"); |
1807 | 22.7k | // FIXME: The newly created binary instructions should contain nsw/nuw flags, |
1808 | 22.7k | // which can be found from the original scalar operations. |
1809 | 22.7k | Step = Builder.CreateMul(Cv, Step); |
1810 | 22.7k | return Builder.CreateAdd(Val, Step, "induction"); |
1811 | 22.7k | } |
1812 | 20 | |
1813 | 20 | // Floating point induction. |
1814 | 20 | assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && |
1815 | 20 | "Binary Opcode should be specified for FP induction"); |
1816 | 20 | // Create a vector of consecutive numbers from zero to VF. |
1817 | 92 | for (int i = 0; i < VLen; ++i72 ) |
1818 | 72 | Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); |
1819 | 20 | |
1820 | 20 | // Add the consecutive indices to the vector value. |
1821 | 20 | Constant *Cv = ConstantVector::get(Indices); |
1822 | 20 | |
1823 | 20 | Step = Builder.CreateVectorSplat(VLen, Step); |
1824 | 20 | |
1825 | 20 | // Floating point operations had to be 'fast' to enable the induction. |
1826 | 20 | FastMathFlags Flags; |
1827 | 20 | Flags.setFast(); |
1828 | 20 | |
1829 | 20 | Value *MulOp = Builder.CreateFMul(Cv, Step); |
1830 | 20 | if (isa<Instruction>(MulOp)) |
1831 | 6 | // Have to check, MulOp may be a constant |
1832 | 6 | cast<Instruction>(MulOp)->setFastMathFlags(Flags); |
1833 | 20 | |
1834 | 20 | Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); |
1835 | 20 | if (isa<Instruction>(BOp)) |
1836 | 9 | cast<Instruction>(BOp)->setFastMathFlags(Flags); |
1837 | 20 | return BOp; |
1838 | 20 | } |
1839 | | |
1840 | | void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, |
1841 | | Instruction *EntryVal, |
1842 | 14.8k | const InductionDescriptor &ID) { |
1843 | 14.8k | // We shouldn't have to build scalar steps if we aren't vectorizing. |
1844 | 14.8k | assert(VF > 1 && "VF should be greater than one"); |
1845 | 14.8k | |
1846 | 14.8k | // Get the value type and ensure it and the step have the same integer type. |
1847 | 14.8k | Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); |
1848 | 14.8k | assert(ScalarIVTy == Step->getType() && |
1849 | 14.8k | "Val and Step should have the same type"); |
1850 | 14.8k | |
1851 | 14.8k | // We build scalar steps for both integer and floating-point induction |
1852 | 14.8k | // variables. Here, we determine the kind of arithmetic we will perform. |
1853 | 14.8k | Instruction::BinaryOps AddOp; |
1854 | 14.8k | Instruction::BinaryOps MulOp; |
1855 | 14.8k | if (ScalarIVTy->isIntegerTy()) { |
1856 | 14.8k | AddOp = Instruction::Add; |
1857 | 14.8k | MulOp = Instruction::Mul; |
1858 | 14.8k | } else { |
1859 | 3 | AddOp = ID.getInductionOpcode(); |
1860 | 3 | MulOp = Instruction::FMul; |
1861 | 3 | } |
1862 | 14.8k | |
1863 | 14.8k | // Determine the number of scalars we need to generate for each unroll |
1864 | 14.8k | // iteration. If EntryVal is uniform, we only need to generate the first |
1865 | 14.8k | // lane. Otherwise, we generate all VF values. |
1866 | 14.8k | unsigned Lanes = |
1867 | 14.8k | Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 16.11k |
1868 | 14.8k | : VF8.71k ; |
1869 | 14.8k | // Compute the scalar steps and save the results in VectorLoopValueMap. |
1870 | 43.2k | for (unsigned Part = 0; Part < UF; ++Part28.4k ) { |
1871 | 87.7k | for (unsigned Lane = 0; Lane < Lanes; ++Lane59.2k ) { |
1872 | 59.2k | auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); |
1873 | 59.2k | auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); |
1874 | 59.2k | auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); |
1875 | 59.2k | VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); |
1876 | 59.2k | recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); |
1877 | 59.2k | } |
1878 | 28.4k | } |
1879 | 14.8k | } |
1880 | | |
1881 | 197k | Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { |
1882 | 197k | assert(V != Induction && "The new induction variable should not be used."); |
1883 | 197k | assert(!V->getType()->isVectorTy() && "Can't widen a vector"); |
1884 | 197k | assert(!V->getType()->isVoidTy() && "Type does not produce a value"); |
1885 | 197k | |
1886 | 197k | // If we have a stride that is replaced by one, do it here. Defer this for |
1887 | 197k | // the VPlan-native path until we start running Legal checks in that path. |
1888 | 197k | if (!EnableVPlanNativePath && Legal->hasStride(V)197k ) |
1889 | 0 | V = ConstantInt::get(V->getType(), 1); |
1890 | 197k | |
1891 | 197k | // If we have a vector mapped to this value, return it. |
1892 | 197k | if (VectorLoopValueMap.hasVectorValue(V, Part)) |
1893 | 149k | return VectorLoopValueMap.getVectorValue(V, Part); |
1894 | 48.1k | |
1895 | 48.1k | // If the value has not been vectorized, check if it has been scalarized |
1896 | 48.1k | // instead. If it has been scalarized, and we actually need the value in |
1897 | 48.1k | // vector form, we will construct the vector values on demand. |
1898 | 48.1k | if (VectorLoopValueMap.hasAnyScalarValue(V)) { |
1899 | 1.49k | Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); |
1900 | 1.49k | |
1901 | 1.49k | // If we've scalarized a value, that value should be an instruction. |
1902 | 1.49k | auto *I = cast<Instruction>(V); |
1903 | 1.49k | |
1904 | 1.49k | // If we aren't vectorizing, we can just copy the scalar map values over to |
1905 | 1.49k | // the vector map. |
1906 | 1.49k | if (VF == 1) { |
1907 | 980 | VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); |
1908 | 980 | return ScalarValue; |
1909 | 980 | } |
1910 | 519 | |
1911 | 519 | // Get the last scalar instruction we generated for V and Part. If the value |
1912 | 519 | // is known to be uniform after vectorization, this corresponds to lane zero |
1913 | 519 | // of the Part unroll iteration. Otherwise, the last instruction is the one |
1914 | 519 | // we created for the last vector lane of the Part unroll iteration. |
1915 | 519 | unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 00 : VF - 1; |
1916 | 519 | auto *LastInst = cast<Instruction>( |
1917 | 519 | VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); |
1918 | 519 | |
1919 | 519 | // Set the insert point after the last scalarized instruction. This ensures |
1920 | 519 | // the insertelement sequence will directly follow the scalar definitions. |
1921 | 519 | auto OldIP = Builder.saveIP(); |
1922 | 519 | auto NewIP = std::next(BasicBlock::iterator(LastInst)); |
1923 | 519 | Builder.SetInsertPoint(&*NewIP); |
1924 | 519 | |
1925 | 519 | // However, if we are vectorizing, we need to construct the vector values. |
1926 | 519 | // If the value is known to be uniform after vectorization, we can just |
1927 | 519 | // broadcast the scalar value corresponding to lane zero for each unroll |
1928 | 519 | // iteration. Otherwise, we construct the vector values using insertelement |
1929 | 519 | // instructions. Since the resulting vectors are stored in |
1930 | 519 | // VectorLoopValueMap, we will only generate the insertelements once. |
1931 | 519 | Value *VectorValue = nullptr; |
1932 | 519 | if (Cost->isUniformAfterVectorization(I, VF)) { |
1933 | 0 | VectorValue = getBroadcastInstrs(ScalarValue); |
1934 | 0 | VectorLoopValueMap.setVectorValue(V, Part, VectorValue); |
1935 | 519 | } else { |
1936 | 519 | // Initialize packing with insertelements to start from undef. |
1937 | 519 | Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); |
1938 | 519 | VectorLoopValueMap.setVectorValue(V, Part, Undef); |
1939 | 2.27k | for (unsigned Lane = 0; Lane < VF; ++Lane1.75k ) |
1940 | 1.75k | packScalarIntoVectorValue(V, {Part, Lane}); |
1941 | 519 | VectorValue = VectorLoopValueMap.getVectorValue(V, Part); |
1942 | 519 | } |
1943 | 519 | Builder.restoreIP(OldIP); |
1944 | 519 | return VectorValue; |
1945 | 519 | } |
1946 | 46.6k | |
1947 | 46.6k | // If this scalar is unknown, assume that it is a constant or that it is |
1948 | 46.6k | // loop invariant. Broadcast V and save the value for future uses. |
1949 | 46.6k | Value *B = getBroadcastInstrs(V); |
1950 | 46.6k | VectorLoopValueMap.setVectorValue(V, Part, B); |
1951 | 46.6k | return B; |
1952 | 46.6k | } |
1953 | | |
1954 | | Value * |
1955 | | InnerLoopVectorizer::getOrCreateScalarValue(Value *V, |
1956 | 189k | const VPIteration &Instance) { |
1957 | 189k | // If the value is not an instruction contained in the loop, it should |
1958 | 189k | // already be scalar. |
1959 | 189k | if (OrigLoop->isLoopInvariant(V)) |
1960 | 91.7k | return V; |
1961 | 98.1k | |
1962 | 98.1k | assert(Instance.Lane > 0 |
1963 | 98.1k | ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) |
1964 | 98.1k | : true && "Uniform values only have lane zero"); |
1965 | 98.1k | |
1966 | 98.1k | // If the value from the original loop has not been vectorized, it is |
1967 | 98.1k | // represented by UF x VF scalar values in the new loop. Return the requested |
1968 | 98.1k | // scalar value. |
1969 | 98.1k | if (VectorLoopValueMap.hasScalarValue(V, Instance)) |
1970 | 92.2k | return VectorLoopValueMap.getScalarValue(V, Instance); |
1971 | 5.90k | |
1972 | 5.90k | // If the value has not been scalarized, get its entry in VectorLoopValueMap |
1973 | 5.90k | // for the given unroll part. If this entry is not a vector type (i.e., the |
1974 | 5.90k | // vectorization factor is one), there is no need to generate an |
1975 | 5.90k | // extractelement instruction. |
1976 | 5.90k | auto *U = getOrCreateVectorValue(V, Instance.Part); |
1977 | 5.90k | if (!U->getType()->isVectorTy()) { |
1978 | 4.27k | assert(VF == 1 && "Value not scalarized has non-vector type"); |
1979 | 4.27k | return U; |
1980 | 4.27k | } |
1981 | 1.63k | |
1982 | 1.63k | // Otherwise, the value from the original loop has been vectorized and is |
1983 | 1.63k | // represented by UF vector values. Extract and return the requested scalar |
1984 | 1.63k | // value from the appropriate vector lane. |
1985 | 1.63k | return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); |
1986 | 1.63k | } |
1987 | | |
1988 | | void InnerLoopVectorizer::packScalarIntoVectorValue( |
1989 | 1.93k | Value *V, const VPIteration &Instance) { |
1990 | 1.93k | assert(V != Induction && "The new induction variable should not be used."); |
1991 | 1.93k | assert(!V->getType()->isVectorTy() && "Can't pack a vector"); |
1992 | 1.93k | assert(!V->getType()->isVoidTy() && "Type does not produce a value"); |
1993 | 1.93k | |
1994 | 1.93k | Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); |
1995 | 1.93k | Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); |
1996 | 1.93k | VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, |
1997 | 1.93k | Builder.getInt32(Instance.Lane)); |
1998 | 1.93k | VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); |
1999 | 1.93k | } |
2000 | | |
2001 | 1.24k | Value *InnerLoopVectorizer::reverseVector(Value *Vec) { |
2002 | 1.24k | assert(Vec->getType()->isVectorTy() && "Invalid type"); |
2003 | 1.24k | SmallVector<Constant *, 8> ShuffleMask; |
2004 | 14.3k | for (unsigned i = 0; i < VF; ++i13.1k ) |
2005 | 13.1k | ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); |
2006 | 1.24k | |
2007 | 1.24k | return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), |
2008 | 1.24k | ConstantVector::get(ShuffleMask), |
2009 | 1.24k | "reverse"); |
2010 | 1.24k | } |
2011 | | |
2012 | | // Return whether we allow using masked interleave-groups (for dealing with |
2013 | | // strided loads/stores that reside in predicated blocks, or for dealing |
2014 | | // with gaps). |
2015 | 19.6k | static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { |
2016 | 19.6k | // If an override option has been passed in for interleaved accesses, use it. |
2017 | 19.6k | if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) |
2018 | 24 | return EnableMaskedInterleavedMemAccesses; |
2019 | 19.5k | |
2020 | 19.5k | return TTI.enableMaskedInterleavedAccessVectorization(); |
2021 | 19.5k | } |
2022 | | |
2023 | | // Try to vectorize the interleave group that \p Instr belongs to. |
2024 | | // |
2025 | | // E.g. Translate following interleaved load group (factor = 3): |
2026 | | // for (i = 0; i < N; i+=3) { |
2027 | | // R = Pic[i]; // Member of index 0 |
2028 | | // G = Pic[i+1]; // Member of index 1 |
2029 | | // B = Pic[i+2]; // Member of index 2 |
2030 | | // ... // do something to R, G, B |
2031 | | // } |
2032 | | // To: |
2033 | | // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
2034 | | // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements |
2035 | | // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements |
2036 | | // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements |
2037 | | // |
2038 | | // Or translate following interleaved store group (factor = 3): |
2039 | | // for (i = 0; i < N; i+=3) { |
2040 | | // ... do something to R, G, B |
2041 | | // Pic[i] = R; // Member of index 0 |
2042 | | // Pic[i+1] = G; // Member of index 1 |
2043 | | // Pic[i+2] = B; // Member of index 2 |
2044 | | // } |
2045 | | // To: |
2046 | | // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
2047 | | // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> |
2048 | | // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
2049 | | // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
2050 | | // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
2051 | | void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, |
2052 | 669 | VectorParts *BlockInMask) { |
2053 | 669 | const InterleaveGroup<Instruction> *Group = |
2054 | 669 | Cost->getInterleavedAccessGroup(Instr); |
2055 | 669 | assert(Group && "Fail to get an interleaved access group."); |
2056 | 669 | |
2057 | 669 | // Skip if current instruction is not the insert position. |
2058 | 669 | if (Instr != Group->getInsertPos()) |
2059 | 0 | return; |
2060 | 669 | |
2061 | 669 | const DataLayout &DL = Instr->getModule()->getDataLayout(); |
2062 | 669 | Value *Ptr = getLoadStorePointerOperand(Instr); |
2063 | 669 | |
2064 | 669 | // Prepare for the vector type of the interleaved load/store. |
2065 | 669 | Type *ScalarTy = getMemInstValueType(Instr); |
2066 | 669 | unsigned InterleaveFactor = Group->getFactor(); |
2067 | 669 | Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); |
2068 | 669 | Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr)); |
2069 | 669 | |
2070 | 669 | // Prepare for the new pointers. |
2071 | 669 | setDebugLocFromInst(Builder, Ptr); |
2072 | 669 | SmallVector<Value *, 2> NewPtrs; |
2073 | 669 | unsigned Index = Group->getIndex(Instr); |
2074 | 669 | |
2075 | 669 | VectorParts Mask; |
2076 | 669 | bool IsMaskForCondRequired = BlockInMask; |
2077 | 669 | if (IsMaskForCondRequired) { |
2078 | 11 | Mask = *BlockInMask; |
2079 | 11 | // TODO: extend the masked interleaved-group support to reversed access. |
2080 | 11 | assert(!Group->isReverse() && "Reversed masked interleave-group " |
2081 | 11 | "not supported."); |
2082 | 11 | } |
2083 | 669 | |
2084 | 669 | // If the group is reverse, adjust the index to refer to the last vector lane |
2085 | 669 | // instead of the first. We adjust the index from the first vector lane, |
2086 | 669 | // rather than directly getting the pointer for lane VF - 1, because the |
2087 | 669 | // pointer operand of the interleaved access is supposed to be uniform. For |
2088 | 669 | // uniform instructions, we're only required to generate a value for the |
2089 | 669 | // first vector lane in each unroll iteration. |
2090 | 669 | if (Group->isReverse()) |
2091 | 4 | Index += (VF - 1) * Group->getFactor(); |
2092 | 669 | |
2093 | 669 | bool InBounds = false; |
2094 | 669 | if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) |
2095 | 567 | InBounds = gep->isInBounds(); |
2096 | 669 | |
2097 | 1.76k | for (unsigned Part = 0; Part < UF; Part++1.09k ) { |
2098 | 1.09k | Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); |
2099 | 1.09k | |
2100 | 1.09k | // Notice current instruction could be any index. Need to adjust the address |
2101 | 1.09k | // to the member of index 0. |
2102 | 1.09k | // |
2103 | 1.09k | // E.g. a = A[i+1]; // Member of index 1 (Current instruction) |
2104 | 1.09k | // b = A[i]; // Member of index 0 |
2105 | 1.09k | // Current pointer is pointed to A[i+1], adjust it to A[i]. |
2106 | 1.09k | // |
2107 | 1.09k | // E.g. A[i+1] = a; // Member of index 1 |
2108 | 1.09k | // A[i] = b; // Member of index 0 |
2109 | 1.09k | // A[i+2] = c; // Member of index 2 (Current instruction) |
2110 | 1.09k | // Current pointer is pointed to A[i+2], adjust it to A[i]. |
2111 | 1.09k | NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index)); |
2112 | 1.09k | if (InBounds) |
2113 | 860 | cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true); |
2114 | 1.09k | |
2115 | 1.09k | // Cast to the vector pointer type. |
2116 | 1.09k | NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); |
2117 | 1.09k | } |
2118 | 669 | |
2119 | 669 | setDebugLocFromInst(Builder, Instr); |
2120 | 669 | Value *UndefVec = UndefValue::get(VecTy); |
2121 | 669 | |
2122 | 669 | Value *MaskForGaps = nullptr; |
2123 | 669 | if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()68 ) { |
2124 | 5 | MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); |
2125 | 5 | assert(MaskForGaps && "Mask for Gaps is required but it is null"); |
2126 | 5 | } |
2127 | 669 | |
2128 | 669 | // Vectorize the interleaved load group. |
2129 | 669 | if (isa<LoadInst>(Instr)) { |
2130 | 324 | // For each unroll part, create a wide load for the group. |
2131 | 324 | SmallVector<Value *, 2> NewLoads; |
2132 | 841 | for (unsigned Part = 0; Part < UF; Part++517 ) { |
2133 | 517 | Instruction *NewLoad; |
2134 | 517 | if (IsMaskForCondRequired || MaskForGaps509 ) { |
2135 | 9 | assert(useMaskedInterleavedAccesses(*TTI) && |
2136 | 9 | "masked interleaved groups are not allowed."); |
2137 | 9 | Value *GroupMask = MaskForGaps; |
2138 | 9 | if (IsMaskForCondRequired) { |
2139 | 8 | auto *Undefs = UndefValue::get(Mask[Part]->getType()); |
2140 | 8 | auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); |
2141 | 8 | Value *ShuffledMask = Builder.CreateShuffleVector( |
2142 | 8 | Mask[Part], Undefs, RepMask, "interleaved.mask"); |
2143 | 8 | GroupMask = MaskForGaps |
2144 | 8 | ? Builder.CreateBinOp(Instruction::And, ShuffledMask, |
2145 | 4 | MaskForGaps) |
2146 | 8 | : ShuffledMask4 ; |
2147 | 8 | } |
2148 | 9 | NewLoad = |
2149 | 9 | Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), |
2150 | 9 | GroupMask, UndefVec, "wide.masked.vec"); |
2151 | 9 | } |
2152 | 508 | else |
2153 | 508 | NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part], |
2154 | 508 | Group->getAlignment(), "wide.vec"); |
2155 | 517 | Group->addMetadata(NewLoad); |
2156 | 517 | NewLoads.push_back(NewLoad); |
2157 | 517 | } |
2158 | 324 | |
2159 | 324 | // For each member in the group, shuffle out the appropriate data from the |
2160 | 324 | // wide loads. |
2161 | 1.17k | for (unsigned I = 0; I < InterleaveFactor; ++I849 ) { |
2162 | 849 | Instruction *Member = Group->getMember(I); |
2163 | 849 | |
2164 | 849 | // Skip the gaps in the group. |
2165 | 849 | if (!Member) |
2166 | 137 | continue; |
2167 | 712 | |
2168 | 712 | Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); |
2169 | 1.89k | for (unsigned Part = 0; Part < UF; Part++1.17k ) { |
2170 | 1.17k | Value *StridedVec = Builder.CreateShuffleVector( |
2171 | 1.17k | NewLoads[Part], UndefVec, StrideMask, "strided.vec"); |
2172 | 1.17k | |
2173 | 1.17k | // If this member has different type, cast the result type. |
2174 | 1.17k | if (Member->getType() != ScalarTy) { |
2175 | 7 | VectorType *OtherVTy = VectorType::get(Member->getType(), VF); |
2176 | 7 | StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); |
2177 | 7 | } |
2178 | 1.17k | |
2179 | 1.17k | if (Group->isReverse()) |
2180 | 6 | StridedVec = reverseVector(StridedVec); |
2181 | 1.17k | |
2182 | 1.17k | VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); |
2183 | 1.17k | } |
2184 | 712 | } |
2185 | 324 | return; |
2186 | 324 | } |
2187 | 345 | |
2188 | 345 | // The sub vector type for current instruction. |
2189 | 345 | VectorType *SubVT = VectorType::get(ScalarTy, VF); |
2190 | 345 | |
2191 | 345 | // Vectorize the interleaved store group. |
2192 | 922 | for (unsigned Part = 0; Part < UF; Part++577 ) { |
2193 | 577 | // Collect the stored vector from each member. |
2194 | 577 | SmallVector<Value *, 4> StoredVecs; |
2195 | 1.90k | for (unsigned i = 0; i < InterleaveFactor; i++1.33k ) { |
2196 | 1.33k | // Interleaved store group doesn't allow a gap, so each index has a member |
2197 | 1.33k | Instruction *Member = Group->getMember(i); |
2198 | 1.33k | assert(Member && "Fail to get a member from an interleaved store group"); |
2199 | 1.33k | |
2200 | 1.33k | Value *StoredVec = getOrCreateVectorValue( |
2201 | 1.33k | cast<StoreInst>(Member)->getValueOperand(), Part); |
2202 | 1.33k | if (Group->isReverse()) |
2203 | 2 | StoredVec = reverseVector(StoredVec); |
2204 | 1.33k | |
2205 | 1.33k | // If this member has different type, cast it to a unified type. |
2206 | 1.33k | |
2207 | 1.33k | if (StoredVec->getType() != SubVT) |
2208 | 7 | StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); |
2209 | 1.33k | |
2210 | 1.33k | StoredVecs.push_back(StoredVec); |
2211 | 1.33k | } |
2212 | 577 | |
2213 | 577 | // Concatenate all vectors into a wide vector. |
2214 | 577 | Value *WideVec = concatenateVectors(Builder, StoredVecs); |
2215 | 577 | |
2216 | 577 | // Interleave the elements in the wide vector. |
2217 | 577 | Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); |
2218 | 577 | Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, |
2219 | 577 | "interleaved.vec"); |
2220 | 577 | |
2221 | 577 | Instruction *NewStoreInstr; |
2222 | 577 | if (IsMaskForCondRequired) { |
2223 | 3 | auto *Undefs = UndefValue::get(Mask[Part]->getType()); |
2224 | 3 | auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); |
2225 | 3 | Value *ShuffledMask = Builder.CreateShuffleVector( |
2226 | 3 | Mask[Part], Undefs, RepMask, "interleaved.mask"); |
2227 | 3 | NewStoreInstr = Builder.CreateMaskedStore( |
2228 | 3 | IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); |
2229 | 3 | } |
2230 | 574 | else |
2231 | 574 | NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], |
2232 | 574 | Group->getAlignment()); |
2233 | 577 | |
2234 | 577 | Group->addMetadata(NewStoreInstr); |
2235 | 577 | } |
2236 | 345 | } |
2237 | | |
2238 | | void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, |
2239 | 21.6k | VectorParts *BlockInMask) { |
2240 | 21.6k | // Attempt to issue a wide load. |
2241 | 21.6k | LoadInst *LI = dyn_cast<LoadInst>(Instr); |
2242 | 21.6k | StoreInst *SI = dyn_cast<StoreInst>(Instr); |
2243 | 21.6k | |
2244 | 21.6k | assert((LI || SI) && "Invalid Load/Store instruction"); |
2245 | 21.6k | |
2246 | 21.6k | LoopVectorizationCostModel::InstWidening Decision = |
2247 | 21.6k | Cost->getWideningDecision(Instr, VF); |
2248 | 21.6k | assert(Decision != LoopVectorizationCostModel::CM_Unknown && |
2249 | 21.6k | "CM decision should be taken at this point"); |
2250 | 21.6k | if (Decision == LoopVectorizationCostModel::CM_Interleave) |
2251 | 0 | return vectorizeInterleaveGroup(Instr); |
2252 | 21.6k | |
2253 | 21.6k | Type *ScalarDataTy = getMemInstValueType(Instr); |
2254 | 21.6k | Type *DataTy = VectorType::get(ScalarDataTy, VF); |
2255 | 21.6k | Value *Ptr = getLoadStorePointerOperand(Instr); |
2256 | 21.6k | unsigned Alignment = getLoadStoreAlignment(Instr); |
2257 | 21.6k | // An alignment of 0 means target abi alignment. We need to use the scalar's |
2258 | 21.6k | // target abi alignment in such a case. |
2259 | 21.6k | const DataLayout &DL = Instr->getModule()->getDataLayout(); |
2260 | 21.6k | if (!Alignment) |
2261 | 79 | Alignment = DL.getABITypeAlignment(ScalarDataTy); |
2262 | 21.6k | unsigned AddressSpace = getLoadStoreAddressSpace(Instr); |
2263 | 21.6k | |
2264 | 21.6k | // Determine if the pointer operand of the access is either consecutive or |
2265 | 21.6k | // reverse consecutive. |
2266 | 21.6k | bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); |
2267 | 21.6k | bool ConsecutiveStride = |
2268 | 21.6k | Reverse || (Decision == LoopVectorizationCostModel::CM_Widen)21.0k ; |
2269 | 21.6k | bool CreateGatherScatter = |
2270 | 21.6k | (Decision == LoopVectorizationCostModel::CM_GatherScatter); |
2271 | 21.6k | |
2272 | 21.6k | // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector |
2273 | 21.6k | // gather/scatter. Otherwise Decision should have been to Scalarize. |
2274 | 21.6k | assert((ConsecutiveStride || CreateGatherScatter) && |
2275 | 21.6k | "The instruction should be scalarized"); |
2276 | 21.6k | |
2277 | 21.6k | // Handle consecutive loads/stores. |
2278 | 21.6k | if (ConsecutiveStride) |
2279 | 21.5k | Ptr = getOrCreateScalarValue(Ptr, {0, 0}); |
2280 | 21.6k | |
2281 | 21.6k | VectorParts Mask; |
2282 | 21.6k | bool isMaskRequired = BlockInMask; |
2283 | 21.6k | if (isMaskRequired) |
2284 | 95 | Mask = *BlockInMask; |
2285 | 21.6k | |
2286 | 21.6k | bool InBounds = false; |
2287 | 21.6k | if (auto *gep = dyn_cast<GetElementPtrInst>( |
2288 | 19.2k | getLoadStorePointerOperand(Instr)->stripPointerCasts())) |
2289 | 19.2k | InBounds = gep->isInBounds(); |
2290 | 21.6k | |
2291 | 40.2k | const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { |
2292 | 40.2k | // Calculate the pointer for the specific unroll-part. |
2293 | 40.2k | GetElementPtrInst *PartPtr = nullptr; |
2294 | 40.2k | |
2295 | 40.2k | if (Reverse) { |
2296 | 1.21k | // If the address is consecutive but reversed, then the |
2297 | 1.21k | // wide store needs to start at the last vector element. |
2298 | 1.21k | PartPtr = cast<GetElementPtrInst>( |
2299 | 1.21k | Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); |
2300 | 1.21k | PartPtr->setIsInBounds(InBounds); |
2301 | 1.21k | PartPtr = cast<GetElementPtrInst>( |
2302 | 1.21k | Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); |
2303 | 1.21k | PartPtr->setIsInBounds(InBounds); |
2304 | 1.21k | if (isMaskRequired) // Reverse of a null all-one mask is a null mask. |
2305 | 16 | Mask[Part] = reverseVector(Mask[Part]); |
2306 | 39.0k | } else { |
2307 | 39.0k | PartPtr = cast<GetElementPtrInst>( |
2308 | 39.0k | Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); |
2309 | 39.0k | PartPtr->setIsInBounds(InBounds); |
2310 | 39.0k | } |
2311 | 40.2k | |
2312 | 40.2k | return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); |
2313 | 40.2k | }; |
2314 | 21.6k | |
2315 | 21.6k | // Handle Stores: |
2316 | 21.6k | if (SI) { |
2317 | 14.4k | setDebugLocFromInst(Builder, SI); |
2318 | 14.4k | |
2319 | 42.2k | for (unsigned Part = 0; Part < UF; ++Part27.7k ) { |
2320 | 27.7k | Instruction *NewSI = nullptr; |
2321 | 27.7k | Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); |
2322 | 27.7k | if (CreateGatherScatter) { |
2323 | 29 | Value *MaskPart = isMaskRequired ? Mask[Part]12 : nullptr17 ; |
2324 | 29 | Value *VectorGep = getOrCreateVectorValue(Ptr, Part); |
2325 | 29 | NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, |
2326 | 29 | MaskPart); |
2327 | 27.7k | } else { |
2328 | 27.7k | if (Reverse) { |
2329 | 574 | // If we store to reverse consecutive memory locations, then we need |
2330 | 574 | // to reverse the order of elements in the stored value. |
2331 | 574 | StoredVal = reverseVector(StoredVal); |
2332 | 574 | // We don't want to update the value in the map as it might be used in |
2333 | 574 | // another expression. So don't call resetVectorValue(StoredVal). |
2334 | 574 | } |
2335 | 27.7k | auto *VecPtr = CreateVecPtr(Part, Ptr); |
2336 | 27.7k | if (isMaskRequired) |
2337 | 100 | NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, |
2338 | 100 | Mask[Part]); |
2339 | 27.6k | else |
2340 | 27.6k | NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); |
2341 | 27.7k | } |
2342 | 27.7k | addMetadata(NewSI, SI); |
2343 | 27.7k | } |
2344 | 14.4k | return; |
2345 | 14.4k | } |
2346 | 7.20k | |
2347 | 7.20k | // Handle loads. |
2348 | 7.20k | assert(LI && "Must have a load instruction"); |
2349 | 7.20k | setDebugLocFromInst(Builder, LI); |
2350 | 19.7k | for (unsigned Part = 0; Part < UF; ++Part12.5k ) { |
2351 | 12.5k | Value *NewLI; |
2352 | 12.5k | if (CreateGatherScatter) { |
2353 | 58 | Value *MaskPart = isMaskRequired ? Mask[Part]29 : nullptr29 ; |
2354 | 58 | Value *VectorGep = getOrCreateVectorValue(Ptr, Part); |
2355 | 58 | NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, |
2356 | 58 | nullptr, "wide.masked.gather"); |
2357 | 58 | addMetadata(NewLI, LI); |
2358 | 12.4k | } else { |
2359 | 12.4k | auto *VecPtr = CreateVecPtr(Part, Ptr); |
2360 | 12.4k | if (isMaskRequired) |
2361 | 80 | NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], |
2362 | 80 | UndefValue::get(DataTy), |
2363 | 80 | "wide.masked.load"); |
2364 | 12.4k | else |
2365 | 12.4k | NewLI = |
2366 | 12.4k | Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); |
2367 | 12.4k | |
2368 | 12.4k | // Add metadata to the load, but setVectorValue to the reverse shuffle. |
2369 | 12.4k | addMetadata(NewLI, LI); |
2370 | 12.4k | if (Reverse) |
2371 | 643 | NewLI = reverseVector(NewLI); |
2372 | 12.4k | } |
2373 | 12.5k | VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); |
2374 | 12.5k | } |
2375 | 7.20k | } |
2376 | | |
2377 | | void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, |
2378 | | const VPIteration &Instance, |
2379 | 71.6k | bool IfPredicateInstr) { |
2380 | 71.6k | assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); |
2381 | 71.6k | |
2382 | 71.6k | setDebugLocFromInst(Builder, Instr); |
2383 | 71.6k | |
2384 | 71.6k | // Does this instruction return a value ? |
2385 | 71.6k | bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
2386 | 71.6k | |
2387 | 71.6k | Instruction *Cloned = Instr->clone(); |
2388 | 71.6k | if (!IsVoidRetTy) |
2389 | 68.4k | Cloned->setName(Instr->getName() + ".cloned"); |
2390 | 71.6k | |
2391 | 71.6k | // Replace the operands of the cloned instructions with their scalar |
2392 | 71.6k | // equivalents in the new loop. |
2393 | 238k | for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op166k ) { |
2394 | 166k | auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); |
2395 | 166k | Cloned->setOperand(op, NewOp); |
2396 | 166k | } |
2397 | 71.6k | addNewMetadata(Cloned, Instr); |
2398 | 71.6k | |
2399 | 71.6k | // Place the cloned scalar in the new loop. |
2400 | 71.6k | Builder.Insert(Cloned); |
2401 | 71.6k | |
2402 | 71.6k | // Add the cloned scalar to the scalar map entry. |
2403 | 71.6k | VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); |
2404 | 71.6k | |
2405 | 71.6k | // If we just cloned a new assumption, add it the assumption cache. |
2406 | 71.6k | if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) |
2407 | 64 | if (II->getIntrinsicID() == Intrinsic::assume) |
2408 | 24 | AC->registerAssumption(II); |
2409 | 71.6k | |
2410 | 71.6k | // End if-block. |
2411 | 71.6k | if (IfPredicateInstr) |
2412 | 948 | PredicatedInstructions.push_back(Cloned); |
2413 | 71.6k | } |
2414 | | |
2415 | | PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, |
2416 | | Value *End, Value *Step, |
2417 | 17.0k | Instruction *DL) { |
2418 | 17.0k | BasicBlock *Header = L->getHeader(); |
2419 | 17.0k | BasicBlock *Latch = L->getLoopLatch(); |
2420 | 17.0k | // As we're just creating this loop, it's possible no latch exists |
2421 | 17.0k | // yet. If so, use the header as this will be a single block loop. |
2422 | 17.0k | if (!Latch) |
2423 | 17.0k | Latch = Header; |
2424 | 17.0k | |
2425 | 17.0k | IRBuilder<> Builder(&*Header->getFirstInsertionPt()); |
2426 | 17.0k | Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); |
2427 | 17.0k | setDebugLocFromInst(Builder, OldInst); |
2428 | 17.0k | auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); |
2429 | 17.0k | |
2430 | 17.0k | Builder.SetInsertPoint(Latch->getTerminator()); |
2431 | 17.0k | setDebugLocFromInst(Builder, OldInst); |
2432 | 17.0k | |
2433 | 17.0k | // Create i+1 and fill the PHINode. |
2434 | 17.0k | Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); |
2435 | 17.0k | Induction->addIncoming(Start, L->getLoopPreheader()); |
2436 | 17.0k | Induction->addIncoming(Next, Latch); |
2437 | 17.0k | // Create the compare. |
2438 | 17.0k | Value *ICmp = Builder.CreateICmpEQ(Next, End); |
2439 | 17.0k | Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); |
2440 | 17.0k | |
2441 | 17.0k | // Now we have two terminators. Remove the old one from the block. |
2442 | 17.0k | Latch->getTerminator()->eraseFromParent(); |
2443 | 17.0k | |
2444 | 17.0k | return Induction; |
2445 | 17.0k | } |
2446 | | |
2447 | 68.2k | Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { |
2448 | 68.2k | if (TripCount) |
2449 | 51.1k | return TripCount; |
2450 | 17.0k | |
2451 | 17.0k | assert(L && "Create Trip Count for null loop."); |
2452 | 17.0k | IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); |
2453 | 17.0k | // Find the loop boundaries. |
2454 | 17.0k | ScalarEvolution *SE = PSE.getSE(); |
2455 | 17.0k | const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); |
2456 | 17.0k | assert(BackedgeTakenCount != SE->getCouldNotCompute() && |
2457 | 17.0k | "Invalid loop count"); |
2458 | 17.0k | |
2459 | 17.0k | Type *IdxTy = Legal->getWidestInductionType(); |
2460 | 17.0k | assert(IdxTy && "No type for induction"); |
2461 | 17.0k | |
2462 | 17.0k | // The exit count might have the type of i64 while the phi is i32. This can |
2463 | 17.0k | // happen if we have an induction variable that is sign extended before the |
2464 | 17.0k | // compare. The only way that we get a backedge taken count is that the |
2465 | 17.0k | // induction variable was signed and as such will not overflow. In such a case |
2466 | 17.0k | // truncation is legal. |
2467 | 17.0k | if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > |
2468 | 17.0k | IdxTy->getPrimitiveSizeInBits()) |
2469 | 2 | BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); |
2470 | 17.0k | BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); |
2471 | 17.0k | |
2472 | 17.0k | // Get the total trip count from the count by adding 1. |
2473 | 17.0k | const SCEV *ExitCount = SE->getAddExpr( |
2474 | 17.0k | BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); |
2475 | 17.0k | |
2476 | 17.0k | const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); |
2477 | 17.0k | |
2478 | 17.0k | // Expand the trip count and place the new instructions in the preheader. |
2479 | 17.0k | // Notice that the pre-header does not change, only the loop body. |
2480 | 17.0k | SCEVExpander Exp(*SE, DL, "induction"); |
2481 | 17.0k | |
2482 | 17.0k | // Count holds the overall loop count (N). |
2483 | 17.0k | TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), |
2484 | 17.0k | L->getLoopPreheader()->getTerminator()); |
2485 | 17.0k | |
2486 | 17.0k | if (TripCount->getType()->isPointerTy()) |
2487 | 159 | TripCount = |
2488 | 159 | CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", |
2489 | 159 | L->getLoopPreheader()->getTerminator()); |
2490 | 17.0k | |
2491 | 17.0k | return TripCount; |
2492 | 17.0k | } |
2493 | | |
2494 | 37.3k | Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { |
2495 | 37.3k | if (VectorTripCount) |
2496 | 20.3k | return VectorTripCount; |
2497 | 17.0k | |
2498 | 17.0k | Value *TC = getOrCreateTripCount(L); |
2499 | 17.0k | IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); |
2500 | 17.0k | |
2501 | 17.0k | Type *Ty = TC->getType(); |
2502 | 17.0k | Constant *Step = ConstantInt::get(Ty, VF * UF); |
2503 | 17.0k | |
2504 | 17.0k | // If the tail is to be folded by masking, round the number of iterations N |
2505 | 17.0k | // up to a multiple of Step instead of rounding down. This is done by first |
2506 | 17.0k | // adding Step-1 and then rounding down. Note that it's ok if this addition |
2507 | 17.0k | // overflows: the vector induction variable will eventually wrap to zero given |
2508 | 17.0k | // that it starts at zero and its Step is a power of two; the loop will then |
2509 | 17.0k | // exit, with the last early-exit vector comparison also producing all-true. |
2510 | 17.0k | if (Cost->foldTailByMasking()) { |
2511 | 18 | assert(isPowerOf2_32(VF * UF) && |
2512 | 18 | "VF*UF must be a power of 2 when folding tail by masking"); |
2513 | 18 | TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); |
2514 | 18 | } |
2515 | 17.0k | |
2516 | 17.0k | // Now we need to generate the expression for the part of the loop that the |
2517 | 17.0k | // vectorized body will execute. This is equal to N - (N % Step) if scalar |
2518 | 17.0k | // iterations are not required for correctness, or N - Step, otherwise. Step |
2519 | 17.0k | // is equal to the vectorization factor (number of SIMD elements) times the |
2520 | 17.0k | // unroll factor (number of SIMD instructions). |
2521 | 17.0k | Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); |
2522 | 17.0k | |
2523 | 17.0k | // If there is a non-reversed interleaved group that may speculatively access |
2524 | 17.0k | // memory out-of-bounds, we need to ensure that there will be at least one |
2525 | 17.0k | // iteration of the scalar epilogue loop. Thus, if the step evenly divides |
2526 | 17.0k | // the trip count, we set the remainder to be equal to the step. If the step |
2527 | 17.0k | // does not evenly divide the trip count, no adjustment is necessary since |
2528 | 17.0k | // there will already be scalar iterations. Note that the minimum iterations |
2529 | 17.0k | // check ensures that N >= Step. |
2530 | 17.0k | if (VF > 1 && Cost->requiresScalarEpilogue()15.3k ) { |
2531 | 56 | auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); |
2532 | 56 | R = Builder.CreateSelect(IsZero, Step, R); |
2533 | 56 | } |
2534 | 17.0k | |
2535 | 17.0k | VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); |
2536 | 17.0k | |
2537 | 17.0k | return VectorTripCount; |
2538 | 17.0k | } |
2539 | | |
2540 | | Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, |
2541 | 14 | const DataLayout &DL) { |
2542 | 14 | // Verify that V is a vector type with same number of elements as DstVTy. |
2543 | 14 | unsigned VF = DstVTy->getNumElements(); |
2544 | 14 | VectorType *SrcVecTy = cast<VectorType>(V->getType()); |
2545 | 14 | assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); |
2546 | 14 | Type *SrcElemTy = SrcVecTy->getElementType(); |
2547 | 14 | Type *DstElemTy = DstVTy->getElementType(); |
2548 | 14 | assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && |
2549 | 14 | "Vector elements must have same size"); |
2550 | 14 | |
2551 | 14 | // Do a direct cast if element types are castable. |
2552 | 14 | if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { |
2553 | 2 | return Builder.CreateBitOrPointerCast(V, DstVTy); |
2554 | 2 | } |
2555 | 12 | // V cannot be directly casted to desired vector type. |
2556 | 12 | // May happen when V is a floating point vector but DstVTy is a vector of |
2557 | 12 | // pointers or vice-versa. Handle this using a two-step bitcast using an |
2558 | 12 | // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. |
2559 | 12 | assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && |
2560 | 12 | "Only one type should be a pointer type"); |
2561 | 12 | assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && |
2562 | 12 | "Only one type should be a floating point type"); |
2563 | 12 | Type *IntTy = |
2564 | 12 | IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); |
2565 | 12 | VectorType *VecIntTy = VectorType::get(IntTy, VF); |
2566 | 12 | Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); |
2567 | 12 | return Builder.CreateBitOrPointerCast(CastVal, DstVTy); |
2568 | 12 | } |
2569 | | |
2570 | | void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, |
2571 | 17.0k | BasicBlock *Bypass) { |
2572 | 17.0k | Value *Count = getOrCreateTripCount(L); |
2573 | 17.0k | BasicBlock *BB = L->getLoopPreheader(); |
2574 | 17.0k | IRBuilder<> Builder(BB->getTerminator()); |
2575 | 17.0k | |
2576 | 17.0k | // Generate code to check if the loop's trip count is less than VF * UF, or |
2577 | 17.0k | // equal to it in case a scalar epilogue is required; this implies that the |
2578 | 17.0k | // vector trip count is zero. This check also covers the case where adding one |
2579 | 17.0k | // to the backedge-taken count overflowed leading to an incorrect trip count |
2580 | 17.0k | // of zero. In this case we will also jump to the scalar loop. |
2581 | 17.0k | auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE106 |
2582 | 17.0k | : ICmpInst::ICMP_ULT16.9k ; |
2583 | 17.0k | |
2584 | 17.0k | // If tail is to be folded, vector loop takes care of all iterations. |
2585 | 17.0k | Value *CheckMinIters = Builder.getFalse(); |
2586 | 17.0k | if (!Cost->foldTailByMasking()) |
2587 | 17.0k | CheckMinIters = Builder.CreateICmp( |
2588 | 17.0k | P, Count, ConstantInt::get(Count->getType(), VF * UF), |
2589 | 17.0k | "min.iters.check"); |
2590 | 17.0k | |
2591 | 17.0k | BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); |
2592 | 17.0k | // Update dominator tree immediately if the generated block is a |
2593 | 17.0k | // LoopBypassBlock because SCEV expansions to generate loop bypass |
2594 | 17.0k | // checks may query it before the current function is finished. |
2595 | 17.0k | DT->addNewBlock(NewBB, BB); |
2596 | 17.0k | if (L->getParentLoop()) |
2597 | 4.84k | L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); |
2598 | 17.0k | ReplaceInstWithInst(BB->getTerminator(), |
2599 | 17.0k | BranchInst::Create(Bypass, NewBB, CheckMinIters)); |
2600 | 17.0k | LoopBypassBlocks.push_back(BB); |
2601 | 17.0k | } |
2602 | | |
2603 | 17.0k | void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { |
2604 | 17.0k | BasicBlock *BB = L->getLoopPreheader(); |
2605 | 17.0k | |
2606 | 17.0k | // Generate the code to check that the SCEV assumptions that we made. |
2607 | 17.0k | // We want the new basic block to start at the first instruction in a |
2608 | 17.0k | // sequence of instructions that form a check. |
2609 | 17.0k | SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), |
2610 | 17.0k | "scev.check"); |
2611 | 17.0k | Value *SCEVCheck = |
2612 | 17.0k | Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); |
2613 | 17.0k | |
2614 | 17.0k | if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) |
2615 | 16.7k | if (C->isZero()) |
2616 | 16.7k | return; |
2617 | 332 | |
2618 | 332 | assert(!Cost->foldTailByMasking() && |
2619 | 332 | "Cannot SCEV check stride or overflow when folding tail"); |
2620 | 332 | // Create a new block containing the stride check. |
2621 | 332 | BB->setName("vector.scevcheck"); |
2622 | 332 | auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); |
2623 | 332 | // Update dominator tree immediately if the generated block is a |
2624 | 332 | // LoopBypassBlock because SCEV expansions to generate loop bypass |
2625 | 332 | // checks may query it before the current function is finished. |
2626 | 332 | DT->addNewBlock(NewBB, BB); |
2627 | 332 | if (L->getParentLoop()) |
2628 | 159 | L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); |
2629 | 332 | ReplaceInstWithInst(BB->getTerminator(), |
2630 | 332 | BranchInst::Create(Bypass, NewBB, SCEVCheck)); |
2631 | 332 | LoopBypassBlocks.push_back(BB); |
2632 | 332 | AddedSafetyChecks = true; |
2633 | 332 | } |
2634 | | |
2635 | 17.0k | void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { |
2636 | 17.0k | // VPlan-native path does not do any analysis for runtime checks currently. |
2637 | 17.0k | if (EnableVPlanNativePath) |
2638 | 7 | return; |
2639 | 17.0k | |
2640 | 17.0k | BasicBlock *BB = L->getLoopPreheader(); |
2641 | 17.0k | |
2642 | 17.0k | // Generate the code that checks in runtime if arrays overlap. We put the |
2643 | 17.0k | // checks into a separate block to make the more common case of few elements |
2644 | 17.0k | // faster. |
2645 | 17.0k | Instruction *FirstCheckInst; |
2646 | 17.0k | Instruction *MemRuntimeCheck; |
2647 | 17.0k | std::tie(FirstCheckInst, MemRuntimeCheck) = |
2648 | 17.0k | Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); |
2649 | 17.0k | if (!MemRuntimeCheck) |
2650 | 14.5k | return; |
2651 | 2.48k | |
2652 | 2.48k | assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail"); |
2653 | 2.48k | // Create a new block containing the memory check. |
2654 | 2.48k | BB->setName("vector.memcheck"); |
2655 | 2.48k | auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); |
2656 | 2.48k | // Update dominator tree immediately if the generated block is a |
2657 | 2.48k | // LoopBypassBlock because SCEV expansions to generate loop bypass |
2658 | 2.48k | // checks may query it before the current function is finished. |
2659 | 2.48k | DT->addNewBlock(NewBB, BB); |
2660 | 2.48k | if (L->getParentLoop()) |
2661 | 887 | L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); |
2662 | 2.48k | ReplaceInstWithInst(BB->getTerminator(), |
2663 | 2.48k | BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); |
2664 | 2.48k | LoopBypassBlocks.push_back(BB); |
2665 | 2.48k | AddedSafetyChecks = true; |
2666 | 2.48k | |
2667 | 2.48k | // We currently don't use LoopVersioning for the actual loop cloning but we |
2668 | 2.48k | // still use it to add the noalias metadata. |
2669 | 2.48k | LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, |
2670 | 2.48k | PSE.getSE()); |
2671 | 2.48k | LVer->prepareNoAliasMetadata(); |
2672 | 2.48k | } |
2673 | | |
2674 | | Value *InnerLoopVectorizer::emitTransformedIndex( |
2675 | | IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, |
2676 | 17.7k | const InductionDescriptor &ID) const { |
2677 | 17.7k | |
2678 | 17.7k | SCEVExpander Exp(*SE, DL, "induction"); |
2679 | 17.7k | auto Step = ID.getStep(); |
2680 | 17.7k | auto StartValue = ID.getStartValue(); |
2681 | 17.7k | assert(Index->getType() == Step->getType() && |
2682 | 17.7k | "Index type does not match StepValue type"); |
2683 | 17.7k | |
2684 | 17.7k | // Note: the IR at this point is broken. We cannot use SE to create any new |
2685 | 17.7k | // SCEV and then expand it, hoping that SCEV's simplification will give us |
2686 | 17.7k | // a more optimal code. Unfortunately, attempt of doing so on invalid IR may |
2687 | 17.7k | // lead to various SCEV crashes. So all we can do is to use builder and rely |
2688 | 17.7k | // on InstCombine for future simplifications. Here we handle some trivial |
2689 | 17.7k | // cases only. |
2690 | 17.7k | auto CreateAdd = [&B](Value *X, Value *Y) { |
2691 | 5.07k | assert(X->getType() == Y->getType() && "Types don't match!"); |
2692 | 5.07k | if (auto *CX = dyn_cast<ConstantInt>(X)) |
2693 | 3.56k | if (CX->isZero()) |
2694 | 2.74k | return Y; |
2695 | 2.32k | if (auto *CY = dyn_cast<ConstantInt>(Y)) |
2696 | 150 | if (CY->isZero()) |
2697 | 9 | return X; |
2698 | 2.31k | return B.CreateAdd(X, Y); |
2699 | 2.31k | }; |
2700 | 17.7k | |
2701 | 17.7k | auto CreateMul = [&B](Value *X, Value *Y) { |
2702 | 16.0k | assert(X->getType() == Y->getType() && "Types don't match!"); |
2703 | 16.0k | if (auto *CX = dyn_cast<ConstantInt>(X)) |
2704 | 2.70k | if (CX->isOne()) |
2705 | 0 | return Y; |
2706 | 16.0k | if (auto *CY = dyn_cast<ConstantInt>(Y)) |
2707 | 15.6k | if (CY->isOne()) |
2708 | 8.13k | return X; |
2709 | 7.93k | return B.CreateMul(X, Y); |
2710 | 7.93k | }; |
2711 | 17.7k | |
2712 | 17.7k | switch (ID.getKind()) { |
2713 | 17.7k | case InductionDescriptor::IK_IntInduction: { |
2714 | 6.75k | assert(Index->getType() == StartValue->getType() && |
2715 | 6.75k | "Index type does not match StartValue type"); |
2716 | 6.75k | if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()6.35k ) |
2717 | 1.68k | return B.CreateSub(StartValue, Index); |
2718 | 5.07k | auto *Offset = CreateMul( |
2719 | 5.07k | Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); |
2720 | 5.07k | return CreateAdd(StartValue, Offset); |
2721 | 5.07k | } |
2722 | 10.9k | case InductionDescriptor::IK_PtrInduction: { |
2723 | 10.9k | assert(isa<SCEVConstant>(Step) && |
2724 | 10.9k | "Expected constant step for pointer induction"); |
2725 | 10.9k | return B.CreateGEP( |
2726 | 10.9k | StartValue->getType()->getPointerElementType(), StartValue, |
2727 | 10.9k | CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), |
2728 | 10.9k | &*B.GetInsertPoint()))); |
2729 | 5.07k | } |
2730 | 5.07k | case InductionDescriptor::IK_FpInduction: { |
2731 | 36 | assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); |
2732 | 36 | auto InductionBinOp = ID.getInductionBinOp(); |
2733 | 36 | assert(InductionBinOp && |
2734 | 36 | (InductionBinOp->getOpcode() == Instruction::FAdd || |
2735 | 36 | InductionBinOp->getOpcode() == Instruction::FSub) && |
2736 | 36 | "Original bin op should be defined for FP induction"); |
2737 | 36 | |
2738 | 36 | Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); |
2739 | 36 | |
2740 | 36 | // Floating point operations had to be 'fast' to enable the induction. |
2741 | 36 | FastMathFlags Flags; |
2742 | 36 | Flags.setFast(); |
2743 | 36 | |
2744 | 36 | Value *MulExp = B.CreateFMul(StepValue, Index); |
2745 | 36 | if (isa<Instruction>(MulExp)) |
2746 | 36 | // We have to check, the MulExp may be a constant. |
2747 | 36 | cast<Instruction>(MulExp)->setFastMathFlags(Flags); |
2748 | 36 | |
2749 | 36 | Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, |
2750 | 36 | "induction"); |
2751 | 36 | if (isa<Instruction>(BOp)) |
2752 | 36 | cast<Instruction>(BOp)->setFastMathFlags(Flags); |
2753 | 36 | |
2754 | 36 | return BOp; |
2755 | 5.07k | } |
2756 | 5.07k | case InductionDescriptor::IK_NoInduction: |
2757 | 0 | return nullptr; |
2758 | 0 | } |
2759 | 0 | llvm_unreachable("invalid enum"); |
2760 | 0 | } |
2761 | | |
2762 | 17.0k | BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { |
2763 | 17.0k | /* |
2764 | 17.0k | In this function we generate a new loop. The new loop will contain |
2765 | 17.0k | the vectorized instructions while the old loop will continue to run the |
2766 | 17.0k | scalar remainder. |
2767 | 17.0k | |
2768 | 17.0k | [ ] <-- loop iteration number check. |
2769 | 17.0k | / | |
2770 | 17.0k | / v |
2771 | 17.0k | | [ ] <-- vector loop bypass (may consist of multiple blocks). |
2772 | 17.0k | | / | |
2773 | 17.0k | | / v |
2774 | 17.0k | || [ ] <-- vector pre header. |
2775 | 17.0k | |/ | |
2776 | 17.0k | | v |
2777 | 17.0k | | [ ] \ |
2778 | 17.0k | | [ ]_| <-- vector loop. |
2779 | 17.0k | | | |
2780 | 17.0k | | v |
2781 | 17.0k | | -[ ] <--- middle-block. |
2782 | 17.0k | | / | |
2783 | 17.0k | | / v |
2784 | 17.0k | -|- >[ ] <--- new preheader. |
2785 | 17.0k | | | |
2786 | 17.0k | | v |
2787 | 17.0k | | [ ] \ |
2788 | 17.0k | | [ ]_| <-- old scalar loop to handle remainder. |
2789 | 17.0k | \ | |
2790 | 17.0k | \ v |
2791 | 17.0k | >[ ] <-- exit block. |
2792 | 17.0k | ... |
2793 | 17.0k | */ |
2794 | 17.0k | |
2795 | 17.0k | BasicBlock *OldBasicBlock = OrigLoop->getHeader(); |
2796 | 17.0k | BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); |
2797 | 17.0k | BasicBlock *ExitBlock = OrigLoop->getExitBlock(); |
2798 | 17.0k | MDNode *OrigLoopID = OrigLoop->getLoopID(); |
2799 | 17.0k | assert(VectorPH && "Invalid loop structure"); |
2800 | 17.0k | assert(ExitBlock && "Must have an exit block"); |
2801 | 17.0k | |
2802 | 17.0k | // Some loops have a single integer induction variable, while other loops |
2803 | 17.0k | // don't. One example is c++ iterators that often have multiple pointer |
2804 | 17.0k | // induction variables. In the code below we also support a case where we |
2805 | 17.0k | // don't have a single induction variable. |
2806 | 17.0k | // |
2807 | 17.0k | // We try to obtain an induction variable from the original loop as hard |
2808 | 17.0k | // as possible. However if we don't find one that: |
2809 | 17.0k | // - is an integer |
2810 | 17.0k | // - counts from zero, stepping by one |
2811 | 17.0k | // - is the size of the widest induction variable type |
2812 | 17.0k | // then we create a new one. |
2813 | 17.0k | OldInduction = Legal->getPrimaryInduction(); |
2814 | 17.0k | Type *IdxTy = Legal->getWidestInductionType(); |
2815 | 17.0k | |
2816 | 17.0k | // Split the single block loop into the two loop structure described above. |
2817 | 17.0k | BasicBlock *VecBody = |
2818 | 17.0k | VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); |
2819 | 17.0k | BasicBlock *MiddleBlock = |
2820 | 17.0k | VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); |
2821 | 17.0k | BasicBlock *ScalarPH = |
2822 | 17.0k | MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); |
2823 | 17.0k | |
2824 | 17.0k | // Create and register the new vector loop. |
2825 | 17.0k | Loop *Lp = LI->AllocateLoop(); |
2826 | 17.0k | Loop *ParentLoop = OrigLoop->getParentLoop(); |
2827 | 17.0k | |
2828 | 17.0k | // Insert the new loop into the loop nest and register the new basic blocks |
2829 | 17.0k | // before calling any utilities such as SCEV that require valid LoopInfo. |
2830 | 17.0k | if (ParentLoop) { |
2831 | 4.84k | ParentLoop->addChildLoop(Lp); |
2832 | 4.84k | ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); |
2833 | 4.84k | ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); |
2834 | 12.2k | } else { |
2835 | 12.2k | LI->addTopLevelLoop(Lp); |
2836 | 12.2k | } |
2837 | 17.0k | Lp->addBasicBlockToLoop(VecBody, *LI); |
2838 | 17.0k | |
2839 | 17.0k | // Find the loop boundaries. |
2840 | 17.0k | Value *Count = getOrCreateTripCount(Lp); |
2841 | 17.0k | |
2842 | 17.0k | Value *StartIdx = ConstantInt::get(IdxTy, 0); |
2843 | 17.0k | |
2844 | 17.0k | // Now, compare the new count to zero. If it is zero skip the vector loop and |
2845 | 17.0k | // jump to the scalar loop. This check also covers the case where the |
2846 | 17.0k | // backedge-taken count is uint##_max: adding one to it will overflow leading |
2847 | 17.0k | // to an incorrect trip count of zero. In this (rare) case we will also jump |
2848 | 17.0k | // to the scalar loop. |
2849 | 17.0k | emitMinimumIterationCountCheck(Lp, ScalarPH); |
2850 | 17.0k | |
2851 | 17.0k | // Generate the code to check any assumptions that we've made for SCEV |
2852 | 17.0k | // expressions. |
2853 | 17.0k | emitSCEVChecks(Lp, ScalarPH); |
2854 | 17.0k | |
2855 | 17.0k | // Generate the code that checks in runtime if arrays overlap. We put the |
2856 | 17.0k | // checks into a separate block to make the more common case of few elements |
2857 | 17.0k | // faster. |
2858 | 17.0k | emitMemRuntimeChecks(Lp, ScalarPH); |
2859 | 17.0k | |
2860 | 17.0k | // Generate the induction variable. |
2861 | 17.0k | // The loop step is equal to the vectorization factor (num of SIMD elements) |
2862 | 17.0k | // times the unroll factor (num of SIMD instructions). |
2863 | 17.0k | Value *CountRoundDown = getOrCreateVectorTripCount(Lp); |
2864 | 17.0k | Constant *Step = ConstantInt::get(IdxTy, VF * UF); |
2865 | 17.0k | Induction = |
2866 | 17.0k | createInductionVariable(Lp, StartIdx, CountRoundDown, Step, |
2867 | 17.0k | getDebugLocFromInstOrOperands(OldInduction)); |
2868 | 17.0k | |
2869 | 17.0k | // We are going to resume the execution of the scalar loop. |
2870 | 17.0k | // Go over all of the induction variables that we found and fix the |
2871 | 17.0k | // PHIs that are left in the scalar version of the loop. |
2872 | 17.0k | // The starting values of PHI nodes depend on the counter of the last |
2873 | 17.0k | // iteration in the vectorized loop. |
2874 | 17.0k | // If we come from a bypass edge then we need to start from the original |
2875 | 17.0k | // start value. |
2876 | 17.0k | |
2877 | 17.0k | // This variable saves the new starting index for the scalar loop. It is used |
2878 | 17.0k | // to test if there are any tail iterations left once the vector loop has |
2879 | 17.0k | // completed. |
2880 | 17.0k | LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); |
2881 | 20.3k | for (auto &InductionEntry : *List) { |
2882 | 20.3k | PHINode *OrigPhi = InductionEntry.first; |
2883 | 20.3k | InductionDescriptor II = InductionEntry.second; |
2884 | 20.3k | |
2885 | 20.3k | // Create phi nodes to merge from the backedge-taken check block. |
2886 | 20.3k | PHINode *BCResumeVal = PHINode::Create( |
2887 | 20.3k | OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); |
2888 | 20.3k | // Copy original phi DL over to the new one. |
2889 | 20.3k | BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); |
2890 | 20.3k | Value *&EndValue = IVEndValues[OrigPhi]; |
2891 | 20.3k | if (OrigPhi == OldInduction) { |
2892 | 13.2k | // We know what the end value is. |
2893 | 13.2k | EndValue = CountRoundDown; |
2894 | 13.2k | } else { |
2895 | 7.04k | IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); |
2896 | 7.04k | Type *StepType = II.getStep()->getType(); |
2897 | 7.04k | Instruction::CastOps CastOp = |
2898 | 7.04k | CastInst::getCastOpcode(CountRoundDown, true, StepType, true); |
2899 | 7.04k | Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); |
2900 | 7.04k | const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); |
2901 | 7.04k | EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); |
2902 | 7.04k | EndValue->setName("ind.end"); |
2903 | 7.04k | } |
2904 | 20.3k | |
2905 | 20.3k | // The new PHI merges the original incoming value, in case of a bypass, |
2906 | 20.3k | // or the value at the end of the vectorized loop. |
2907 | 20.3k | BCResumeVal->addIncoming(EndValue, MiddleBlock); |
2908 | 20.3k | |
2909 | 20.3k | // Fix the scalar body counter (PHI node). |
2910 | 20.3k | // The old induction's phi node in the scalar body needs the truncated |
2911 | 20.3k | // value. |
2912 | 20.3k | for (BasicBlock *BB : LoopBypassBlocks) |
2913 | 24.3k | BCResumeVal->addIncoming(II.getStartValue(), BB); |
2914 | 20.3k | OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); |
2915 | 20.3k | } |
2916 | 17.0k | |
2917 | 17.0k | // We need the OrigLoop (scalar loop part) latch terminator to help |
2918 | 17.0k | // produce correct debug info for the middle block BB instructions. |
2919 | 17.0k | // The legality check stage guarantees that the loop will have a single |
2920 | 17.0k | // latch. |
2921 | 17.0k | assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && |
2922 | 17.0k | "Scalar loop latch terminator isn't a branch"); |
2923 | 17.0k | BranchInst *ScalarLatchBr = |
2924 | 17.0k | cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); |
2925 | 17.0k | |
2926 | 17.0k | // Add a check in the middle block to see if we have completed |
2927 | 17.0k | // all of the iterations in the first vector loop. |
2928 | 17.0k | // If (N - N%VF) == N, then we *don't* need to run the remainder. |
2929 | 17.0k | // If tail is to be folded, we know we don't need to run the remainder. |
2930 | 17.0k | Value *CmpN = Builder.getTrue(); |
2931 | 17.0k | if (!Cost->foldTailByMasking()) { |
2932 | 17.0k | CmpN = |
2933 | 17.0k | CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, |
2934 | 17.0k | CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); |
2935 | 17.0k | |
2936 | 17.0k | // Here we use the same DebugLoc as the scalar loop latch branch instead |
2937 | 17.0k | // of the corresponding compare because they may have ended up with |
2938 | 17.0k | // different line numbers and we want to avoid awkward line stepping while |
2939 | 17.0k | // debugging. Eg. if the compare has got a line number inside the loop. |
2940 | 17.0k | cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); |
2941 | 17.0k | } |
2942 | 17.0k | |
2943 | 17.0k | BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); |
2944 | 17.0k | BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); |
2945 | 17.0k | ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); |
2946 | 17.0k | |
2947 | 17.0k | // Get ready to start creating new instructions into the vectorized body. |
2948 | 17.0k | Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); |
2949 | 17.0k | |
2950 | 17.0k | // Save the state. |
2951 | 17.0k | LoopVectorPreHeader = Lp->getLoopPreheader(); |
2952 | 17.0k | LoopScalarPreHeader = ScalarPH; |
2953 | 17.0k | LoopMiddleBlock = MiddleBlock; |
2954 | 17.0k | LoopExitBlock = ExitBlock; |
2955 | 17.0k | LoopVectorBody = VecBody; |
2956 | 17.0k | LoopScalarBody = OldBasicBlock; |
2957 | 17.0k | |
2958 | 17.0k | Optional<MDNode *> VectorizedLoopID = |
2959 | 17.0k | makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, |
2960 | 17.0k | LLVMLoopVectorizeFollowupVectorized}); |
2961 | 17.0k | if (VectorizedLoopID.hasValue()) { |
2962 | 1 | Lp->setLoopID(VectorizedLoopID.getValue()); |
2963 | 1 | |
2964 | 1 | // Do not setAlreadyVectorized if loop attributes have been defined |
2965 | 1 | // explicitly. |
2966 | 1 | return LoopVectorPreHeader; |
2967 | 1 | } |
2968 | 17.0k | |
2969 | 17.0k | // Keep all loop hints from the original loop on the vector loop (we'll |
2970 | 17.0k | // replace the vectorizer-specific hints below). |
2971 | 17.0k | if (MDNode *LID = OrigLoop->getLoopID()) |
2972 | 1.06k | Lp->setLoopID(LID); |
2973 | 17.0k | |
2974 | 17.0k | LoopVectorizeHints Hints(Lp, true, *ORE); |
2975 | 17.0k | Hints.setAlreadyVectorized(); |
2976 | 17.0k | |
2977 | 17.0k | return LoopVectorPreHeader; |
2978 | 17.0k | } |
2979 | | |
2980 | | // Fix up external users of the induction variable. At this point, we are |
2981 | | // in LCSSA form, with all external PHIs that use the IV having one input value, |
2982 | | // coming from the remainder loop. We need those PHIs to also have a correct |
2983 | | // value for the IV when arriving directly from the middle block. |
2984 | | void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, |
2985 | | const InductionDescriptor &II, |
2986 | | Value *CountRoundDown, Value *EndValue, |
2987 | 20.3k | BasicBlock *MiddleBlock) { |
2988 | 20.3k | // There are two kinds of external IV usages - those that use the value |
2989 | 20.3k | // computed in the last iteration (the PHI) and those that use the penultimate |
2990 | 20.3k | // value (the value that feeds into the phi from the loop latch). |
2991 | 20.3k | // We allow both, but they, obviously, have different values. |
2992 | 20.3k | |
2993 | 20.3k | assert(OrigLoop->getExitBlock() && "Expected a single exit block"); |
2994 | 20.3k | |
2995 | 20.3k | DenseMap<Value *, Value *> MissingVals; |
2996 | 20.3k | |
2997 | 20.3k | // An external user of the last iteration's value should see the value that |
2998 | 20.3k | // the remainder loop uses to initialize its own IV. |
2999 | 20.3k | Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); |
3000 | 45.4k | for (User *U : PostInc->users()) { |
3001 | 45.4k | Instruction *UI = cast<Instruction>(U); |
3002 | 45.4k | if (!OrigLoop->contains(UI)) { |
3003 | 1.16k | assert(isa<PHINode>(UI) && "Expected LCSSA form"); |
3004 | 1.16k | MissingVals[UI] = EndValue; |
3005 | 1.16k | } |
3006 | 45.4k | } |
3007 | 20.3k | |
3008 | 20.3k | // An external user of the penultimate value need to see EndValue - Step. |
3009 | 20.3k | // The simplest way to get this is to recompute it from the constituent SCEVs, |
3010 | 20.3k | // that is Start + (Step * (CRD - 1)). |
3011 | 46.5k | for (User *U : OrigPhi->users()) { |
3012 | 46.5k | auto *UI = cast<Instruction>(U); |
3013 | 46.5k | if (!OrigLoop->contains(UI)) { |
3014 | 37 | const DataLayout &DL = |
3015 | 37 | OrigLoop->getHeader()->getModule()->getDataLayout(); |
3016 | 37 | assert(isa<PHINode>(UI) && "Expected LCSSA form"); |
3017 | 37 | |
3018 | 37 | IRBuilder<> B(MiddleBlock->getTerminator()); |
3019 | 37 | Value *CountMinusOne = B.CreateSub( |
3020 | 37 | CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); |
3021 | 37 | Value *CMO = |
3022 | 37 | !II.getStep()->getType()->isIntegerTy() |
3023 | 37 | ? B.CreateCast(Instruction::SIToFP, CountMinusOne, |
3024 | 1 | II.getStep()->getType()) |
3025 | 37 | : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType())36 ; |
3026 | 37 | CMO->setName("cast.cmo"); |
3027 | 37 | Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); |
3028 | 37 | Escape->setName("ind.escape"); |
3029 | 37 | MissingVals[UI] = Escape; |
3030 | 37 | } |
3031 | 46.5k | } |
3032 | 20.3k | |
3033 | 20.3k | for (auto &I : MissingVals) { |
3034 | 1.19k | PHINode *PHI = cast<PHINode>(I.first); |
3035 | 1.19k | // One corner case we have to handle is two IVs "chasing" each-other, |
3036 | 1.19k | // that is %IV2 = phi [...], [ %IV1, %latch ] |
3037 | 1.19k | // In this case, if IV1 has an external use, we need to avoid adding both |
3038 | 1.19k | // "last value of IV1" and "penultimate value of IV2". So, verify that we |
3039 | 1.19k | // don't already have an incoming value for the middle block. |
3040 | 1.19k | if (PHI->getBasicBlockIndex(MiddleBlock) == -1) |
3041 | 1.19k | PHI->addIncoming(I.second, MiddleBlock); |
3042 | 1.19k | } |
3043 | 20.3k | } |
3044 | | |
3045 | | namespace { |
3046 | | |
3047 | | struct CSEDenseMapInfo { |
3048 | 509k | static bool canHandle(const Instruction *I) { |
3049 | 509k | return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I)501k || |
3050 | 509k | isa<ShuffleVectorInst>(I)499k || isa<GetElementPtrInst>(I)489k ; |
3051 | 509k | } |
3052 | | |
3053 | 3.40M | static inline Instruction *getEmptyKey() { |
3054 | 3.40M | return DenseMapInfo<Instruction *>::getEmptyKey(); |
3055 | 3.40M | } |
3056 | | |
3057 | 907k | static inline Instruction *getTombstoneKey() { |
3058 | 907k | return DenseMapInfo<Instruction *>::getTombstoneKey(); |
3059 | 907k | } |
3060 | | |
3061 | 275k | static unsigned getHashValue(const Instruction *I) { |
3062 | 275k | assert(canHandle(I) && "Unknown instruction!"); |
3063 | 275k | return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), |
3064 | 275k | I->value_op_end())); |
3065 | 275k | } |
3066 | | |
3067 | 2.13M | static bool isEqual(const Instruction *LHS, const Instruction *RHS) { |
3068 | 2.13M | if (LHS == getEmptyKey() || RHS == getEmptyKey()799k || |
3069 | 2.13M | LHS == getTombstoneKey()291k || RHS == getTombstoneKey()291k ) |
3070 | 2.07M | return LHS == RHS; |
3071 | 57.2k | return LHS->isIdenticalTo(RHS); |
3072 | 57.2k | } |
3073 | | }; |
3074 | | |
3075 | | } // end anonymous namespace |
3076 | | |
3077 | | ///Perform cse of induction variable instructions. |
3078 | 17.0k | static void cse(BasicBlock *BB) { |
3079 | 17.0k | // Perform simple cse. |
3080 | 17.0k | SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; |
3081 | 526k | for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { |
3082 | 509k | Instruction *In = &*I++; |
3083 | 509k | |
3084 | 509k | if (!CSEDenseMapInfo::canHandle(In)) |
3085 | 395k | continue; |
3086 | 113k | |
3087 | 113k | // Check if we can replace this instruction with any of the |
3088 | 113k | // visited instructions. |
3089 | 113k | if (Instruction *V = CSEMap.lookup(In)) { |
3090 | 1.49k | In->replaceAllUsesWith(V); |
3091 | 1.49k | In->eraseFromParent(); |
3092 | 1.49k | continue; |
3093 | 1.49k | } |
3094 | 112k | |
3095 | 112k | CSEMap[In] = In; |
3096 | 112k | } |
3097 | 17.0k | } |
3098 | | |
3099 | | unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, |
3100 | | unsigned VF, |
3101 | 1.93k | bool &NeedToScalarize) { |
3102 | 1.93k | Function *F = CI->getCalledFunction(); |
3103 | 1.93k | StringRef FnName = CI->getCalledFunction()->getName(); |
3104 | 1.93k | Type *ScalarRetTy = CI->getType(); |
3105 | 1.93k | SmallVector<Type *, 4> Tys, ScalarTys; |
3106 | 1.93k | for (auto &ArgOp : CI->arg_operands()) |
3107 | 2.17k | ScalarTys.push_back(ArgOp->getType()); |
3108 | 1.93k | |
3109 | 1.93k | // Estimate cost of scalarized vector call. The source operands are assumed |
3110 | 1.93k | // to be vectors, so we need to extract individual elements from there, |
3111 | 1.93k | // execute VF scalar calls, and then gather the result into the vector return |
3112 | 1.93k | // value. |
3113 | 1.93k | unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); |
3114 | 1.93k | if (VF == 1) |
3115 | 363 | return ScalarCallCost; |
3116 | 1.57k | |
3117 | 1.57k | // Compute corresponding vector type for return value and arguments. |
3118 | 1.57k | Type *RetTy = ToVectorTy(ScalarRetTy, VF); |
3119 | 1.57k | for (Type *ScalarTy : ScalarTys) |
3120 | 1.77k | Tys.push_back(ToVectorTy(ScalarTy, VF)); |
3121 | 1.57k | |
3122 | 1.57k | // Compute costs of unpacking argument values for the scalar calls and |
3123 | 1.57k | // packing the return values to a vector. |
3124 | 1.57k | unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); |
3125 | 1.57k | |
3126 | 1.57k | unsigned Cost = ScalarCallCost * VF + ScalarizationCost; |
3127 | 1.57k | |
3128 | 1.57k | // If we can't emit a vector call for this function, then the currently found |
3129 | 1.57k | // cost is the cost we need to return. |
3130 | 1.57k | NeedToScalarize = true; |
3131 | 1.57k | if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()378 ) |
3132 | 1.20k | return Cost; |
3133 | 376 | |
3134 | 376 | // If the corresponding vector cost is cheaper, return its cost. |
3135 | 376 | unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); |
3136 | 376 | if (VectorCallCost < Cost) { |
3137 | 376 | NeedToScalarize = false; |
3138 | 376 | return VectorCallCost; |
3139 | 376 | } |
3140 | 0 | return Cost; |
3141 | 0 | } |
3142 | | |
3143 | | unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, |
3144 | 1.57k | unsigned VF) { |
3145 | 1.57k | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
3146 | 1.57k | assert(ID && "Expected intrinsic call!"); |
3147 | 1.57k | |
3148 | 1.57k | FastMathFlags FMF; |
3149 | 1.57k | if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) |
3150 | 1.30k | FMF = FPMO->getFastMathFlags(); |
3151 | 1.57k | |
3152 | 1.57k | SmallVector<Value *, 4> Operands(CI->arg_operands()); |
3153 | 1.57k | return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); |
3154 | 1.57k | } |
3155 | | |
3156 | 334 | static Type *smallestIntegerVectorType(Type *T1, Type *T2) { |
3157 | 334 | auto *I1 = cast<IntegerType>(T1->getVectorElementType()); |
3158 | 334 | auto *I2 = cast<IntegerType>(T2->getVectorElementType()); |
3159 | 334 | return I1->getBitWidth() < I2->getBitWidth() ? T10 : T2; |
3160 | 334 | } |
3161 | 275 | static Type *largestIntegerVectorType(Type *T1, Type *T2) { |
3162 | 275 | auto *I1 = cast<IntegerType>(T1->getVectorElementType()); |
3163 | 275 | auto *I2 = cast<IntegerType>(T2->getVectorElementType()); |
3164 | 275 | return I1->getBitWidth() > I2->getBitWidth() ? T13 : T2272 ; |
3165 | 275 | } |
3166 | | |
3167 | 15.3k | void InnerLoopVectorizer::truncateToMinimalBitwidths() { |
3168 | 15.3k | // For every instruction `I` in MinBWs, truncate the operands, create a |
3169 | 15.3k | // truncated version of `I` and reextend its result. InstCombine runs |
3170 | 15.3k | // later and will remove any ext/trunc pairs. |
3171 | 15.3k | SmallPtrSet<Value *, 4> Erased; |
3172 | 15.3k | for (const auto &KV : Cost->getMinimalBitwidths()) { |
3173 | 186 | // If the value wasn't vectorized, we must maintain the original scalar |
3174 | 186 | // type. The absence of the value from VectorLoopValueMap indicates that it |
3175 | 186 | // wasn't vectorized. |
3176 | 186 | if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) |
3177 | 2 | continue; |
3178 | 420 | for (unsigned Part = 0; 184 Part < UF; ++Part236 ) { |
3179 | 236 | Value *I = getOrCreateVectorValue(KV.first, Part); |
3180 | 236 | if (Erased.find(I) != Erased.end() || I->use_empty() || |
3181 | 236 | !isa<Instruction>(I)232 ) |
3182 | 6 | continue; |
3183 | 230 | Type *OriginalTy = I->getType(); |
3184 | 230 | Type *ScalarTruncatedTy = |
3185 | 230 | IntegerType::get(OriginalTy->getContext(), KV.second); |
3186 | 230 | Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, |
3187 | 230 | OriginalTy->getVectorNumElements()); |
3188 | 230 | if (TruncatedTy == OriginalTy) |
3189 | 17 | continue; |
3190 | 213 | |
3191 | 213 | IRBuilder<> B(cast<Instruction>(I)); |
3192 | 240 | auto ShrinkOperand = [&](Value *V) -> Value * { |
3193 | 240 | if (auto *ZI = dyn_cast<ZExtInst>(V)) |
3194 | 63 | if (ZI->getSrcTy() == TruncatedTy) |
3195 | 5 | return ZI->getOperand(0); |
3196 | 235 | return B.CreateZExtOrTrunc(V, TruncatedTy); |
3197 | 235 | }; |
3198 | 213 | |
3199 | 213 | // The actual instruction modification depends on the instruction type, |
3200 | 213 | // unfortunately. |
3201 | 213 | Value *NewI = nullptr; |
3202 | 213 | if (auto *BO = dyn_cast<BinaryOperator>(I)) { |
3203 | 112 | NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), |
3204 | 112 | ShrinkOperand(BO->getOperand(1))); |
3205 | 112 | |
3206 | 112 | // Any wrapping introduced by shrinking this operation shouldn't be |
3207 | 112 | // considered undefined behavior. So, we can't unconditionally copy |
3208 | 112 | // arithmetic wrapping flags to NewI. |
3209 | 112 | cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); |
3210 | 112 | } else if (auto *101 CI101 = dyn_cast<ICmpInst>(I)) { |
3211 | 0 | NewI = |
3212 | 0 | B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), |
3213 | 0 | ShrinkOperand(CI->getOperand(1))); |
3214 | 101 | } else if (auto *SI = dyn_cast<SelectInst>(I)) { |
3215 | 0 | NewI = B.CreateSelect(SI->getCondition(), |
3216 | 0 | ShrinkOperand(SI->getTrueValue()), |
3217 | 0 | ShrinkOperand(SI->getFalseValue())); |
3218 | 101 | } else if (auto *CI = dyn_cast<CastInst>(I)) { |
3219 | 75 | switch (CI->getOpcode()) { |
3220 | 75 | default: |
3221 | 0 | llvm_unreachable("Unhandled cast!"); |
3222 | 75 | case Instruction::Trunc: |
3223 | 16 | NewI = ShrinkOperand(CI->getOperand(0)); |
3224 | 16 | break; |
3225 | 75 | case Instruction::SExt: |
3226 | 1 | NewI = B.CreateSExtOrTrunc( |
3227 | 1 | CI->getOperand(0), |
3228 | 1 | smallestIntegerVectorType(OriginalTy, TruncatedTy)); |
3229 | 1 | break; |
3230 | 75 | case Instruction::ZExt: |
3231 | 58 | NewI = B.CreateZExtOrTrunc( |
3232 | 58 | CI->getOperand(0), |
3233 | 58 | smallestIntegerVectorType(OriginalTy, TruncatedTy)); |
3234 | 58 | break; |
3235 | 26 | } |
3236 | 26 | } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { |
3237 | 24 | auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); |
3238 | 24 | auto *O0 = B.CreateZExtOrTrunc( |
3239 | 24 | SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); |
3240 | 24 | auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); |
3241 | 24 | auto *O1 = B.CreateZExtOrTrunc( |
3242 | 24 | SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); |
3243 | 24 | |
3244 | 24 | NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); |
3245 | 24 | } else if (2 isa<LoadInst>(I)2 || isa<PHINode>(I)2 ) { |
3246 | 1 | // Don't do anything with the operands, just extend the result. |
3247 | 1 | continue; |
3248 | 1 | } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { |
3249 | 1 | auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); |
3250 | 1 | auto *O0 = B.CreateZExtOrTrunc( |
3251 | 1 | IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); |
3252 | 1 | auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); |
3253 | 1 | NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); |
3254 | 1 | } else if (auto *0 EE0 = dyn_cast<ExtractElementInst>(I)) { |
3255 | 0 | auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); |
3256 | 0 | auto *O0 = B.CreateZExtOrTrunc( |
3257 | 0 | EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); |
3258 | 0 | NewI = B.CreateExtractElement(O0, EE->getOperand(2)); |
3259 | 0 | } else { |
3260 | 0 | // If we don't know what to do, be conservative and don't do anything. |
3261 | 0 | continue; |
3262 | 0 | } |
3263 | 212 | |
3264 | 212 | // Lastly, extend the result. |
3265 | 212 | NewI->takeName(cast<Instruction>(I)); |
3266 | 212 | Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); |
3267 | 212 | I->replaceAllUsesWith(Res); |
3268 | 212 | cast<Instruction>(I)->eraseFromParent(); |
3269 | 212 | Erased.insert(I); |
3270 | 212 | VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); |
3271 | 212 | } |
3272 | 184 | } |
3273 | 15.3k | |
3274 | 15.3k | // We'll have created a bunch of ZExts that are now parentless. Clean up. |
3275 | 15.3k | for (const auto &KV : Cost->getMinimalBitwidths()) { |
3276 | 186 | // If the value wasn't vectorized, we must maintain the original scalar |
3277 | 186 | // type. The absence of the value from VectorLoopValueMap indicates that it |
3278 | 186 | // wasn't vectorized. |
3279 | 186 | if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) |
3280 | 2 | continue; |
3281 | 420 | for (unsigned Part = 0; 184 Part < UF; ++Part236 ) { |
3282 | 236 | Value *I = getOrCreateVectorValue(KV.first, Part); |
3283 | 236 | ZExtInst *Inst = dyn_cast<ZExtInst>(I); |
3284 | 236 | if (Inst && Inst->use_empty()198 ) { |
3285 | 3 | Value *NewI = Inst->getOperand(0); |
3286 | 3 | Inst->eraseFromParent(); |
3287 | 3 | VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); |
3288 | 3 | } |
3289 | 236 | } |
3290 | 184 | } |
3291 | 15.3k | } |
3292 | | |
3293 | 17.0k | void InnerLoopVectorizer::fixVectorizedLoop() { |
3294 | 17.0k | // Insert truncates and extends for any truncated instructions as hints to |
3295 | 17.0k | // InstCombine. |
3296 | 17.0k | if (VF > 1) |
3297 | 15.3k | truncateToMinimalBitwidths(); |
3298 | 17.0k | |
3299 | 17.0k | // Fix widened non-induction PHIs by setting up the PHI operands. |
3300 | 17.0k | if (OrigPHIsToFix.size()) { |
3301 | 7 | assert(EnableVPlanNativePath && |
3302 | 7 | "Unexpected non-induction PHIs for fixup in non VPlan-native path"); |
3303 | 7 | fixNonInductionPHIs(); |
3304 | 7 | } |
3305 | 17.0k | |
3306 | 17.0k | // At this point every instruction in the original loop is widened to a |
3307 | 17.0k | // vector form. Now we need to fix the recurrences in the loop. These PHI |
3308 | 17.0k | // nodes are currently empty because we did not want to introduce cycles. |
3309 | 17.0k | // This is the second stage of vectorizing recurrences. |
3310 | 17.0k | fixCrossIterationPHIs(); |
3311 | 17.0k | |
3312 | 17.0k | // Update the dominator tree. |
3313 | 17.0k | // |
3314 | 17.0k | // FIXME: After creating the structure of the new loop, the dominator tree is |
3315 | 17.0k | // no longer up-to-date, and it remains that way until we update it |
3316 | 17.0k | // here. An out-of-date dominator tree is problematic for SCEV, |
3317 | 17.0k | // because SCEVExpander uses it to guide code generation. The |
3318 | 17.0k | // vectorizer use SCEVExpanders in several places. Instead, we should |
3319 | 17.0k | // keep the dominator tree up-to-date as we go. |
3320 | 17.0k | updateAnalysis(); |
3321 | 17.0k | |
3322 | 17.0k | // Fix-up external users of the induction variables. |
3323 | 17.0k | for (auto &Entry : *Legal->getInductionVars()) |
3324 | 20.3k | fixupIVUsers(Entry.first, Entry.second, |
3325 | 20.3k | getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), |
3326 | 20.3k | IVEndValues[Entry.first], LoopMiddleBlock); |
3327 | 17.0k | |
3328 | 17.0k | fixLCSSAPHIs(); |
3329 | 17.0k | for (Instruction *PI : PredicatedInstructions) |
3330 | 948 | sinkScalarOperands(&*PI); |
3331 | 17.0k | |
3332 | 17.0k | // Remove redundant induction instructions. |
3333 | 17.0k | cse(LoopVectorBody); |
3334 | 17.0k | } |
3335 | | |
3336 | 17.0k | void InnerLoopVectorizer::fixCrossIterationPHIs() { |
3337 | 17.0k | // In order to support recurrences we need to be able to vectorize Phi nodes. |
3338 | 17.0k | // Phi nodes have cycles, so we need to vectorize them in two stages. This is |
3339 | 17.0k | // stage #2: We now need to fix the recurrences by adding incoming edges to |
3340 | 17.0k | // the currently empty PHI nodes. At this point every instruction in the |
3341 | 17.0k | // original loop is widened to a vector form so we can use them to construct |
3342 | 17.0k | // the incoming edges. |
3343 | 21.7k | for (PHINode &Phi : OrigLoop->getHeader()->phis()) { |
3344 | 21.7k | // Handle first-order recurrences and reductions that need to be fixed. |
3345 | 21.7k | if (Legal->isFirstOrderRecurrence(&Phi)) |
3346 | 90 | fixFirstOrderRecurrence(&Phi); |
3347 | 21.6k | else if (Legal->isReductionVariable(&Phi)) |
3348 | 1.36k | fixReduction(&Phi); |
3349 | 21.7k | } |
3350 | 17.0k | } |
3351 | | |
3352 | 90 | void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { |
3353 | 90 | // This is the second phase of vectorizing first-order recurrences. An |
3354 | 90 | // overview of the transformation is described below. Suppose we have the |
3355 | 90 | // following loop. |
3356 | 90 | // |
3357 | 90 | // for (int i = 0; i < n; ++i) |
3358 | 90 | // b[i] = a[i] - a[i - 1]; |
3359 | 90 | // |
3360 | 90 | // There is a first-order recurrence on "a". For this loop, the shorthand |
3361 | 90 | // scalar IR looks like: |
3362 | 90 | // |
3363 | 90 | // scalar.ph: |
3364 | 90 | // s_init = a[-1] |
3365 | 90 | // br scalar.body |
3366 | 90 | // |
3367 | 90 | // scalar.body: |
3368 | 90 | // i = phi [0, scalar.ph], [i+1, scalar.body] |
3369 | 90 | // s1 = phi [s_init, scalar.ph], [s2, scalar.body] |
3370 | 90 | // s2 = a[i] |
3371 | 90 | // b[i] = s2 - s1 |
3372 | 90 | // br cond, scalar.body, ... |
3373 | 90 | // |
3374 | 90 | // In this example, s1 is a recurrence because it's value depends on the |
3375 | 90 | // previous iteration. In the first phase of vectorization, we created a |
3376 | 90 | // temporary value for s1. We now complete the vectorization and produce the |
3377 | 90 | // shorthand vector IR shown below (for VF = 4, UF = 1). |
3378 | 90 | // |
3379 | 90 | // vector.ph: |
3380 | 90 | // v_init = vector(..., ..., ..., a[-1]) |
3381 | 90 | // br vector.body |
3382 | 90 | // |
3383 | 90 | // vector.body |
3384 | 90 | // i = phi [0, vector.ph], [i+4, vector.body] |
3385 | 90 | // v1 = phi [v_init, vector.ph], [v2, vector.body] |
3386 | 90 | // v2 = a[i, i+1, i+2, i+3]; |
3387 | 90 | // v3 = vector(v1(3), v2(0, 1, 2)) |
3388 | 90 | // b[i, i+1, i+2, i+3] = v2 - v3 |
3389 | 90 | // br cond, vector.body, middle.block |
3390 | 90 | // |
3391 | 90 | // middle.block: |
3392 | 90 | // x = v2(3) |
3393 | 90 | // br scalar.ph |
3394 | 90 | // |
3395 | 90 | // scalar.ph: |
3396 | 90 | // s_init = phi [x, middle.block], [a[-1], otherwise] |
3397 | 90 | // br scalar.body |
3398 | 90 | // |
3399 | 90 | // After execution completes the vector loop, we extract the next value of |
3400 | 90 | // the recurrence (x) to use as the initial value in the scalar loop. |
3401 | 90 | |
3402 | 90 | // Get the original loop preheader and single loop latch. |
3403 | 90 | auto *Preheader = OrigLoop->getLoopPreheader(); |
3404 | 90 | auto *Latch = OrigLoop->getLoopLatch(); |
3405 | 90 | |
3406 | 90 | // Get the initial and previous values of the scalar recurrence. |
3407 | 90 | auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); |
3408 | 90 | auto *Previous = Phi->getIncomingValueForBlock(Latch); |
3409 | 90 | |
3410 | 90 | // Create a vector from the initial value. |
3411 | 90 | auto *VectorInit = ScalarInit; |
3412 | 90 | if (VF > 1) { |
3413 | 79 | Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
3414 | 79 | VectorInit = Builder.CreateInsertElement( |
3415 | 79 | UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, |
3416 | 79 | Builder.getInt32(VF - 1), "vector.recur.init"); |
3417 | 79 | } |
3418 | 90 | |
3419 | 90 | // We constructed a temporary phi node in the first phase of vectorization. |
3420 | 90 | // This phi node will eventually be deleted. |
3421 | 90 | Builder.SetInsertPoint( |
3422 | 90 | cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); |
3423 | 90 | |
3424 | 90 | // Create a phi node for the new recurrence. The current value will either be |
3425 | 90 | // the initial value inserted into a vector or loop-varying vector value. |
3426 | 90 | auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); |
3427 | 90 | VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); |
3428 | 90 | |
3429 | 90 | // Get the vectorized previous value of the last part UF - 1. It appears last |
3430 | 90 | // among all unrolled iterations, due to the order of their construction. |
3431 | 90 | Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); |
3432 | 90 | |
3433 | 90 | // Set the insertion point after the previous value if it is an instruction. |
3434 | 90 | // Note that the previous value may have been constant-folded so it is not |
3435 | 90 | // guaranteed to be an instruction in the vector loop. Also, if the previous |
3436 | 90 | // value is a phi node, we should insert after all the phi nodes to avoid |
3437 | 90 | // breaking basic block verification. |
3438 | 90 | if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) || |
3439 | 90 | isa<PHINode>(PreviousLastPart)85 ) |
3440 | 7 | Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); |
3441 | 83 | else |
3442 | 83 | Builder.SetInsertPoint( |
3443 | 83 | &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart))); |
3444 | 90 | |
3445 | 90 | // We will construct a vector for the recurrence by combining the values for |
3446 | 90 | // the current and previous iterations. This is the required shuffle mask. |
3447 | 90 | SmallVector<Constant *, 8> ShuffleMask(VF); |
3448 | 90 | ShuffleMask[0] = Builder.getInt32(VF - 1); |
3449 | 323 | for (unsigned I = 1; I < VF; ++I233 ) |
3450 | 233 | ShuffleMask[I] = Builder.getInt32(I + VF - 1); |
3451 | 90 | |
3452 | 90 | // The vector from which to take the initial value for the current iteration |
3453 | 90 | // (actual or unrolled). Initially, this is the vector phi node. |
3454 | 90 | Value *Incoming = VecPhi; |
3455 | 90 | |
3456 | 90 | // Shuffle the current and previous vector and update the vector parts. |
3457 | 221 | for (unsigned Part = 0; Part < UF; ++Part131 ) { |
3458 | 131 | Value *PreviousPart = getOrCreateVectorValue(Previous, Part); |
3459 | 131 | Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); |
3460 | 131 | auto *Shuffle = |
3461 | 131 | VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, |
3462 | 109 | ConstantVector::get(ShuffleMask)) |
3463 | 131 | : Incoming22 ; |
3464 | 131 | PhiPart->replaceAllUsesWith(Shuffle); |
3465 | 131 | cast<Instruction>(PhiPart)->eraseFromParent(); |
3466 | 131 | VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); |
3467 | 131 | Incoming = PreviousPart; |
3468 | 131 | } |
3469 | 90 | |
3470 | 90 | // Fix the latch value of the new recurrence in the vector loop. |
3471 | 90 | VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); |
3472 | 90 | |
3473 | 90 | // Extract the last vector element in the middle block. This will be the |
3474 | 90 | // initial value for the recurrence when jumping to the scalar loop. |
3475 | 90 | auto *ExtractForScalar = Incoming; |
3476 | 90 | if (VF > 1) { |
3477 | 79 | Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); |
3478 | 79 | ExtractForScalar = Builder.CreateExtractElement( |
3479 | 79 | ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); |
3480 | 79 | } |
3481 | 90 | // Extract the second last element in the middle block if the |
3482 | 90 | // Phi is used outside the loop. We need to extract the phi itself |
3483 | 90 | // and not the last element (the phi update in the current iteration). This |
3484 | 90 | // will be the value when jumping to the exit block from the LoopMiddleBlock, |
3485 | 90 | // when the scalar loop is not run at all. |
3486 | 90 | Value *ExtractForPhiUsedOutsideLoop = nullptr; |
3487 | 90 | if (VF > 1) |
3488 | 79 | ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( |
3489 | 79 | Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); |
3490 | 11 | // When loop is unrolled without vectorizing, initialize |
3491 | 11 | // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of |
3492 | 11 | // `Incoming`. This is analogous to the vectorized case above: extracting the |
3493 | 11 | // second last element when VF > 1. |
3494 | 11 | else if (UF > 1) |
3495 | 11 | ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); |
3496 | 90 | |
3497 | 90 | // Fix the initial value of the original recurrence in the scalar loop. |
3498 | 90 | Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); |
3499 | 90 | auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); |
3500 | 205 | for (auto *BB : predecessors(LoopScalarPreHeader)) { |
3501 | 205 | auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar90 : ScalarInit115 ; |
3502 | 205 | Start->addIncoming(Incoming, BB); |
3503 | 205 | } |
3504 | 90 | |
3505 | 90 | Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); |
3506 | 90 | Phi->setName("scalar.recur"); |
3507 | 90 | |
3508 | 90 | // Finally, fix users of the recurrence outside the loop. The users will need |
3509 | 90 | // either the last value of the scalar recurrence or the last value of the |
3510 | 90 | // vector recurrence we extracted in the middle block. Since the loop is in |
3511 | 90 | // LCSSA form, we just need to find all the phi nodes for the original scalar |
3512 | 90 | // recurrence in the exit block, and then add an edge for the middle block. |
3513 | 90 | for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { |
3514 | 49 | if (LCSSAPhi.getIncomingValue(0) == Phi) { |
3515 | 9 | LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); |
3516 | 9 | } |
3517 | 49 | } |
3518 | 90 | } |
3519 | | |
3520 | 1.36k | void InnerLoopVectorizer::fixReduction(PHINode *Phi) { |
3521 | 1.36k | Constant *Zero = Builder.getInt32(0); |
3522 | 1.36k | |
3523 | 1.36k | // Get it's reduction variable descriptor. |
3524 | 1.36k | assert(Legal->isReductionVariable(Phi) && |
3525 | 1.36k | "Unable to find the reduction variable"); |
3526 | 1.36k | RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; |
3527 | 1.36k | |
3528 | 1.36k | RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); |
3529 | 1.36k | TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); |
3530 | 1.36k | Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); |
3531 | 1.36k | RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = |
3532 | 1.36k | RdxDesc.getMinMaxRecurrenceKind(); |
3533 | 1.36k | setDebugLocFromInst(Builder, ReductionStartValue); |
3534 | 1.36k | |
3535 | 1.36k | // We need to generate a reduction vector from the incoming scalar. |
3536 | 1.36k | // To do so, we need to generate the 'identity' vector and override |
3537 | 1.36k | // one of the elements with the incoming scalar reduction. We need |
3538 | 1.36k | // to do it in the vector-loop preheader. |
3539 | 1.36k | Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
3540 | 1.36k | |
3541 | 1.36k | // This is the vector-clone of the value that leaves the loop. |
3542 | 1.36k | Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); |
3543 | 1.36k | |
3544 | 1.36k | // Find the reduction identity variable. Zero for addition, or, xor, |
3545 | 1.36k | // one for multiplication, -1 for And. |
3546 | 1.36k | Value *Identity; |
3547 | 1.36k | Value *VectorStart; |
3548 | 1.36k | if (RK == RecurrenceDescriptor::RK_IntegerMinMax || |
3549 | 1.36k | RK == RecurrenceDescriptor::RK_FloatMinMax1.27k ) { |
3550 | 107 | // MinMax reduction have the start value as their identify. |
3551 | 107 | if (VF == 1) { |
3552 | 44 | VectorStart = Identity = ReductionStartValue; |
3553 | 63 | } else { |
3554 | 63 | VectorStart = Identity = |
3555 | 63 | Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); |
3556 | 63 | } |
3557 | 1.25k | } else { |
3558 | 1.25k | // Handle other reduction kinds: |
3559 | 1.25k | Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( |
3560 | 1.25k | RK, VecTy->getScalarType()); |
3561 | 1.25k | if (VF == 1) { |
3562 | 261 | Identity = Iden; |
3563 | 261 | // This vector is the Identity vector where the first element is the |
3564 | 261 | // incoming scalar reduction. |
3565 | 261 | VectorStart = ReductionStartValue; |
3566 | 994 | } else { |
3567 | 994 | Identity = ConstantVector::getSplat(VF, Iden); |
3568 | 994 | |
3569 | 994 | // This vector is the Identity vector where the first element is the |
3570 | 994 | // incoming scalar reduction. |
3571 | 994 | VectorStart = |
3572 | 994 | Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); |
3573 | 994 | } |
3574 | 1.25k | } |
3575 | 1.36k | |
3576 | 1.36k | // Fix the vector-loop phi. |
3577 | 1.36k | |
3578 | 1.36k | // Reductions do not have to start at zero. They can start with |
3579 | 1.36k | // any loop invariant values. |
3580 | 1.36k | BasicBlock *Latch = OrigLoop->getLoopLatch(); |
3581 | 1.36k | Value *LoopVal = Phi->getIncomingValueForBlock(Latch); |
3582 | 3.98k | for (unsigned Part = 0; Part < UF; ++Part2.62k ) { |
3583 | 2.62k | Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); |
3584 | 2.62k | Value *Val = getOrCreateVectorValue(LoopVal, Part); |
3585 | 2.62k | // Make sure to add the reduction stat value only to the |
3586 | 2.62k | // first unroll part. |
3587 | 2.62k | Value *StartVal = (Part == 0) ? VectorStart1.36k : Identity1.26k ; |
3588 | 2.62k | cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); |
3589 | 2.62k | cast<PHINode>(VecRdxPhi) |
3590 | 2.62k | ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); |
3591 | 2.62k | } |
3592 | 1.36k | |
3593 | 1.36k | // Before each round, move the insertion point right between |
3594 | 1.36k | // the PHIs and the values we are going to write. |
3595 | 1.36k | // This allows us to write both PHINodes and the extractelement |
3596 | 1.36k | // instructions. |
3597 | 1.36k | Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); |
3598 | 1.36k | |
3599 | 1.36k | setDebugLocFromInst(Builder, LoopExitInst); |
3600 | 1.36k | |
3601 | 1.36k | // If the vector reduction can be performed in a smaller type, we truncate |
3602 | 1.36k | // then extend the loop exit value to enable InstCombine to evaluate the |
3603 | 1.36k | // entire expression in the smaller type. |
3604 | 1.36k | if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()1.05k ) { |
3605 | 6 | Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); |
3606 | 6 | Builder.SetInsertPoint( |
3607 | 6 | LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); |
3608 | 6 | VectorParts RdxParts(UF); |
3609 | 13 | for (unsigned Part = 0; Part < UF; ++Part7 ) { |
3610 | 7 | RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); |
3611 | 7 | Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); |
3612 | 7 | Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)1 |
3613 | 7 | : Builder.CreateZExt(Trunc, VecTy)6 ; |
3614 | 7 | for (Value::user_iterator UI = RdxParts[Part]->user_begin(); |
3615 | 21 | UI != RdxParts[Part]->user_end();) |
3616 | 14 | if (*UI != Trunc) { |
3617 | 7 | (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); |
3618 | 7 | RdxParts[Part] = Extnd; |
3619 | 7 | } else { |
3620 | 7 | ++UI; |
3621 | 7 | } |
3622 | 7 | } |
3623 | 6 | Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); |
3624 | 13 | for (unsigned Part = 0; Part < UF; ++Part7 ) { |
3625 | 7 | RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); |
3626 | 7 | VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); |
3627 | 7 | } |
3628 | 6 | } |
3629 | 1.36k | |
3630 | 1.36k | // Reduce all of the unrolled parts into a single vector. |
3631 | 1.36k | Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); |
3632 | 1.36k | unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); |
3633 | 1.36k | |
3634 | 1.36k | // The middle block terminator has already been assigned a DebugLoc here (the |
3635 | 1.36k | // OrigLoop's single latch terminator). We want the whole middle block to |
3636 | 1.36k | // appear to execute on this line because: (a) it is all compiler generated, |
3637 | 1.36k | // (b) these instructions are always executed after evaluating the latch |
3638 | 1.36k | // conditional branch, and (c) other passes may add new predecessors which |
3639 | 1.36k | // terminate on this line. This is the easiest way to ensure we don't |
3640 | 1.36k | // accidentally cause an extra step back into the loop while debugging. |
3641 | 1.36k | setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); |
3642 | 2.62k | for (unsigned Part = 1; Part < UF; ++Part1.26k ) { |
3643 | 1.26k | Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); |
3644 | 1.26k | if (Op != Instruction::ICmp && Op != Instruction::FCmp1.19k ) |
3645 | 1.19k | // Floating point operations had to be 'fast' to enable the reduction. |
3646 | 1.19k | ReducedPartRdx = addFastMathFlag( |
3647 | 1.19k | Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, |
3648 | 1.19k | ReducedPartRdx, "bin.rdx"), |
3649 | 1.19k | RdxDesc.getFastMathFlags()); |
3650 | 73 | else |
3651 | 73 | ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, |
3652 | 73 | RdxPart); |
3653 | 1.26k | } |
3654 | 1.36k | |
3655 | 1.36k | if (VF > 1) { |
3656 | 1.05k | bool NoNaN = Legal->hasFunNoNaNAttr(); |
3657 | 1.05k | ReducedPartRdx = |
3658 | 1.05k | createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); |
3659 | 1.05k | // If the reduction can be performed in a smaller type, we need to extend |
3660 | 1.05k | // the reduction to the wider type before we branch to the original loop. |
3661 | 1.05k | if (Phi->getType() != RdxDesc.getRecurrenceType()) |
3662 | 6 | ReducedPartRdx = |
3663 | 6 | RdxDesc.isSigned() |
3664 | 6 | ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())1 |
3665 | 6 | : Builder.CreateZExt(ReducedPartRdx, Phi->getType())5 ; |
3666 | 1.05k | } |
3667 | 1.36k | |
3668 | 1.36k | // Create a phi node that merges control-flow from the backedge-taken check |
3669 | 1.36k | // block and the middle block. |
3670 | 1.36k | PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", |
3671 | 1.36k | LoopScalarPreHeader->getTerminator()); |
3672 | 2.76k | for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I1.40k ) |
3673 | 1.40k | BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); |
3674 | 1.36k | BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); |
3675 | 1.36k | |
3676 | 1.36k | // Now, we need to fix the users of the reduction variable |
3677 | 1.36k | // inside and outside of the scalar remainder loop. |
3678 | 1.36k | // We know that the loop is in LCSSA form. We need to update the |
3679 | 1.36k | // PHI nodes in the exit blocks. |
3680 | 1.70k | for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { |
3681 | 1.70k | // All PHINodes need to have a single entry edge, or two if |
3682 | 1.70k | // we already fixed them. |
3683 | 1.70k | assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); |
3684 | 1.70k | |
3685 | 1.70k | // We found a reduction value exit-PHI. Update it with the |
3686 | 1.70k | // incoming bypass edge. |
3687 | 1.70k | if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) |
3688 | 1.36k | LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); |
3689 | 1.70k | } // end of the LCSSA phi scan. |
3690 | 1.36k | |
3691 | 1.36k | // Fix the scalar loop reduction variable with the incoming reduction sum |
3692 | 1.36k | // from the vector body and from the backedge value. |
3693 | 1.36k | int IncomingEdgeBlockIdx = |
3694 | 1.36k | Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); |
3695 | 1.36k | assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); |
3696 | 1.36k | // Pick the other block. |
3697 | 1.36k | int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0851 : 1511 ); |
3698 | 1.36k | Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); |
3699 | 1.36k | Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); |
3700 | 1.36k | } |
3701 | | |
3702 | 17.0k | void InnerLoopVectorizer::fixLCSSAPHIs() { |
3703 | 17.0k | for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { |
3704 | 2.58k | if (LCSSAPhi.getNumIncomingValues() == 1) { |
3705 | 15 | auto *IncomingValue = LCSSAPhi.getIncomingValue(0); |
3706 | 15 | // Non-instruction incoming values will have only one value. |
3707 | 15 | unsigned LastLane = 0; |
3708 | 15 | if (isa<Instruction>(IncomingValue)) |
3709 | 13 | LastLane = Cost->isUniformAfterVectorization( |
3710 | 13 | cast<Instruction>(IncomingValue), VF) |
3711 | 13 | ? 02 |
3712 | 13 | : VF - 111 ; |
3713 | 15 | // Can be a loop invariant incoming value or the last scalar value to be |
3714 | 15 | // extracted from the vectorized loop. |
3715 | 15 | Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); |
3716 | 15 | Value *lastIncomingValue = |
3717 | 15 | getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); |
3718 | 15 | LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); |
3719 | 15 | } |
3720 | 2.58k | } |
3721 | 17.0k | } |
3722 | | |
3723 | 948 | void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { |
3724 | 948 | // The basic block and loop containing the predicated instruction. |
3725 | 948 | auto *PredBB = PredInst->getParent(); |
3726 | 948 | auto *VectorLoop = LI->getLoopFor(PredBB); |
3727 | 948 | |
3728 | 948 | // Initialize a worklist with the operands of the predicated instruction. |
3729 | 948 | SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); |
3730 | 948 | |
3731 | 948 | // Holds instructions that we need to analyze again. An instruction may be |
3732 | 948 | // reanalyzed if we don't yet know if we can sink it or not. |
3733 | 948 | SmallVector<Instruction *, 8> InstsToReanalyze; |
3734 | 948 | |
3735 | 948 | // Returns true if a given use occurs in the predicated block. Phi nodes use |
3736 | 948 | // their operands in their corresponding predecessor blocks. |
3737 | 3.39k | auto isBlockOfUsePredicated = [&](Use &U) -> bool { |
3738 | 3.39k | auto *I = cast<Instruction>(U.getUser()); |
3739 | 3.39k | BasicBlock *BB = I->getParent(); |
3740 | 3.39k | if (auto *Phi = dyn_cast<PHINode>(I)) |
3741 | 0 | BB = Phi->getIncomingBlock( |
3742 | 0 | PHINode::getIncomingValueNumForOperand(U.getOperandNo())); |
3743 | 3.39k | return BB == PredBB; |
3744 | 3.39k | }; |
3745 | 948 | |
3746 | 948 | // Iteratively sink the scalarized operands of the predicated instruction |
3747 | 948 | // into the block we created for it. When an instruction is sunk, it's |
3748 | 948 | // operands are then added to the worklist. The algorithm ends after one pass |
3749 | 948 | // through the worklist doesn't sink a single instruction. |
3750 | 948 | bool Changed; |
3751 | 1.66k | do { |
3752 | 1.66k | // Add the instructions that need to be reanalyzed to the worklist, and |
3753 | 1.66k | // reset the changed indicator. |
3754 | 1.66k | Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); |
3755 | 1.66k | InstsToReanalyze.clear(); |
3756 | 1.66k | Changed = false; |
3757 | 1.66k | |
3758 | 6.86k | while (!Worklist.empty()) { |
3759 | 5.19k | auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); |
3760 | 5.19k | |
3761 | 5.19k | // We can't sink an instruction if it is a phi node, is already in the |
3762 | 5.19k | // predicated block, is not in the loop, or may have side effects. |
3763 | 5.19k | if (!I || isa<PHINode>(I)3.50k || I->getParent() == PredBB3.23k || |
3764 | 5.19k | !VectorLoop->contains(I)3.00k || I->mayHaveSideEffects()2.76k ) |
3765 | 2.42k | continue; |
3766 | 2.76k | |
3767 | 2.76k | // It's legal to sink the instruction if all its uses occur in the |
3768 | 2.76k | // predicated block. Otherwise, there's nothing to do yet, and we may |
3769 | 2.76k | // need to reanalyze the instruction. |
3770 | 2.76k | if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { |
3771 | 1.41k | InstsToReanalyze.push_back(I); |
3772 | 1.41k | continue; |
3773 | 1.41k | } |
3774 | 1.34k | |
3775 | 1.34k | // Move the instruction to the beginning of the predicated block, and add |
3776 | 1.34k | // it's operands to the worklist. |
3777 | 1.34k | I->moveBefore(&*PredBB->getFirstInsertionPt()); |
3778 | 1.34k | Worklist.insert(I->op_begin(), I->op_end()); |
3779 | 1.34k | |
3780 | 1.34k | // The sinking may have enabled other instructions to be sunk, so we will |
3781 | 1.34k | // need to iterate. |
3782 | 1.34k | Changed = true; |
3783 | 1.34k | } |
3784 | 1.66k | } while (Changed); |
3785 | 948 | } |
3786 | | |
3787 | 7 | void InnerLoopVectorizer::fixNonInductionPHIs() { |
3788 | 9 | for (PHINode *OrigPhi : OrigPHIsToFix) { |
3789 | 9 | PHINode *NewPhi = |
3790 | 9 | cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); |
3791 | 9 | unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); |
3792 | 9 | |
3793 | 9 | SmallVector<BasicBlock *, 2> ScalarBBPredecessors( |
3794 | 9 | predecessors(OrigPhi->getParent())); |
3795 | 9 | SmallVector<BasicBlock *, 2> VectorBBPredecessors( |
3796 | 9 | predecessors(NewPhi->getParent())); |
3797 | 9 | assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && |
3798 | 9 | "Scalar and Vector BB should have the same number of predecessors"); |
3799 | 9 | |
3800 | 9 | // The insertion point in Builder may be invalidated by the time we get |
3801 | 9 | // here. Force the Builder insertion point to something valid so that we do |
3802 | 9 | // not run into issues during insertion point restore in |
3803 | 9 | // getOrCreateVectorValue calls below. |
3804 | 9 | Builder.SetInsertPoint(NewPhi); |
3805 | 9 | |
3806 | 9 | // The predecessor order is preserved and we can rely on mapping between |
3807 | 9 | // scalar and vector block predecessors. |
3808 | 26 | for (unsigned i = 0; i < NumIncomingValues; ++i17 ) { |
3809 | 17 | BasicBlock *NewPredBB = VectorBBPredecessors[i]; |
3810 | 17 | |
3811 | 17 | // When looking up the new scalar/vector values to fix up, use incoming |
3812 | 17 | // values from original phi. |
3813 | 17 | Value *ScIncV = |
3814 | 17 | OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); |
3815 | 17 | |
3816 | 17 | // Scalar incoming value may need a broadcast |
3817 | 17 | Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); |
3818 | 17 | NewPhi->addIncoming(NewIncV, NewPredBB); |
3819 | 17 | } |
3820 | 9 | } |
3821 | 7 | } |
3822 | | |
3823 | | void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, |
3824 | 5.05k | unsigned VF) { |
3825 | 5.05k | PHINode *P = cast<PHINode>(PN); |
3826 | 5.05k | if (EnableVPlanNativePath) { |
3827 | 9 | // Currently we enter here in the VPlan-native path for non-induction |
3828 | 9 | // PHIs where all control flow is uniform. We simply widen these PHIs. |
3829 | 9 | // Create a vector phi with no operands - the vector phi operands will be |
3830 | 9 | // set at the end of vector code generation. |
3831 | 9 | Type *VecTy = |
3832 | 9 | (VF == 1) ? PN->getType()0 : VectorType::get(PN->getType(), VF); |
3833 | 9 | Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); |
3834 | 9 | VectorLoopValueMap.setVectorValue(P, 0, VecPhi); |
3835 | 9 | OrigPHIsToFix.push_back(P); |
3836 | 9 | |
3837 | 9 | return; |
3838 | 9 | } |
3839 | 5.04k | |
3840 | 5.04k | assert(PN->getParent() == OrigLoop->getHeader() && |
3841 | 5.04k | "Non-header phis should have been handled elsewhere"); |
3842 | 5.04k | |
3843 | 5.04k | // In order to support recurrences we need to be able to vectorize Phi nodes. |
3844 | 5.04k | // Phi nodes have cycles, so we need to vectorize them in two stages. This is |
3845 | 5.04k | // stage #1: We create a new vector PHI node with no incoming edges. We'll use |
3846 | 5.04k | // this value when we vectorize all of the instructions that use the PHI. |
3847 | 5.04k | if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)3.68k ) { |
3848 | 4.20k | for (unsigned Part = 0; Part < UF; ++Part2.75k ) { |
3849 | 2.75k | // This is phase one of vectorizing PHIs. |
3850 | 2.75k | Type *VecTy = |
3851 | 2.75k | (VF == 1) ? PN->getType()640 : VectorType::get(PN->getType(), VF)2.11k ; |
3852 | 2.75k | Value *EntryPart = PHINode::Create( |
3853 | 2.75k | VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); |
3854 | 2.75k | VectorLoopValueMap.setVectorValue(P, Part, EntryPart); |
3855 | 2.75k | } |
3856 | 1.45k | return; |
3857 | 1.45k | } |
3858 | 3.59k | |
3859 | 3.59k | setDebugLocFromInst(Builder, P); |
3860 | 3.59k | |
3861 | 3.59k | // This PHINode must be an induction variable. |
3862 | 3.59k | // Make sure that we know about it. |
3863 | 3.59k | assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); |
3864 | 3.59k | |
3865 | 3.59k | InductionDescriptor II = Legal->getInductionVars()->lookup(P); |
3866 | 3.59k | const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); |
3867 | 3.59k | |
3868 | 3.59k | // FIXME: The newly created binary instructions should contain nsw/nuw flags, |
3869 | 3.59k | // which can be found from the original scalar operations. |
3870 | 3.59k | switch (II.getKind()) { |
3871 | 3.59k | case InductionDescriptor::IK_NoInduction: |
3872 | 0 | llvm_unreachable("Unknown induction"); |
3873 | 3.59k | case InductionDescriptor::IK_IntInduction: |
3874 | 0 | case InductionDescriptor::IK_FpInduction: |
3875 | 0 | llvm_unreachable("Integer/fp induction is handled elsewhere."); |
3876 | 3.59k | case InductionDescriptor::IK_PtrInduction: { |
3877 | 3.59k | // Handle the pointer induction variable case. |
3878 | 3.59k | assert(P->getType()->isPointerTy() && "Unexpected type."); |
3879 | 3.59k | // This is the normalized GEP that starts counting at zero. |
3880 | 3.59k | Value *PtrInd = Induction; |
3881 | 3.59k | PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); |
3882 | 3.59k | // Determine the number of scalars we need to generate for each unroll |
3883 | 3.59k | // iteration. If the instruction is uniform, we only need to generate the |
3884 | 3.59k | // first lane. Otherwise, we generate all VF values. |
3885 | 3.59k | unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 13.56k : VF25 ; |
3886 | 3.59k | // These are the scalar results. Notice that we don't generate vector GEPs |
3887 | 3.59k | // because scalar GEPs result in better code. |
3888 | 10.9k | for (unsigned Part = 0; Part < UF; ++Part7.33k ) { |
3889 | 14.7k | for (unsigned Lane = 0; Lane < Lanes; ++Lane7.38k ) { |
3890 | 7.38k | Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); |
3891 | 7.38k | Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); |
3892 | 7.38k | Value *SclrGep = |
3893 | 7.38k | emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); |
3894 | 7.38k | SclrGep->setName("next.gep"); |
3895 | 7.38k | VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); |
3896 | 7.38k | } |
3897 | 7.33k | } |
3898 | 3.59k | return; |
3899 | 0 | } |
3900 | 3.59k | } |
3901 | 3.59k | } |
3902 | | |
3903 | | /// A helper function for checking whether an integer division-related |
3904 | | /// instruction may divide by zero (in which case it must be predicated if |
3905 | | /// executed conditionally in the scalar code). |
3906 | | /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). |
3907 | | /// Non-zero divisors that are non compile-time constants will not be |
3908 | | /// converted into multiplication, so we will still end up scalarizing |
3909 | | /// the division, but can do so w/o predication. |
3910 | 575 | static bool mayDivideByZero(Instruction &I) { |
3911 | 575 | assert((I.getOpcode() == Instruction::UDiv || |
3912 | 575 | I.getOpcode() == Instruction::SDiv || |
3913 | 575 | I.getOpcode() == Instruction::URem || |
3914 | 575 | I.getOpcode() == Instruction::SRem) && |
3915 | 575 | "Unexpected instruction"); |
3916 | 575 | Value *Divisor = I.getOperand(1); |
3917 | 575 | auto *CInt = dyn_cast<ConstantInt>(Divisor); |
3918 | 575 | return !CInt || CInt->isZero()117 ; |
3919 | 575 | } |
3920 | | |
3921 | 50.4k | void InnerLoopVectorizer::widenInstruction(Instruction &I) { |
3922 | 50.4k | switch (I.getOpcode()) { |
3923 | 50.4k | case Instruction::Br: |
3924 | 0 | case Instruction::PHI: |
3925 | 0 | llvm_unreachable("This instruction is handled by a different recipe."); |
3926 | 93 | case Instruction::GetElementPtr: { |
3927 | 93 | // Construct a vector GEP by widening the operands of the scalar GEP as |
3928 | 93 | // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP |
3929 | 93 | // results in a vector of pointers when at least one operand of the GEP |
3930 | 93 | // is vector-typed. Thus, to keep the representation compact, we only use |
3931 | 93 | // vector-typed operands for loop-varying values. |
3932 | 93 | auto *GEP = cast<GetElementPtrInst>(&I); |
3933 | 93 | |
3934 | 93 | if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { |
3935 | 1 | // If we are vectorizing, but the GEP has only loop-invariant operands, |
3936 | 1 | // the GEP we build (by only using vector-typed operands for |
3937 | 1 | // loop-varying values) would be a scalar pointer. Thus, to ensure we |
3938 | 1 | // produce a vector of pointers, we need to either arbitrarily pick an |
3939 | 1 | // operand to broadcast, or broadcast a clone of the original GEP. |
3940 | 1 | // Here, we broadcast a clone of the original. |
3941 | 1 | // |
3942 | 1 | // TODO: If at some point we decide to scalarize instructions having |
3943 | 1 | // loop-invariant operands, this special case will no longer be |
3944 | 1 | // required. We would add the scalarization decision to |
3945 | 1 | // collectLoopScalars() and teach getVectorValue() to broadcast |
3946 | 1 | // the lane-zero scalar value. |
3947 | 1 | auto *Clone = Builder.Insert(GEP->clone()); |
3948 | 2 | for (unsigned Part = 0; Part < UF; ++Part1 ) { |
3949 | 1 | Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); |
3950 | 1 | VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); |
3951 | 1 | addMetadata(EntryPart, GEP); |
3952 | 1 | } |
3953 | 92 | } else { |
3954 | 92 | // If the GEP has at least one loop-varying operand, we are sure to |
3955 | 92 | // produce a vector of pointers. But if we are only unrolling, we want |
3956 | 92 | // to produce a scalar GEP for each unroll part. Thus, the GEP we |
3957 | 92 | // produce with the code below will be scalar (if VF == 1) or vector |
3958 | 92 | // (otherwise). Note that for the unroll-only case, we still maintain |
3959 | 92 | // values in the vector mapping with initVector, as we do for other |
3960 | 92 | // instructions. |
3961 | 248 | for (unsigned Part = 0; Part < UF; ++Part156 ) { |
3962 | 156 | // The pointer operand of the new GEP. If it's loop-invariant, we |
3963 | 156 | // won't broadcast it. |
3964 | 156 | auto *Ptr = |
3965 | 156 | OrigLoop->isLoopInvariant(GEP->getPointerOperand()) |
3966 | 156 | ? GEP->getPointerOperand()141 |
3967 | 156 | : getOrCreateVectorValue(GEP->getPointerOperand(), Part)15 ; |
3968 | 156 | |
3969 | 156 | // Collect all the indices for the new GEP. If any index is |
3970 | 156 | // loop-invariant, we won't broadcast it. |
3971 | 156 | SmallVector<Value *, 4> Indices; |
3972 | 215 | for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { |
3973 | 215 | if (OrigLoop->isLoopInvariant(U.get())) |
3974 | 63 | Indices.push_back(U.get()); |
3975 | 152 | else |
3976 | 152 | Indices.push_back(getOrCreateVectorValue(U.get(), Part)); |
3977 | 215 | } |
3978 | 156 | |
3979 | 156 | // Create the new GEP. Note that this GEP may be a scalar if VF == 1, |
3980 | 156 | // but it should be a vector, otherwise. |
3981 | 156 | auto *NewGEP = |
3982 | 156 | GEP->isInBounds() |
3983 | 156 | ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, |
3984 | 154 | Indices) |
3985 | 156 | : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices)2 ; |
3986 | 156 | assert((VF == 1 || NewGEP->getType()->isVectorTy()) && |
3987 | 156 | "NewGEP is not a pointer vector"); |
3988 | 156 | VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); |
3989 | 156 | addMetadata(NewGEP, GEP); |
3990 | 156 | } |
3991 | 92 | } |
3992 | 93 | |
3993 | 93 | break; |
3994 | 0 | } |
3995 | 30.6k | case Instruction::UDiv: |
3996 | 30.6k | case Instruction::SDiv: |
3997 | 30.6k | case Instruction::SRem: |
3998 | 30.6k | case Instruction::URem: |
3999 | 30.6k | case Instruction::Add: |
4000 | 30.6k | case Instruction::FAdd: |
4001 | 30.6k | case Instruction::Sub: |
4002 | 30.6k | case Instruction::FSub: |
4003 | 30.6k | case Instruction::FNeg: |
4004 | 30.6k | case Instruction::Mul: |
4005 | 30.6k | case Instruction::FMul: |
4006 | 30.6k | case Instruction::FDiv: |
4007 | 30.6k | case Instruction::FRem: |
4008 | 30.6k | case Instruction::Shl: |
4009 | 30.6k | case Instruction::LShr: |
4010 | 30.6k | case Instruction::AShr: |
4011 | 30.6k | case Instruction::And: |
4012 | 30.6k | case Instruction::Or: |
4013 | 30.6k | case Instruction::Xor: { |
4014 | 30.6k | // Just widen unops and binops. |
4015 | 30.6k | setDebugLocFromInst(Builder, &I); |
4016 | 30.6k | |
4017 | 87.2k | for (unsigned Part = 0; Part < UF; ++Part56.5k ) { |
4018 | 56.5k | SmallVector<Value *, 2> Ops; |
4019 | 56.5k | for (Value *Op : I.operands()) |
4020 | 113k | Ops.push_back(getOrCreateVectorValue(Op, Part)); |
4021 | 56.5k | |
4022 | 56.5k | Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); |
4023 | 56.5k | |
4024 | 56.5k | if (auto *VecOp = dyn_cast<Instruction>(V)) |
4025 | 56.5k | VecOp->copyIRFlags(&I); |
4026 | 56.5k | |
4027 | 56.5k | // Use this vector value for all users of the original instruction. |
4028 | 56.5k | VectorLoopValueMap.setVectorValue(&I, Part, V); |
4029 | 56.5k | addMetadata(V, &I); |
4030 | 56.5k | } |
4031 | 30.6k | |
4032 | 30.6k | break; |
4033 | 30.6k | } |
4034 | 30.6k | case Instruction::Select: { |
4035 | 596 | // Widen selects. |
4036 | 596 | // If the selector is loop invariant we can create a select |
4037 | 596 | // instruction with a scalar condition. Otherwise, use vector-select. |
4038 | 596 | auto *SE = PSE.getSE(); |
4039 | 596 | bool InvariantCond = |
4040 | 596 | SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); |
4041 | 596 | setDebugLocFromInst(Builder, &I); |
4042 | 596 | |
4043 | 596 | // The condition can be loop invariant but still defined inside the |
4044 | 596 | // loop. This means that we can't just use the original 'cond' value. |
4045 | 596 | // We have to take the 'vectorized' value and pick the first lane. |
4046 | 596 | // Instcombine will make this a no-op. |
4047 | 596 | |
4048 | 596 | auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); |
4049 | 596 | |
4050 | 1.49k | for (unsigned Part = 0; Part < UF; ++Part902 ) { |
4051 | 902 | Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); |
4052 | 902 | Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); |
4053 | 902 | Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); |
4054 | 902 | Value *Sel = |
4055 | 902 | Builder.CreateSelect(InvariantCond ? ScalarCond1 : Cond901 , Op0, Op1); |
4056 | 902 | VectorLoopValueMap.setVectorValue(&I, Part, Sel); |
4057 | 902 | addMetadata(Sel, &I); |
4058 | 902 | } |
4059 | 596 | |
4060 | 596 | break; |
4061 | 30.6k | } |
4062 | 30.6k | |
4063 | 30.6k | case Instruction::ICmp: |
4064 | 907 | case Instruction::FCmp: { |
4065 | 907 | // Widen compares. Generate vector compares. |
4066 | 907 | bool FCmp = (I.getOpcode() == Instruction::FCmp); |
4067 | 907 | auto *Cmp = dyn_cast<CmpInst>(&I); |
4068 | 907 | setDebugLocFromInst(Builder, Cmp); |
4069 | 2.31k | for (unsigned Part = 0; Part < UF; ++Part1.40k ) { |
4070 | 1.40k | Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); |
4071 | 1.40k | Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); |
4072 | 1.40k | Value *C = nullptr; |
4073 | 1.40k | if (FCmp) { |
4074 | 181 | // Propagate fast math flags. |
4075 | 181 | IRBuilder<>::FastMathFlagGuard FMFG(Builder); |
4076 | 181 | Builder.setFastMathFlags(Cmp->getFastMathFlags()); |
4077 | 181 | C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); |
4078 | 1.22k | } else { |
4079 | 1.22k | C = Builder.CreateICmp(Cmp->getPredicate(), A, B); |
4080 | 1.22k | } |
4081 | 1.40k | VectorLoopValueMap.setVectorValue(&I, Part, C); |
4082 | 1.40k | addMetadata(C, &I); |
4083 | 1.40k | } |
4084 | 907 | |
4085 | 907 | break; |
4086 | 907 | } |
4087 | 907 | |
4088 | 17.8k | case Instruction::ZExt: |
4089 | 17.8k | case Instruction::SExt: |
4090 | 17.8k | case Instruction::FPToUI: |
4091 | 17.8k | case Instruction::FPToSI: |
4092 | 17.8k | case Instruction::FPExt: |
4093 | 17.8k | case Instruction::PtrToInt: |
4094 | 17.8k | case Instruction::IntToPtr: |
4095 | 17.8k | case Instruction::SIToFP: |
4096 | 17.8k | case Instruction::UIToFP: |
4097 | 17.8k | case Instruction::Trunc: |
4098 | 17.8k | case Instruction::FPTrunc: |
4099 | 17.8k | case Instruction::BitCast: { |
4100 | 17.8k | auto *CI = dyn_cast<CastInst>(&I); |
4101 | 17.8k | setDebugLocFromInst(Builder, CI); |
4102 | 17.8k | |
4103 | 17.8k | /// Vectorize casts. |
4104 | 17.8k | Type *DestTy = |
4105 | 17.8k | (VF == 1) ? CI->getType()0 : VectorType::get(CI->getType(), VF); |
4106 | 17.8k | |
4107 | 52.0k | for (unsigned Part = 0; Part < UF; ++Part34.1k ) { |
4108 | 34.1k | Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); |
4109 | 34.1k | Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); |
4110 | 34.1k | VectorLoopValueMap.setVectorValue(&I, Part, Cast); |
4111 | 34.1k | addMetadata(Cast, &I); |
4112 | 34.1k | } |
4113 | 17.8k | break; |
4114 | 17.8k | } |
4115 | 17.8k | |
4116 | 17.8k | case Instruction::Call: { |
4117 | 297 | // Ignore dbg intrinsics. |
4118 | 297 | if (isa<DbgInfoIntrinsic>(I)) |
4119 | 0 | break; |
4120 | 297 | setDebugLocFromInst(Builder, &I); |
4121 | 297 | |
4122 | 297 | Module *M = I.getParent()->getParent()->getParent(); |
4123 | 297 | auto *CI = cast<CallInst>(&I); |
4124 | 297 | |
4125 | 297 | StringRef FnName = CI->getCalledFunction()->getName(); |
4126 | 297 | Function *F = CI->getCalledFunction(); |
4127 | 297 | Type *RetTy = ToVectorTy(CI->getType(), VF); |
4128 | 297 | SmallVector<Type *, 4> Tys; |
4129 | 297 | for (Value *ArgOperand : CI->arg_operands()) |
4130 | 342 | Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); |
4131 | 297 | |
4132 | 297 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
4133 | 297 | |
4134 | 297 | // The flag shows whether we use Intrinsic or a usual Call for vectorized |
4135 | 297 | // version of the instruction. |
4136 | 297 | // Is it beneficial to perform intrinsic call compared to lib call? |
4137 | 297 | bool NeedToScalarize; |
4138 | 297 | unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); |
4139 | 297 | bool UseVectorIntrinsic = |
4140 | 297 | ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost232 ; |
4141 | 297 | assert((UseVectorIntrinsic || !NeedToScalarize) && |
4142 | 297 | "Instruction should be scalarized elsewhere."); |
4143 | 297 | |
4144 | 652 | for (unsigned Part = 0; Part < UF; ++Part355 ) { |
4145 | 355 | SmallVector<Value *, 4> Args; |
4146 | 761 | for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i406 ) { |
4147 | 406 | Value *Arg = CI->getArgOperand(i); |
4148 | 406 | // Some intrinsics have a scalar argument - don't replace it with a |
4149 | 406 | // vector. |
4150 | 406 | if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)262 ) |
4151 | 403 | Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); |
4152 | 406 | Args.push_back(Arg); |
4153 | 406 | } |
4154 | 355 | |
4155 | 355 | Function *VectorF; |
4156 | 355 | if (UseVectorIntrinsic) { |
4157 | 221 | // Use vector version of the intrinsic. |
4158 | 221 | Type *TysForDecl[] = {CI->getType()}; |
4159 | 221 | if (VF > 1) |
4160 | 221 | TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); |
4161 | 221 | VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); |
4162 | 221 | } else { |
4163 | 134 | // Use vector version of the library call. |
4164 | 134 | StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); |
4165 | 134 | assert(!VFnName.empty() && "Vector function name is empty."); |
4166 | 134 | VectorF = M->getFunction(VFnName); |
4167 | 134 | if (!VectorF) { |
4168 | 83 | // Generate a declaration |
4169 | 83 | FunctionType *FTy = FunctionType::get(RetTy, Tys, false); |
4170 | 83 | VectorF = |
4171 | 83 | Function::Create(FTy, Function::ExternalLinkage, VFnName, M); |
4172 | 83 | VectorF->copyAttributesFrom(F); |
4173 | 83 | } |
4174 | 134 | } |
4175 | 355 | assert(VectorF && "Can't create vector function."); |
4176 | 355 | |
4177 | 355 | SmallVector<OperandBundleDef, 1> OpBundles; |
4178 | 355 | CI->getOperandBundlesAsDefs(OpBundles); |
4179 | 355 | CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); |
4180 | 355 | |
4181 | 355 | if (isa<FPMathOperator>(V)) |
4182 | 335 | V->copyFastMathFlags(CI); |
4183 | 355 | |
4184 | 355 | VectorLoopValueMap.setVectorValue(&I, Part, V); |
4185 | 355 | addMetadata(V, &I); |
4186 | 355 | } |
4187 | 297 | |
4188 | 297 | break; |
4189 | 297 | } |
4190 | 297 | |
4191 | 297 | default: |
4192 | 0 | // This instruction is not vectorized by simple widening. |
4193 | 0 | LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); |
4194 | 0 | llvm_unreachable("Unhandled instruction!"); |
4195 | 50.4k | } // end of switch. |
4196 | 50.4k | } |
4197 | | |
4198 | 17.0k | void InnerLoopVectorizer::updateAnalysis() { |
4199 | 17.0k | // Forget the original basic block. |
4200 | 17.0k | PSE.getSE()->forgetLoop(OrigLoop); |
4201 | 17.0k | |
4202 | 17.0k | // DT is not kept up-to-date for outer loop vectorization |
4203 | 17.0k | if (EnableVPlanNativePath) |
4204 | 7 | return; |
4205 | 17.0k | |
4206 | 17.0k | // Update the dominator tree information. |
4207 | 17.0k | assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && |
4208 | 17.0k | "Entry does not dominate exit."); |
4209 | 17.0k | |
4210 | 17.0k | DT->addNewBlock(LoopMiddleBlock, |
4211 | 17.0k | LI->getLoopFor(LoopVectorBody)->getLoopLatch()); |
4212 | 17.0k | DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); |
4213 | 17.0k | DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); |
4214 | 17.0k | DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); |
4215 | 17.0k | assert(DT->verify(DominatorTree::VerificationLevel::Fast)); |
4216 | 17.0k | } |
4217 | | |
4218 | 32.3k | void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { |
4219 | 32.3k | // We should not collect Scalars more than once per VF. Right now, this |
4220 | 32.3k | // function is called from collectUniformsAndScalars(), which already does |
4221 | 32.3k | // this check. Collecting Scalars for VF=1 does not make any sense. |
4222 | 32.3k | assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && |
4223 | 32.3k | "This function should not be visited twice for the same VF"); |
4224 | 32.3k | |
4225 | 32.3k | SmallSetVector<Instruction *, 8> Worklist; |
4226 | 32.3k | |
4227 | 32.3k | // These sets are used to seed the analysis with pointers used by memory |
4228 | 32.3k | // accesses that will remain scalar. |
4229 | 32.3k | SmallSetVector<Instruction *, 8> ScalarPtrs; |
4230 | 32.3k | SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; |
4231 | 32.3k | |
4232 | 32.3k | // A helper that returns true if the use of Ptr by MemAccess will be scalar. |
4233 | 32.3k | // The pointer operands of loads and stores will be scalar as long as the |
4234 | 32.3k | // memory access is not a gather or scatter operation. The value operand of a |
4235 | 32.3k | // store will remain scalar if the store is scalarized. |
4236 | 32.3k | auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { |
4237 | 16.3k | InstWidening WideningDecision = getWideningDecision(MemAccess, VF); |
4238 | 16.3k | assert(WideningDecision != CM_Unknown && |
4239 | 16.3k | "Widening decision should be ready at this moment"); |
4240 | 16.3k | if (auto *Store = dyn_cast<StoreInst>(MemAccess)) |
4241 | 6.44k | if (Ptr == Store->getValueOperand()) |
4242 | 82 | return WideningDecision == CM_Scalarize; |
4243 | 16.2k | assert(Ptr == getLoadStorePointerOperand(MemAccess) && |
4244 | 16.2k | "Ptr is neither a value or pointer operand"); |
4245 | 16.2k | return WideningDecision != CM_GatherScatter; |
4246 | 16.2k | }; |
4247 | 32.3k | |
4248 | 32.3k | // A helper that returns true if the given value is a bitcast or |
4249 | 32.3k | // getelementptr instruction contained in the loop. |
4250 | 240k | auto isLoopVaryingBitCastOrGEP = [&](Value *V) { |
4251 | 240k | return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()8.18k ) || |
4252 | 240k | isa<GetElementPtrInst>(V)232k ) && |
4253 | 240k | !TheLoop->isLoopInvariant(V)76.6k ; |
4254 | 240k | }; |
4255 | 32.3k | |
4256 | 32.3k | // A helper that evaluates a memory access's use of a pointer. If the use |
4257 | 32.3k | // will be a scalar use, and the pointer is only used by memory accesses, we |
4258 | 32.3k | // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in |
4259 | 32.3k | // PossibleNonScalarPtrs. |
4260 | 96.9k | auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { |
4261 | 96.9k | // We only care about bitcast and getelementptr instructions contained in |
4262 | 96.9k | // the loop. |
4263 | 96.9k | if (!isLoopVaryingBitCastOrGEP(Ptr)) |
4264 | 36.5k | return; |
4265 | 60.3k | |
4266 | 60.3k | // If the pointer has already been identified as scalar (e.g., if it was |
4267 | 60.3k | // also identified as uniform), there's nothing to do. |
4268 | 60.3k | auto *I = cast<Instruction>(Ptr); |
4269 | 60.3k | if (Worklist.count(I)) |
4270 | 46.0k | return; |
4271 | 14.3k | |
4272 | 14.3k | // If the use of the pointer will be a scalar use, and all users of the |
4273 | 14.3k | // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, |
4274 | 14.3k | // place the pointer in PossibleNonScalarPtrs. |
4275 | 16.5k | if (14.3k isScalarUse(MemAccess, Ptr)14.3k && llvm::all_of(I->users(), [&](User *U) 14.1k { |
4276 | 16.5k | return isa<LoadInst>(U) || isa<StoreInst>(U)6.66k ; |
4277 | 16.5k | })) |
4278 | 14.1k | ScalarPtrs.insert(I); |
4279 | 196 | else |
4280 | 196 | PossibleNonScalarPtrs.insert(I); |
4281 | 14.3k | }; |
4282 | 32.3k | |
4283 | 32.3k | // We seed the scalars analysis with three classes of instructions: (1) |
4284 | 32.3k | // instructions marked uniform-after-vectorization, (2) bitcast and |
4285 | 32.3k | // getelementptr instructions used by memory accesses requiring a scalar use, |
4286 | 32.3k | // and (3) pointer induction variables and their update instructions (we |
4287 | 32.3k | // currently only scalarize these). |
4288 | 32.3k | // |
4289 | 32.3k | // (1) Add to the worklist all instructions that have been identified as |
4290 | 32.3k | // uniform-after-vectorization. |
4291 | 32.3k | Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); |
4292 | 32.3k | |
4293 | 32.3k | // (2) Add to the worklist all bitcast and getelementptr instructions used by |
4294 | 32.3k | // memory accesses requiring a scalar use. The pointer operands of loads and |
4295 | 32.3k | // stores will be scalar as long as the memory accesses is not a gather or |
4296 | 32.3k | // scatter operation. The value operand of a store will remain scalar if the |
4297 | 32.3k | // store is scalarized. |
4298 | 32.3k | for (auto *BB : TheLoop->blocks()) |
4299 | 389k | for (auto &I : *BB)35.8k { |
4300 | 389k | if (auto *Load = dyn_cast<LoadInst>(&I)) { |
4301 | 31.6k | evaluatePtrUse(Load, Load->getPointerOperand()); |
4302 | 357k | } else if (auto *Store = dyn_cast<StoreInst>(&I)) { |
4303 | 32.6k | evaluatePtrUse(Store, Store->getPointerOperand()); |
4304 | 32.6k | evaluatePtrUse(Store, Store->getValueOperand()); |
4305 | 32.6k | } |
4306 | 389k | } |
4307 | 32.3k | for (auto *I : ScalarPtrs) |
4308 | 13.0k | if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { |
4309 | 13.0k | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); |
4310 | 13.0k | Worklist.insert(I); |
4311 | 13.0k | } |
4312 | 32.3k | |
4313 | 32.3k | // (3) Add to the worklist all pointer induction variables and their update |
4314 | 32.3k | // instructions. |
4315 | 32.3k | // |
4316 | 32.3k | // TODO: Once we are able to vectorize pointer induction variables we should |
4317 | 32.3k | // no longer insert them into the worklist here. |
4318 | 32.3k | auto *Latch = TheLoop->getLoopLatch(); |
4319 | 39.0k | for (auto &Induction : *Legal->getInductionVars()) { |
4320 | 39.0k | auto *Ind = Induction.first; |
4321 | 39.0k | auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); |
4322 | 39.0k | if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) |
4323 | 31.8k | continue; |
4324 | 7.21k | Worklist.insert(Ind); |
4325 | 7.21k | Worklist.insert(IndUpdate); |
4326 | 7.21k | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); |
4327 | 7.21k | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate |
4328 | 7.21k | << "\n"); |
4329 | 7.21k | } |
4330 | 32.3k | |
4331 | 32.3k | // Insert the forced scalars. |
4332 | 32.3k | // FIXME: Currently widenPHIInstruction() often creates a dead vector |
4333 | 32.3k | // induction variable when the PHI user is scalarized. |
4334 | 32.3k | auto ForcedScalar = ForcedScalars.find(VF); |
4335 | 32.3k | if (ForcedScalar != ForcedScalars.end()) |
4336 | 13 | for (auto *I : ForcedScalar->second) |
4337 | 60 | Worklist.insert(I); |
4338 | 32.3k | |
4339 | 32.3k | // Expand the worklist by looking through any bitcasts and getelementptr |
4340 | 32.3k | // instructions we've already identified as scalar. This is similar to the |
4341 | 32.3k | // expansion step in collectLoopUniforms(); however, here we're only |
4342 | 32.3k | // expanding to include additional bitcasts and getelementptr instructions. |
4343 | 32.3k | unsigned Idx = 0; |
4344 | 175k | while (Idx != Worklist.size()) { |
4345 | 143k | Instruction *Dst = Worklist[Idx++]; |
4346 | 143k | if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) |
4347 | 132k | continue; |
4348 | 10.9k | auto *Src = cast<Instruction>(Dst->getOperand(0)); |
4349 | 30.6k | if (llvm::all_of(Src->users(), [&](User *U) -> bool 10.9k { |
4350 | 30.6k | auto *J = cast<Instruction>(U); |
4351 | 30.6k | return !TheLoop->contains(J) || Worklist.count(J)28.7k || |
4352 | 30.6k | (1.99k (1.99k isa<LoadInst>(J)1.99k || isa<StoreInst>(J)1.03k ) && |
4353 | 1.99k | isScalarUse(J, Src)); |
4354 | 30.6k | })) { |
4355 | 10.9k | Worklist.insert(Src); |
4356 | 10.9k | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); |
4357 | 10.9k | } |
4358 | 10.9k | } |
4359 | 32.3k | |
4360 | 32.3k | // An induction variable will remain scalar if all users of the induction |
4361 | 32.3k | // variable and induction variable update remain scalar. |
4362 | 39.0k | for (auto &Induction : *Legal->getInductionVars()) { |
4363 | 39.0k | auto *Ind = Induction.first; |
4364 | 39.0k | auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); |
4365 | 39.0k | |
4366 | 39.0k | // We already considered pointer induction variables, so there's no reason |
4367 | 39.0k | // to look at their users again. |
4368 | 39.0k | // |
4369 | 39.0k | // TODO: Once we are able to vectorize pointer induction variables we |
4370 | 39.0k | // should no longer skip over them here. |
4371 | 39.0k | if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) |
4372 | 7.21k | continue; |
4373 | 31.8k | |
4374 | 31.8k | // Determine if all users of the induction variable are scalar after |
4375 | 31.8k | // vectorization. |
4376 | 73.1k | auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool 31.8k { |
4377 | 73.1k | auto *I = cast<Instruction>(U); |
4378 | 73.1k | return I == IndUpdate || !TheLoop->contains(I)42.7k || Worklist.count(I)42.6k ; |
4379 | 73.1k | }); |
4380 | 31.8k | if (!ScalarInd) |
4381 | 4.16k | continue; |
4382 | 27.6k | |
4383 | 27.6k | // Determine if all users of the induction variable update instruction are |
4384 | 27.6k | // scalar after vectorization. |
4385 | 27.6k | auto ScalarIndUpdate = |
4386 | 65.9k | llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { |
4387 | 65.9k | auto *I = cast<Instruction>(U); |
4388 | 65.9k | return I == Ind || !TheLoop->contains(I)38.3k || Worklist.count(I)37.6k ; |
4389 | 65.9k | }); |
4390 | 27.6k | if (!ScalarIndUpdate) |
4391 | 9.78k | continue; |
4392 | 17.8k | |
4393 | 17.8k | // The induction variable and its update instruction will remain scalar. |
4394 | 17.8k | Worklist.insert(Ind); |
4395 | 17.8k | Worklist.insert(IndUpdate); |
4396 | 17.8k | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); |
4397 | 17.8k | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate |
4398 | 17.8k | << "\n"); |
4399 | 17.8k | } |
4400 | 32.3k | |
4401 | 32.3k | Scalars[VF].insert(Worklist.begin(), Worklist.end()); |
4402 | 32.3k | } |
4403 | | |
4404 | 749k | bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { |
4405 | 749k | if (!blockNeedsPredication(I->getParent())) |
4406 | 708k | return false; |
4407 | 41.0k | switch(I->getOpcode()) { |
4408 | 41.0k | default: |
4409 | 27.6k | break; |
4410 | 41.0k | case Instruction::Load: |
4411 | 12.7k | case Instruction::Store: { |
4412 | 12.7k | if (!Legal->isMaskRequired(I)) |
4413 | 86 | return false; |
4414 | 12.7k | auto *Ptr = getLoadStorePointerOperand(I); |
4415 | 12.7k | auto *Ty = getMemInstValueType(I); |
4416 | 12.7k | // We have already decided how to vectorize this instruction, get that |
4417 | 12.7k | // result. |
4418 | 12.7k | if (VF > 1) { |
4419 | 4.54k | InstWidening WideningDecision = getWideningDecision(I, VF); |
4420 | 4.54k | assert(WideningDecision != CM_Unknown && |
4421 | 4.54k | "Widening decision should be ready at this moment"); |
4422 | 4.54k | return WideningDecision == CM_Scalarize; |
4423 | 4.54k | } |
4424 | 8.16k | return isa<LoadInst>(I) ? |
4425 | 3.64k | !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty)3.41k ) |
4426 | 8.16k | : !(4.52k isLegalMaskedStore(Ty, Ptr)4.52k || isLegalMaskedScatter(Ty)4.17k ); |
4427 | 8.16k | } |
4428 | 8.16k | case Instruction::UDiv: |
4429 | 575 | case Instruction::SDiv: |
4430 | 575 | case Instruction::SRem: |
4431 | 575 | case Instruction::URem: |
4432 | 575 | return mayDivideByZero(*I); |
4433 | 27.6k | } |
4434 | 27.6k | return false; |
4435 | 27.6k | } |
4436 | | |
4437 | | bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, |
4438 | 2.26k | unsigned VF) { |
4439 | 2.26k | assert(isAccessInterleaved(I) && "Expecting interleaved access."); |
4440 | 2.26k | assert(getWideningDecision(I, VF) == CM_Unknown && |
4441 | 2.26k | "Decision should not be set yet."); |
4442 | 2.26k | auto *Group = getInterleavedAccessGroup(I); |
4443 | 2.26k | assert(Group && "Must have a group."); |
4444 | 2.26k | |
4445 | 2.26k | // If the instruction's allocated size doesn't equal it's type size, it |
4446 | 2.26k | // requires padding and will be scalarized. |
4447 | 2.26k | auto &DL = I->getModule()->getDataLayout(); |
4448 | 2.26k | auto *ScalarTy = getMemInstValueType(I); |
4449 | 2.26k | if (hasIrregularType(ScalarTy, DL, VF)) |
4450 | 1 | return false; |
4451 | 2.26k | |
4452 | 2.26k | // Check if masking is required. |
4453 | 2.26k | // A Group may need masking for one of two reasons: it resides in a block that |
4454 | 2.26k | // needs predication, or it was decided to use masking to deal with gaps. |
4455 | 2.26k | bool PredicatedAccessRequiresMasking = |
4456 | 2.26k | Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I)8 ; |
4457 | 2.26k | bool AccessWithGapsRequiresMasking = |
4458 | 2.26k | Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed342 ; |
4459 | 2.26k | if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking2.25k ) |
4460 | 2.25k | return true; |
4461 | 10 | |
4462 | 10 | // If masked interleaving is required, we expect that the user/target had |
4463 | 10 | // enabled it, because otherwise it either wouldn't have been created or |
4464 | 10 | // it should have been invalidated by the CostModel. |
4465 | 10 | assert(useMaskedInterleavedAccesses(TTI) && |
4466 | 10 | "Masked interleave-groups for predicated accesses are not enabled."); |
4467 | 10 | |
4468 | 10 | auto *Ty = getMemInstValueType(I); |
4469 | 10 | return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)8 |
4470 | 10 | : TTI.isLegalMaskedStore(Ty)2 ; |
4471 | 10 | } |
4472 | | |
4473 | | bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, |
4474 | 63.6k | unsigned VF) { |
4475 | 63.6k | // Get and ensure we have a valid memory instruction. |
4476 | 63.6k | LoadInst *LI = dyn_cast<LoadInst>(I); |
4477 | 63.6k | StoreInst *SI = dyn_cast<StoreInst>(I); |
4478 | 63.6k | assert((LI || SI) && "Invalid memory instruction"); |
4479 | 63.6k | |
4480 | 63.6k | auto *Ptr = getLoadStorePointerOperand(I); |
4481 | 63.6k | |
4482 | 63.6k | // In order to be widened, the pointer should be consecutive, first of all. |
4483 | 63.6k | if (!Legal->isConsecutivePtr(Ptr)) |
4484 | 19.1k | return false; |
4485 | 44.5k | |
4486 | 44.5k | // If the instruction is a store located in a predicated block, it will be |
4487 | 44.5k | // scalarized. |
4488 | 44.5k | if (isScalarWithPredication(I)) |
4489 | 1.06k | return false; |
4490 | 43.4k | |
4491 | 43.4k | // If the instruction's allocated size doesn't equal it's type size, it |
4492 | 43.4k | // requires padding and will be scalarized. |
4493 | 43.4k | auto &DL = I->getModule()->getDataLayout(); |
4494 | 43.4k | auto *ScalarTy = LI ? LI->getType()18.8k : SI->getValueOperand()->getType()24.5k ; |
4495 | 43.4k | if (hasIrregularType(ScalarTy, DL, VF)) |
4496 | 1 | return false; |
4497 | 43.4k | |
4498 | 43.4k | return true; |
4499 | 43.4k | } |
4500 | | |
4501 | 32.3k | void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { |
4502 | 32.3k | // We should not collect Uniforms more than once per VF. Right now, |
4503 | 32.3k | // this function is called from collectUniformsAndScalars(), which |
4504 | 32.3k | // already does this check. Collecting Uniforms for VF=1 does not make any |
4505 | 32.3k | // sense. |
4506 | 32.3k | |
4507 | 32.3k | assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && |
4508 | 32.3k | "This function should not be visited twice for the same VF"); |
4509 | 32.3k | |
4510 | 32.3k | // Visit the list of Uniforms. If we'll not find any uniform value, we'll |
4511 | 32.3k | // not analyze again. Uniforms.count(VF) will return 1. |
4512 | 32.3k | Uniforms[VF].clear(); |
4513 | 32.3k | |
4514 | 32.3k | // We now know that the loop is vectorizable! |
4515 | 32.3k | // Collect instructions inside the loop that will remain uniform after |
4516 | 32.3k | // vectorization. |
4517 | 32.3k | |
4518 | 32.3k | // Global values, params and instructions outside of current loop are out of |
4519 | 32.3k | // scope. |
4520 | 197k | auto isOutOfScope = [&](Value *V) -> bool { |
4521 | 197k | Instruction *I = dyn_cast<Instruction>(V); |
4522 | 197k | return (!I || !TheLoop->contains(I)113k ); |
4523 | 197k | }; |
4524 | 32.3k | |
4525 | 32.3k | SetVector<Instruction *> Worklist; |
4526 | 32.3k | BasicBlock *Latch = TheLoop->getLoopLatch(); |
4527 | 32.3k | |
4528 | 32.3k | // Start with the conditional branch. If the branch condition is an |
4529 | 32.3k | // instruction contained in the loop that is only used by the branch, it is |
4530 | 32.3k | // uniform. |
4531 | 32.3k | auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); |
4532 | 32.3k | if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) { |
4533 | 32.3k | Worklist.insert(Cmp); |
4534 | 32.3k | LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); |
4535 | 32.3k | } |
4536 | 32.3k | |
4537 | 32.3k | // Holds consecutive and consecutive-like pointers. Consecutive-like pointers |
4538 | 32.3k | // are pointers that are treated like consecutive pointers during |
4539 | 32.3k | // vectorization. The pointer operands of interleaved accesses are an |
4540 | 32.3k | // example. |
4541 | 32.3k | SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; |
4542 | 32.3k | |
4543 | 32.3k | // Holds pointer operands of instructions that are possibly non-uniform. |
4544 | 32.3k | SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; |
4545 | 32.3k | |
4546 | 64.0k | auto isUniformDecision = [&](Instruction *I, unsigned VF) { |
4547 | 64.0k | InstWidening WideningDecision = getWideningDecision(I, VF); |
4548 | 64.0k | assert(WideningDecision != CM_Unknown && |
4549 | 64.0k | "Widening decision should be ready at this moment"); |
4550 | 64.0k | |
4551 | 64.0k | return (WideningDecision == CM_Widen || |
4552 | 64.0k | WideningDecision == CM_Widen_Reverse23.5k || |
4553 | 64.0k | WideningDecision == CM_Interleave20.5k ); |
4554 | 64.0k | }; |
4555 | 32.3k | // Iterate over the instructions in the loop, and collect all |
4556 | 32.3k | // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible |
4557 | 32.3k | // that a consecutive-like pointer operand will be scalarized, we collect it |
4558 | 32.3k | // in PossibleNonUniformPtrs instead. We use two sets here because a single |
4559 | 32.3k | // getelementptr instruction can be used by both vectorized and scalarized |
4560 | 32.3k | // memory instructions. For example, if a loop loads and stores from the same |
4561 | 32.3k | // location, but the store is conditional, the store will be scalarized, and |
4562 | 32.3k | // the getelementptr won't remain uniform. |
4563 | 32.3k | for (auto *BB : TheLoop->blocks()) |
4564 | 389k | for (auto &I : *BB)35.8k { |
4565 | 389k | // If there's no pointer operand, there's nothing to do. |
4566 | 389k | auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); |
4567 | 389k | if (!Ptr) |
4568 | 324k | continue; |
4569 | 64.1k | |
4570 | 64.1k | // True if all users of Ptr are memory accesses that have Ptr as their |
4571 | 64.1k | // pointer operand. |
4572 | 64.1k | auto UsersAreMemAccesses = |
4573 | 73.7k | llvm::all_of(Ptr->users(), [&](User *U) -> bool { |
4574 | 73.7k | return getLoadStorePointerOperand(U) == Ptr; |
4575 | 73.7k | }); |
4576 | 64.1k | |
4577 | 64.1k | // Ensure the memory instruction will not be scalarized or used by |
4578 | 64.1k | // gather/scatter, making its pointer operand non-uniform. If the pointer |
4579 | 64.1k | // operand is used by any instruction other than a memory access, we |
4580 | 64.1k | // conservatively assume the pointer operand may be non-uniform. |
4581 | 64.1k | if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)59.8k ) |
4582 | 18.8k | PossibleNonUniformPtrs.insert(Ptr); |
4583 | 45.2k | |
4584 | 45.2k | // If the memory instruction will be vectorized and its pointer operand |
4585 | 45.2k | // is consecutive-like, or interleaving - the pointer operand should |
4586 | 45.2k | // remain uniform. |
4587 | 45.2k | else |
4588 | 45.2k | ConsecutiveLikePtrs.insert(Ptr); |
4589 | 64.1k | } |
4590 | 32.3k | |
4591 | 32.3k | // Add to the Worklist all consecutive and consecutive-like pointers that |
4592 | 32.3k | // aren't also identified as possibly non-uniform. |
4593 | 32.3k | for (auto *V : ConsecutiveLikePtrs) |
4594 | 43.1k | if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) { |
4595 | 42.8k | LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); |
4596 | 42.8k | Worklist.insert(V); |
4597 | 42.8k | } |
4598 | 32.3k | |
4599 | 32.3k | // Expand Worklist in topological order: whenever a new instruction |
4600 | 32.3k | // is added , its users should be already inside Worklist. It ensures |
4601 | 32.3k | // a uniform instruction will only be used by uniform instructions. |
4602 | 32.3k | unsigned idx = 0; |
4603 | 115k | while (idx != Worklist.size()) { |
4604 | 82.6k | Instruction *I = Worklist[idx++]; |
4605 | 82.6k | |
4606 | 197k | for (auto OV : I->operand_values()) { |
4607 | 197k | // isOutOfScope operands cannot be uniform instructions. |
4608 | 197k | if (isOutOfScope(OV)) |
4609 | 114k | continue; |
4610 | 82.8k | // First order recurrence Phi's should typically be considered |
4611 | 82.8k | // non-uniform. |
4612 | 82.8k | auto *OP = dyn_cast<PHINode>(OV); |
4613 | 82.8k | if (OP && Legal->isFirstOrderRecurrence(OP)39.5k ) |
4614 | 0 | continue; |
4615 | 82.8k | // If all the users of the operand are uniform, then add the |
4616 | 82.8k | // operand into the uniform worklist. |
4617 | 82.8k | auto *OI = cast<Instruction>(OV); |
4618 | 180k | if (llvm::all_of(OI->users(), [&](User *U) -> bool 82.8k { |
4619 | 180k | auto *J = cast<Instruction>(U); |
4620 | 180k | return Worklist.count(J) || |
4621 | 180k | (74.2k OI == getLoadStorePointerOperand(J)74.2k && |
4622 | 74.2k | isUniformDecision(J, VF)282 ); |
4623 | 180k | })) { |
4624 | 8.88k | Worklist.insert(OI); |
4625 | 8.88k | LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); |
4626 | 8.88k | } |
4627 | 82.8k | } |
4628 | 82.6k | } |
4629 | 32.3k | |
4630 | 32.3k | // Returns true if Ptr is the pointer operand of a memory access instruction |
4631 | 32.3k | // I, and I is known to not require scalarization. |
4632 | 32.3k | auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { |
4633 | 19.3k | return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF)3.89k ; |
4634 | 19.3k | }; |
4635 | 32.3k | |
4636 | 32.3k | // For an instruction to be added into Worklist above, all its users inside |
4637 | 32.3k | // the loop should also be in Worklist. However, this condition cannot be |
4638 | 32.3k | // true for phi nodes that form a cyclic dependence. We must process phi |
4639 | 32.3k | // nodes separately. An induction variable will remain uniform if all users |
4640 | 32.3k | // of the induction variable and induction variable update remain uniform. |
4641 | 32.3k | // The code below handles both pointer and non-pointer induction variables. |
4642 | 39.0k | for (auto &Induction : *Legal->getInductionVars()) { |
4643 | 39.0k | auto *Ind = Induction.first; |
4644 | 39.0k | auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); |
4645 | 39.0k | |
4646 | 39.0k | // Determine if all users of the induction variable are uniform after |
4647 | 39.0k | // vectorization. |
4648 | 83.6k | auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { |
4649 | 83.6k | auto *I = cast<Instruction>(U); |
4650 | 83.6k | return I == IndUpdate || !TheLoop->contains(I)47.0k || Worklist.count(I)46.8k || |
4651 | 83.6k | isVectorizedMemAccessUse(I, Ind)8.65k ; |
4652 | 83.6k | }); |
4653 | 39.0k | if (!UniformInd) |
4654 | 5.77k | continue; |
4655 | 33.2k | |
4656 | 33.2k | // Determine if all users of the induction variable update instruction are |
4657 | 33.2k | // uniform after vectorization. |
4658 | 33.2k | auto UniformIndUpdate = |
4659 | 75.9k | llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { |
4660 | 75.9k | auto *I = cast<Instruction>(U); |
4661 | 75.9k | return I == Ind || !TheLoop->contains(I)42.7k || Worklist.count(I)39.4k || |
4662 | 75.9k | isVectorizedMemAccessUse(I, IndUpdate)10.7k ; |
4663 | 75.9k | }); |
4664 | 33.2k | if (!UniformIndUpdate) |
4665 | 9.82k | continue; |
4666 | 23.4k | |
4667 | 23.4k | // The induction variable and its update instruction will remain uniform. |
4668 | 23.4k | Worklist.insert(Ind); |
4669 | 23.4k | Worklist.insert(IndUpdate); |
4670 | 23.4k | LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); |
4671 | 23.4k | LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate |
4672 | 23.4k | << "\n"); |
4673 | 23.4k | } |
4674 | 32.3k | |
4675 | 32.3k | Uniforms[VF].insert(Worklist.begin(), Worklist.end()); |
4676 | 32.3k | } |
4677 | | |
4678 | 19.9k | Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { |
4679 | 19.9k | if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()3.16k ) { |
4680 | 2 | // TODO: It may by useful to do since it's still likely to be dynamically |
4681 | 2 | // uniform if the target can skip. |
4682 | 2 | LLVM_DEBUG( |
4683 | 2 | dbgs() << "LV: Not inserting runtime ptr check for divergent target"); |
4684 | 2 | |
4685 | 2 | ORE->emit( |
4686 | 2 | createMissedAnalysis("CantVersionLoopWithDivergentTarget") |
4687 | 2 | << "runtime pointer checks needed. Not enabled for divergent target"); |
4688 | 2 | |
4689 | 2 | return None; |
4690 | 2 | } |
4691 | 19.9k | |
4692 | 19.9k | unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); |
4693 | 19.9k | if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. |
4694 | 19.7k | return computeFeasibleMaxVF(OptForSize, TC); |
4695 | 212 | |
4696 | 212 | if (Legal->getRuntimePointerChecking()->Need) { |
4697 | 37 | ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") |
4698 | 37 | << "runtime pointer checks needed. Enable vectorization of this " |
4699 | 37 | "loop with '#pragma clang loop vectorize(enable)' when " |
4700 | 37 | "compiling with -Os/-Oz"); |
4701 | 37 | LLVM_DEBUG( |
4702 | 37 | dbgs() |
4703 | 37 | << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); |
4704 | 37 | return None; |
4705 | 37 | } |
4706 | 175 | |
4707 | 175 | if (!PSE.getUnionPredicate().getPredicates().empty()) { |
4708 | 9 | ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") |
4709 | 9 | << "runtime SCEV checks needed. Enable vectorization of this " |
4710 | 9 | "loop with '#pragma clang loop vectorize(enable)' when " |
4711 | 9 | "compiling with -Os/-Oz"); |
4712 | 9 | LLVM_DEBUG( |
4713 | 9 | dbgs() |
4714 | 9 | << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n"); |
4715 | 9 | return None; |
4716 | 9 | } |
4717 | 166 | |
4718 | 166 | // FIXME: Avoid specializing for stride==1 instead of bailing out. |
4719 | 166 | if (!Legal->getLAI()->getSymbolicStrides().empty()) { |
4720 | 1 | ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") |
4721 | 1 | << "runtime stride == 1 checks needed. Enable vectorization of " |
4722 | 1 | "this loop with '#pragma clang loop vectorize(enable)' when " |
4723 | 1 | "compiling with -Os/-Oz"); |
4724 | 1 | LLVM_DEBUG( |
4725 | 1 | dbgs() |
4726 | 1 | << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n"); |
4727 | 1 | return None; |
4728 | 1 | } |
4729 | 165 | |
4730 | 165 | // If we optimize the program for size, avoid creating the tail loop. |
4731 | 165 | LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); |
4732 | 165 | |
4733 | 165 | if (TC == 1) { |
4734 | 5 | ORE->emit(createMissedAnalysis("SingleIterationLoop") |
4735 | 5 | << "loop trip count is one, irrelevant for vectorization"); |
4736 | 5 | LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n"); |
4737 | 5 | return None; |
4738 | 5 | } |
4739 | 160 | |
4740 | 160 | // Record that scalar epilogue is not allowed. |
4741 | 160 | LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); |
4742 | 160 | |
4743 | 160 | IsScalarEpilogueAllowed = !OptForSize; |
4744 | 160 | |
4745 | 160 | // We don't create an epilogue when optimizing for size. |
4746 | 160 | // Invalidate interleave groups that require an epilogue if we can't mask |
4747 | 160 | // the interleave-group. |
4748 | 160 | if (!useMaskedInterleavedAccesses(TTI)) |
4749 | 153 | InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); |
4750 | 160 | |
4751 | 160 | unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); |
4752 | 160 | |
4753 | 160 | if (TC > 0 && TC % MaxVF == 0129 ) { |
4754 | 85 | LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); |
4755 | 85 | return MaxVF; |
4756 | 85 | } |
4757 | 75 | |
4758 | 75 | // If we don't know the precise trip count, or if the trip count that we |
4759 | 75 | // found modulo the vectorization factor is not zero, try to fold the tail |
4760 | 75 | // by masking. |
4761 | 75 | // FIXME: look for a smaller MaxVF that does divide TC rather than masking. |
4762 | 75 | if (Legal->canFoldTailByMasking()) { |
4763 | 26 | FoldTailByMasking = true; |
4764 | 26 | return MaxVF; |
4765 | 26 | } |
4766 | 49 | |
4767 | 49 | if (TC == 0) { |
4768 | 16 | ORE->emit( |
4769 | 16 | createMissedAnalysis("UnknownLoopCountComplexCFG") |
4770 | 16 | << "unable to calculate the loop count due to complex control flow"); |
4771 | 16 | return None; |
4772 | 16 | } |
4773 | 33 | |
4774 | 33 | ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") |
4775 | 33 | << "cannot optimize for size and vectorize at the same time. " |
4776 | 33 | "Enable vectorization of this loop with '#pragma clang loop " |
4777 | 33 | "vectorize(enable)' when compiling with -Os/-Oz"); |
4778 | 33 | return None; |
4779 | 33 | } |
4780 | | |
4781 | | unsigned |
4782 | | LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, |
4783 | 19.9k | unsigned ConstTripCount) { |
4784 | 19.9k | MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); |
4785 | 19.9k | unsigned SmallestType, WidestType; |
4786 | 19.9k | std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); |
4787 | 19.9k | unsigned WidestRegister = TTI.getRegisterBitWidth(true); |
4788 | 19.9k | |
4789 | 19.9k | // Get the maximum safe dependence distance in bits computed by LAA. |
4790 | 19.9k | // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from |
4791 | 19.9k | // the memory accesses that is most restrictive (involved in the smallest |
4792 | 19.9k | // dependence distance). |
4793 | 19.9k | unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); |
4794 | 19.9k | |
4795 | 19.9k | WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); |
4796 | 19.9k | |
4797 | 19.9k | unsigned MaxVectorSize = WidestRegister / WidestType; |
4798 | 19.9k | |
4799 | 19.9k | LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType |
4800 | 19.9k | << " / " << WidestType << " bits.\n"); |
4801 | 19.9k | LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " |
4802 | 19.9k | << WidestRegister << " bits.\n"); |
4803 | 19.9k | |
4804 | 19.9k | assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" |
4805 | 19.9k | " into one vector!"); |
4806 | 19.9k | if (MaxVectorSize == 0) { |
4807 | 76 | LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); |
4808 | 76 | MaxVectorSize = 1; |
4809 | 76 | return MaxVectorSize; |
4810 | 19.8k | } else if (ConstTripCount && ConstTripCount < MaxVectorSize11.0k && |
4811 | 19.8k | isPowerOf2_32(ConstTripCount)13 ) { |
4812 | 11 | // We need to clamp the VF to be the ConstTripCount. There is no point in |
4813 | 11 | // choosing a higher viable VF as done in the loop below. |
4814 | 11 | LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " |
4815 | 11 | << ConstTripCount << "\n"); |
4816 | 11 | MaxVectorSize = ConstTripCount; |
4817 | 11 | return MaxVectorSize; |
4818 | 11 | } |
4819 | 19.8k | |
4820 | 19.8k | unsigned MaxVF = MaxVectorSize; |
4821 | 19.8k | if (TTI.shouldMaximizeVectorBandwidth(OptForSize) || |
4822 | 19.8k | (19.8k MaximizeBandwidth19.8k && !OptForSize0 )) { |
4823 | 2 | // Collect all viable vectorization factors larger than the default MaxVF |
4824 | 2 | // (i.e. MaxVectorSize). |
4825 | 2 | SmallVector<unsigned, 8> VFs; |
4826 | 2 | unsigned NewMaxVectorSize = WidestRegister / SmallestType; |
4827 | 3 | for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 21 ) |
4828 | 1 | VFs.push_back(VS); |
4829 | 2 | |
4830 | 2 | // For each VF calculate its register usage. |
4831 | 2 | auto RUs = calculateRegisterUsage(VFs); |
4832 | 2 | |
4833 | 2 | // Select the largest VF which doesn't require more registers than existing |
4834 | 2 | // ones. |
4835 | 2 | unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); |
4836 | 2 | for (int i = RUs.size() - 1; i >= 0; --i0 ) { |
4837 | 1 | if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { |
4838 | 1 | MaxVF = VFs[i]; |
4839 | 1 | break; |
4840 | 1 | } |
4841 | 1 | } |
4842 | 2 | if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { |
4843 | 2 | if (MaxVF < MinVF) { |
4844 | 0 | LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF |
4845 | 0 | << ") with target's minimum: " << MinVF << '\n'); |
4846 | 0 | MaxVF = MinVF; |
4847 | 0 | } |
4848 | 2 | } |
4849 | 2 | } |
4850 | 19.8k | return MaxVF; |
4851 | 19.8k | } |
4852 | | |
4853 | | VectorizationFactor |
4854 | 19.1k | LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { |
4855 | 19.1k | float Cost = expectedCost(1).first; |
4856 | 19.1k | const float ScalarCost = Cost; |
4857 | 19.1k | unsigned Width = 1; |
4858 | 19.1k | LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); |
4859 | 19.1k | |
4860 | 19.1k | bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; |
4861 | 19.1k | if (ForceVectorization && MaxVF > 117 ) { |
4862 | 17 | // Ignore scalar width, because the user explicitly wants vectorization. |
4863 | 17 | // Initialize cost to max so that VF = 2 is, at least, chosen during cost |
4864 | 17 | // evaluation. |
4865 | 17 | Cost = std::numeric_limits<float>::max(); |
4866 | 17 | } |
4867 | 19.1k | |
4868 | 50.9k | for (unsigned i = 2; i <= MaxVF; i *= 231.7k ) { |
4869 | 31.7k | // Notice that the vector loop needs to be executed less times, so |
4870 | 31.7k | // we need to divide the cost of the vector loops by the width of |
4871 | 31.7k | // the vector elements. |
4872 | 31.7k | VectorizationCostTy C = expectedCost(i); |
4873 | 31.7k | float VectorCost = C.first / (float)i; |
4874 | 31.7k | LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i |
4875 | 31.7k | << " costs: " << (int)VectorCost << ".\n"); |
4876 | 31.7k | if (!C.second && !ForceVectorization2.93k ) { |
4877 | 2.93k | LLVM_DEBUG( |
4878 | 2.93k | dbgs() << "LV: Not considering vector loop of width " << i |
4879 | 2.93k | << " because it will not generate any vector instructions.\n"); |
4880 | 2.93k | continue; |
4881 | 2.93k | } |
4882 | 28.8k | if (VectorCost < Cost) { |
4883 | 23.9k | Cost = VectorCost; |
4884 | 23.9k | Width = i; |
4885 | 23.9k | } |
4886 | 28.8k | } |
4887 | 19.1k | |
4888 | 19.1k | if (!EnableCondStoresVectorization && NumPredStores0 ) { |
4889 | 0 | ORE->emit(createMissedAnalysis("ConditionalStore") |
4890 | 0 | << "store that is conditionally executed prevents vectorization"); |
4891 | 0 | LLVM_DEBUG( |
4892 | 0 | dbgs() << "LV: No vectorization. There are conditional stores.\n"); |
4893 | 0 | Width = 1; |
4894 | 0 | Cost = ScalarCost; |
4895 | 0 | } |
4896 | 19.1k | |
4897 | 19.1k | LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() |
4898 | 19.1k | << "LV: Vectorization seems to be not beneficial, " |
4899 | 19.1k | << "but was forced by a user.\n"); |
4900 | 19.1k | LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); |
4901 | 19.1k | VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; |
4902 | 19.1k | return Factor; |
4903 | 19.1k | } |
4904 | | |
4905 | | std::pair<unsigned, unsigned> |
4906 | 19.9k | LoopVectorizationCostModel::getSmallestAndWidestTypes() { |
4907 | 19.9k | unsigned MinWidth = -1U; |
4908 | 19.9k | unsigned MaxWidth = 8; |
4909 | 19.9k | const DataLayout &DL = TheFunction->getParent()->getDataLayout(); |
4910 | 19.9k | |
4911 | 19.9k | // For each block. |
4912 | 22.0k | for (BasicBlock *BB : TheLoop->blocks()) { |
4913 | 22.0k | // For each instruction in the loop. |
4914 | 251k | for (Instruction &I : BB->instructionsWithoutDebug()) { |
4915 | 251k | Type *T = I.getType(); |
4916 | 251k | |
4917 | 251k | // Skip ignored values. |
4918 | 251k | if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) |
4919 | 9 | continue; |
4920 | 251k | |
4921 | 251k | // Only examine Loads, Stores and PHINodes. |
4922 | 251k | if (!isa<LoadInst>(I) && !isa<StoreInst>(I)231k && !isa<PHINode>(I)211k ) |
4923 | 183k | continue; |
4924 | 68.4k | |
4925 | 68.4k | // Examine PHI nodes that are reduction variables. Update the type to |
4926 | 68.4k | // account for the recurrence type. |
4927 | 68.4k | if (auto *PN = dyn_cast<PHINode>(&I)) { |
4928 | 27.9k | if (!Legal->isReductionVariable(PN)) |
4929 | 24.4k | continue; |
4930 | 3.43k | RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; |
4931 | 3.43k | T = RdxDesc.getRecurrenceType(); |
4932 | 3.43k | } |
4933 | 68.4k | |
4934 | 68.4k | // Examine the stored values. |
4935 | 68.4k | if (auto *43.9k ST43.9k = dyn_cast<StoreInst>(&I)) |
4936 | 20.7k | T = ST->getValueOperand()->getType(); |
4937 | 43.9k | |
4938 | 43.9k | // Ignore loaded pointer types and stored pointer types that are not |
4939 | 43.9k | // vectorizable. |
4940 | 43.9k | // |
4941 | 43.9k | // FIXME: The check here attempts to predict whether a load or store will |
4942 | 43.9k | // be vectorized. We only know this for certain after a VF has |
4943 | 43.9k | // been selected. Here, we assume that if an access can be |
4944 | 43.9k | // vectorized, it will be. We should also look at extending this |
4945 | 43.9k | // optimization to non-pointer types. |
4946 | 43.9k | // |
4947 | 43.9k | if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I)627 && |
4948 | 43.9k | !isAccessInterleaved(&I)287 && !isLegalGatherOrScatter(&I)258 ) |
4949 | 258 | continue; |
4950 | 43.7k | |
4951 | 43.7k | MinWidth = std::min(MinWidth, |
4952 | 43.7k | (unsigned)DL.getTypeSizeInBits(T->getScalarType())); |
4953 | 43.7k | MaxWidth = std::max(MaxWidth, |
4954 | 43.7k | (unsigned)DL.getTypeSizeInBits(T->getScalarType())); |
4955 | 43.7k | } |
4956 | 22.0k | } |
4957 | 19.9k | |
4958 | 19.9k | return {MinWidth, MaxWidth}; |
4959 | 19.9k | } |
4960 | | |
4961 | | unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, |
4962 | | unsigned VF, |
4963 | 19.8k | unsigned LoopCost) { |
4964 | 19.8k | // -- The interleave heuristics -- |
4965 | 19.8k | // We interleave the loop in order to expose ILP and reduce the loop overhead. |
4966 | 19.8k | // There are many micro-architectural considerations that we can't predict |
4967 | 19.8k | // at this level. For example, frontend pressure (on decode or fetch) due to |
4968 | 19.8k | // code size, or the number and capabilities of the execution ports. |
4969 | 19.8k | // |
4970 | 19.8k | // We use the following heuristics to select the interleave count: |
4971 | 19.8k | // 1. If the code has reductions, then we interleave to break the cross |
4972 | 19.8k | // iteration dependency. |
4973 | 19.8k | // 2. If the loop is really small, then we interleave to reduce the loop |
4974 | 19.8k | // overhead. |
4975 | 19.8k | // 3. We don't interleave if we think that we will spill registers to memory |
4976 | 19.8k | // due to the increased register pressure. |
4977 | 19.8k | |
4978 | 19.8k | // When we optimize for size, we don't interleave. |
4979 | 19.8k | if (OptForSize) |
4980 | 111 | return 1; |
4981 | 19.7k | |
4982 | 19.7k | // We used the distance for the interleave count. |
4983 | 19.7k | if (Legal->getMaxSafeDepDistBytes() != -1U) |
4984 | 140 | return 1; |
4985 | 19.6k | |
4986 | 19.6k | // Do not interleave loops with a relatively small trip count. |
4987 | 19.6k | unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); |
4988 | 19.6k | if (TC > 1 && TC < TinyTripCountInterleaveThreshold10.8k ) |
4989 | 496 | return 1; |
4990 | 19.1k | |
4991 | 19.1k | unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); |
4992 | 19.1k | LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters |
4993 | 19.1k | << " registers\n"); |
4994 | 19.1k | |
4995 | 19.1k | if (VF == 1) { |
4996 | 2.91k | if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) |
4997 | 1 | TargetNumRegisters = ForceTargetNumScalarRegs; |
4998 | 16.1k | } else { |
4999 | 16.1k | if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) |
5000 | 0 | TargetNumRegisters = ForceTargetNumVectorRegs; |
5001 | 16.1k | } |
5002 | 19.1k | |
5003 | 19.1k | RegisterUsage R = calculateRegisterUsage({VF})[0]; |
5004 | 19.1k | // We divide by these constants so assume that we have at least one |
5005 | 19.1k | // instruction that uses at least one register. |
5006 | 19.1k | R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); |
5007 | 19.1k | |
5008 | 19.1k | // We calculate the interleave count using the following formula. |
5009 | 19.1k | // Subtract the number of loop invariants from the number of available |
5010 | 19.1k | // registers. These registers are used by all of the interleaved instances. |
5011 | 19.1k | // Next, divide the remaining registers by the number of registers that is |
5012 | 19.1k | // required by the loop, in order to estimate how many parallel instances |
5013 | 19.1k | // fit without causing spills. All of this is rounded down if necessary to be |
5014 | 19.1k | // a power of two. We want power of two interleave count to simplify any |
5015 | 19.1k | // addressing operations or alignment considerations. |
5016 | 19.1k | // We also want power of two interleave counts to ensure that the induction |
5017 | 19.1k | // variable of the vector loop wraps to zero, when tail is folded by masking; |
5018 | 19.1k | // this currently happens when OptForSize, in which case IC is set to 1 above. |
5019 | 19.1k | unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / |
5020 | 19.1k | R.MaxLocalUsers); |
5021 | 19.1k | |
5022 | 19.1k | // Don't count the induction variable as interleaved. |
5023 | 19.1k | if (EnableIndVarRegisterHeur) |
5024 | 19.1k | IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / |
5025 | 19.1k | std::max(1U, (R.MaxLocalUsers - 1))); |
5026 | 19.1k | |
5027 | 19.1k | // Clamp the interleave ranges to reasonable counts. |
5028 | 19.1k | unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); |
5029 | 19.1k | |
5030 | 19.1k | // Check if the user has overridden the max. |
5031 | 19.1k | if (VF == 1) { |
5032 | 2.91k | if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) |
5033 | 1 | MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; |
5034 | 16.1k | } else { |
5035 | 16.1k | if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) |
5036 | 0 | MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; |
5037 | 16.1k | } |
5038 | 19.1k | |
5039 | 19.1k | // If we did not calculate the cost for VF (because the user selected the VF) |
5040 | 19.1k | // then we calculate the cost of VF here. |
5041 | 19.1k | if (LoopCost == 0) |
5042 | 583 | LoopCost = expectedCost(VF).first; |
5043 | 19.1k | |
5044 | 19.1k | assert(LoopCost && "Non-zero loop cost expected"); |
5045 | 19.1k | |
5046 | 19.1k | // Clamp the calculated IC to be between the 1 and the max interleave count |
5047 | 19.1k | // that the target allows. |
5048 | 19.1k | if (IC > MaxInterleaveCount) |
5049 | 18.5k | IC = MaxInterleaveCount; |
5050 | 605 | else if (IC < 1) |
5051 | 118 | IC = 1; |
5052 | 19.1k | |
5053 | 19.1k | // Interleave if we vectorized this loop and there is a reduction that could |
5054 | 19.1k | // benefit from interleaving. |
5055 | 19.1k | if (VF > 1 && !Legal->getReductionVars()->empty()16.1k ) { |
5056 | 2.17k | LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); |
5057 | 2.17k | return IC; |
5058 | 2.17k | } |
5059 | 16.9k | |
5060 | 16.9k | // Note that if we've already vectorized the loop we will have done the |
5061 | 16.9k | // runtime check and so interleaving won't require further checks. |
5062 | 16.9k | bool InterleavingRequiresRuntimePointerCheck = |
5063 | 16.9k | (VF == 1 && Legal->getRuntimePointerChecking()->Need2.91k ); |
5064 | 16.9k | |
5065 | 16.9k | // We want to interleave small loops in order to reduce the loop overhead and |
5066 | 16.9k | // potentially expose ILP opportunities. |
5067 | 16.9k | LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); |
5068 | 16.9k | if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost16.4k ) { |
5069 | 15.6k | // We assume that the cost overhead is 1 and we use the cost model |
5070 | 15.6k | // to estimate the cost of the loop and interleave until the cost of the |
5071 | 15.6k | // loop overhead is about 5% of the cost of the loop. |
5072 | 15.6k | unsigned SmallIC = |
5073 | 15.6k | std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); |
5074 | 15.6k | |
5075 | 15.6k | // Interleave until store/load ports (estimated by max interleave count) are |
5076 | 15.6k | // saturated. |
5077 | 15.6k | unsigned NumStores = Legal->getNumStores(); |
5078 | 15.6k | unsigned NumLoads = Legal->getNumLoads(); |
5079 | 15.6k | unsigned StoresIC = IC / (NumStores ? NumStores14.6k : 1982 ); |
5080 | 15.6k | unsigned LoadsIC = IC / (NumLoads ? NumLoads4.58k : 111.0k ); |
5081 | 15.6k | |
5082 | 15.6k | // If we have a scalar reduction (vector reductions are already dealt with |
5083 | 15.6k | // by this point), we can increase the critical path length if the loop |
5084 | 15.6k | // we're interleaving is inside another loop. Limit, by default to 2, so the |
5085 | 15.6k | // critical path only gets increased by one reduction operation. |
5086 | 15.6k | if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1384 ) { |
5087 | 150 | unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); |
5088 | 150 | SmallIC = std::min(SmallIC, F); |
5089 | 150 | StoresIC = std::min(StoresIC, F); |
5090 | 150 | LoadsIC = std::min(LoadsIC, F); |
5091 | 150 | } |
5092 | 15.6k | |
5093 | 15.6k | if (EnableLoadStoreRuntimeInterleave && |
5094 | 15.6k | std::max(StoresIC, LoadsIC) > SmallIC) { |
5095 | 3.07k | LLVM_DEBUG( |
5096 | 3.07k | dbgs() << "LV: Interleaving to saturate store or load ports.\n"); |
5097 | 3.07k | return std::max(StoresIC, LoadsIC); |
5098 | 3.07k | } |
5099 | 12.5k | |
5100 | 12.5k | LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); |
5101 | 12.5k | return SmallIC; |
5102 | 12.5k | } |
5103 | 1.31k | |
5104 | 1.31k | // Interleave if this is a large loop (small loops are already dealt with by |
5105 | 1.31k | // this point) that could benefit from interleaving. |
5106 | 1.31k | bool HasReductions = !Legal->getReductionVars()->empty(); |
5107 | 1.31k | if (TTI.enableAggressiveInterleaving(HasReductions)) { |
5108 | 1 | LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); |
5109 | 1 | return IC; |
5110 | 1 | } |
5111 | 1.31k | |
5112 | 1.31k | LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); |
5113 | 1.31k | return 1; |
5114 | 1.31k | } |
5115 | | |
5116 | | SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> |
5117 | 19.1k | LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { |
5118 | 19.1k | // This function calculates the register usage by measuring the highest number |
5119 | 19.1k | // of values that are alive at a single location. Obviously, this is a very |
5120 | 19.1k | // rough estimation. We scan the loop in a topological order in order and |
5121 | 19.1k | // assign a number to each instruction. We use RPO to ensure that defs are |
5122 | 19.1k | // met before their users. We assume that each instruction that has in-loop |
5123 | 19.1k | // users starts an interval. We record every time that an in-loop value is |
5124 | 19.1k | // used, so we have a list of the first and last occurrences of each |
5125 | 19.1k | // instruction. Next, we transpose this data structure into a multi map that |
5126 | 19.1k | // holds the list of intervals that *end* at a specific location. This multi |
5127 | 19.1k | // map allows us to perform a linear search. We scan the instructions linearly |
5128 | 19.1k | // and record each time that a new interval starts, by placing it in a set. |
5129 | 19.1k | // If we find this value in the multi-map then we remove it from the set. |
5130 | 19.1k | // The max register usage is the maximum size of the set. |
5131 | 19.1k | // We also search for instructions that are defined outside the loop, but are |
5132 | 19.1k | // used inside the loop. We need this number separately from the max-interval |
5133 | 19.1k | // usage number because when we unroll, loop-invariant values do not take |
5134 | 19.1k | // more register. |
5135 | 19.1k | LoopBlocksDFS DFS(TheLoop); |
5136 | 19.1k | DFS.perform(LI); |
5137 | 19.1k | |
5138 | 19.1k | RegisterUsage RU; |
5139 | 19.1k | |
5140 | 19.1k | // Each 'key' in the map opens a new interval. The values |
5141 | 19.1k | // of the map are the index of the 'last seen' usage of the |
5142 | 19.1k | // instruction that is the key. |
5143 | 19.1k | using IntervalMap = DenseMap<Instruction *, unsigned>; |
5144 | 19.1k | |
5145 | 19.1k | // Maps instruction to its index. |
5146 | 19.1k | SmallVector<Instruction *, 64> IdxToInstr; |
5147 | 19.1k | // Marks the end of each interval. |
5148 | 19.1k | IntervalMap EndPoint; |
5149 | 19.1k | // Saves the list of instruction indices that are used in the loop. |
5150 | 19.1k | SmallPtrSet<Instruction *, 8> Ends; |
5151 | 19.1k | // Saves the list of values that are used in the loop but are |
5152 | 19.1k | // defined outside the loop, such as arguments and constants. |
5153 | 19.1k | SmallPtrSet<Value *, 8> LoopInvariants; |
5154 | 19.1k | |
5155 | 20.2k | for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { |
5156 | 227k | for (Instruction &I : BB->instructionsWithoutDebug()) { |
5157 | 227k | IdxToInstr.push_back(&I); |
5158 | 227k | |
5159 | 227k | // Save the end location of each USE. |
5160 | 455k | for (Value *U : I.operands()) { |
5161 | 455k | auto *Instr = dyn_cast<Instruction>(U); |
5162 | 455k | |
5163 | 455k | // Ignore non-instruction values such as arguments, constants, etc. |
5164 | 455k | if (!Instr) |
5165 | 162k | continue; |
5166 | 293k | |
5167 | 293k | // If this instruction is outside the loop then record it and continue. |
5168 | 293k | if (!TheLoop->contains(Instr)) { |
5169 | 33.3k | LoopInvariants.insert(Instr); |
5170 | 33.3k | continue; |
5171 | 33.3k | } |
5172 | 259k | |
5173 | 259k | // Overwrite previous end points. |
5174 | 259k | EndPoint[Instr] = IdxToInstr.size(); |
5175 | 259k | Ends.insert(Instr); |
5176 | 259k | } |
5177 | 227k | } |
5178 | 20.2k | } |
5179 | 19.1k | |
5180 | 19.1k | // Saves the list of intervals that end with the index in 'key'. |
5181 | 19.1k | using InstrList = SmallVector<Instruction *, 2>; |
5182 | 19.1k | DenseMap<unsigned, InstrList> TransposeEnds; |
5183 | 19.1k | |
5184 | 19.1k | // Transpose the EndPoints to a list of values that end at each index. |
5185 | 19.1k | for (auto &Interval : EndPoint) |
5186 | 187k | TransposeEnds[Interval.second].push_back(Interval.first); |
5187 | 19.1k | |
5188 | 19.1k | SmallPtrSet<Instruction *, 8> OpenIntervals; |
5189 | 19.1k | |
5190 | 19.1k | // Get the size of the widest register. |
5191 | 19.1k | unsigned MaxSafeDepDist = -1U; |
5192 | 19.1k | if (Legal->getMaxSafeDepDistBytes() != -1U) |
5193 | 0 | MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; |
5194 | 19.1k | unsigned WidestRegister = |
5195 | 19.1k | std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); |
5196 | 19.1k | const DataLayout &DL = TheFunction->getParent()->getDataLayout(); |
5197 | 19.1k | |
5198 | 19.1k | SmallVector<RegisterUsage, 8> RUs(VFs.size()); |
5199 | 19.1k | SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); |
5200 | 19.1k | |
5201 | 19.1k | LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); |
5202 | 19.1k | |
5203 | 19.1k | // A lambda that gets the register usage for the given type and VF. |
5204 | 228k | auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { |
5205 | 228k | if (Ty->isTokenTy()) |
5206 | 1 | return 0U; |
5207 | 228k | unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); |
5208 | 228k | return std::max<unsigned>(1, VF * TypeSize / WidestRegister); |
5209 | 228k | }; |
5210 | 19.1k | |
5211 | 246k | for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i227k ) { |
5212 | 227k | Instruction *I = IdxToInstr[i]; |
5213 | 227k | |
5214 | 227k | // Remove all of the instructions that end at this location. |
5215 | 227k | InstrList &List = TransposeEnds[i]; |
5216 | 227k | for (Instruction *ToRemove : List) |
5217 | 168k | OpenIntervals.erase(ToRemove); |
5218 | 227k | |
5219 | 227k | // Ignore instructions that are never used within the loop. |
5220 | 227k | if (Ends.find(I) == Ends.end()) |
5221 | 39.1k | continue; |
5222 | 187k | |
5223 | 187k | // Skip ignored values. |
5224 | 187k | if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) |
5225 | 4 | continue; |
5226 | 187k | |
5227 | 187k | // For each VF find the maximum usage of registers. |
5228 | 375k | for (unsigned j = 0, e = VFs.size(); 187k j < e; ++j187k ) { |
5229 | 187k | if (VFs[j] == 1) { |
5230 | 42.4k | MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); |
5231 | 42.4k | continue; |
5232 | 42.4k | } |
5233 | 145k | collectUniformsAndScalars(VFs[j]); |
5234 | 145k | // Count the number of live intervals. |
5235 | 145k | unsigned RegUsage = 0; |
5236 | 354k | for (auto Inst : OpenIntervals) { |
5237 | 354k | // Skip ignored values for VF > 1. |
5238 | 354k | if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || |
5239 | 354k | isScalarAfterVectorization(Inst, VFs[j])354k ) |
5240 | 150k | continue; |
5241 | 204k | RegUsage += GetRegUsage(Inst->getType(), VFs[j]); |
5242 | 204k | } |
5243 | 145k | MaxUsages[j] = std::max(MaxUsages[j], RegUsage); |
5244 | 145k | } |
5245 | 187k | |
5246 | 187k | LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " |
5247 | 187k | << OpenIntervals.size() << '\n'); |
5248 | 187k | |
5249 | 187k | // Add the current instruction to the list of open intervals. |
5250 | 187k | OpenIntervals.insert(I); |
5251 | 187k | } |
5252 | 19.1k | |
5253 | 38.2k | for (unsigned i = 0, e = VFs.size(); i < e; ++i19.1k ) { |
5254 | 19.1k | unsigned Invariant = 0; |
5255 | 19.1k | if (VFs[i] == 1) |
5256 | 2.91k | Invariant = LoopInvariants.size(); |
5257 | 16.1k | else { |
5258 | 16.1k | for (auto Inst : LoopInvariants) |
5259 | 24.0k | Invariant += GetRegUsage(Inst->getType(), VFs[i]); |
5260 | 16.1k | } |
5261 | 19.1k | |
5262 | 19.1k | LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); |
5263 | 19.1k | LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); |
5264 | 19.1k | LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant |
5265 | 19.1k | << '\n'); |
5266 | 19.1k | |
5267 | 19.1k | RU.LoopInvariantRegs = Invariant; |
5268 | 19.1k | RU.MaxLocalUsers = MaxUsages[i]; |
5269 | 19.1k | RUs[i] = RU; |
5270 | 19.1k | } |
5271 | 19.1k | |
5272 | 19.1k | return RUs; |
5273 | 19.1k | } |
5274 | | |
5275 | 4.67k | bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ |
5276 | 4.67k | // TODO: Cost model for emulated masked load/store is completely |
5277 | 4.67k | // broken. This hack guides the cost model to use an artificially |
5278 | 4.67k | // high enough value to practically disable vectorization with such |
5279 | 4.67k | // operations, except where previously deployed legality hack allowed |
5280 | 4.67k | // using very low cost values. This is to avoid regressions coming simply |
5281 | 4.67k | // from moving "masked load/store" check from legality to cost model. |
5282 | 4.67k | // Masked Load/Gather emulation was previously never allowed. |
5283 | 4.67k | // Limited number of Masked Store/Scatter emulation was allowed. |
5284 | 4.67k | assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); |
5285 | 4.67k | return isa<LoadInst>(I) || |
5286 | 4.67k | (2.19k isa<StoreInst>(I)2.19k && |
5287 | 2.19k | NumPredStores > NumberOfStoresToPredicate2.12k ); |
5288 | 4.67k | } |
5289 | | |
5290 | 32.4k | void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { |
5291 | 32.4k | // If we aren't vectorizing the loop, or if we've already collected the |
5292 | 32.4k | // instructions to scalarize, there's nothing to do. Collection may already |
5293 | 32.4k | // have occurred if we have a user-selected VF and are now computing the |
5294 | 32.4k | // expected cost for interleaving. |
5295 | 32.4k | if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()32.3k ) |
5296 | 36 | return; |
5297 | 32.3k | |
5298 | 32.3k | // Initialize a mapping for VF in InstsToScalalarize. If we find that it's |
5299 | 32.3k | // not profitable to scalarize any instructions, the presence of VF in the |
5300 | 32.3k | // map will indicate that we've analyzed it already. |
5301 | 32.3k | ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; |
5302 | 32.3k | |
5303 | 32.3k | // Find all the instructions that are scalar with predication in the loop and |
5304 | 32.3k | // determine if it would be better to not if-convert the blocks they are in. |
5305 | 32.3k | // If so, we also record the instructions to scalarize. |
5306 | 35.8k | for (BasicBlock *BB : TheLoop->blocks()) { |
5307 | 35.8k | if (!blockNeedsPredication(BB)) |
5308 | 33.4k | continue; |
5309 | 2.33k | for (Instruction &I : *BB) |
5310 | 12.3k | if (isScalarWithPredication(&I)) { |
5311 | 2.33k | ScalarCostsTy ScalarCosts; |
5312 | 2.33k | // Do not apply discount logic if hacked cost is needed |
5313 | 2.33k | // for emulated masked memrefs. |
5314 | 2.33k | if (!useEmulatedMaskMemRefHack(&I) && |
5315 | 2.33k | computePredInstDiscount(&I, ScalarCosts, VF) >= 0355 ) |
5316 | 323 | ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); |
5317 | 2.33k | // Remember that BB will remain after vectorization. |
5318 | 2.33k | PredicatedBBsAfterVectorization.insert(BB); |
5319 | 2.33k | } |
5320 | 2.33k | } |
5321 | 32.3k | } |
5322 | | |
5323 | | int LoopVectorizationCostModel::computePredInstDiscount( |
5324 | | Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, |
5325 | 355 | unsigned VF) { |
5326 | 355 | assert(!isUniformAfterVectorization(PredInst, VF) && |
5327 | 355 | "Instruction marked uniform-after-vectorization will be predicated"); |
5328 | 355 | |
5329 | 355 | // Initialize the discount to zero, meaning that the scalar version and the |
5330 | 355 | // vector version cost the same. |
5331 | 355 | int Discount = 0; |
5332 | 355 | |
5333 | 355 | // Holds instructions to analyze. The instructions we visit are mapped in |
5334 | 355 | // ScalarCosts. Those instructions are the ones that would be scalarized if |
5335 | 355 | // we find that the scalar version costs less. |
5336 | 355 | SmallVector<Instruction *, 8> Worklist; |
5337 | 355 | |
5338 | 355 | // Returns true if the given instruction can be scalarized. |
5339 | 979 | auto canBeScalarized = [&](Instruction *I) -> bool { |
5340 | 979 | // We only attempt to scalarize instructions forming a single-use chain |
5341 | 979 | // from the original predicated block that would otherwise be vectorized. |
5342 | 979 | // Although not strictly necessary, we give up on instructions we know will |
5343 | 979 | // already be scalar to avoid traversing chains that are unlikely to be |
5344 | 979 | // beneficial. |
5345 | 979 | if (!I->hasOneUse() || PredInst->getParent() != I->getParent()475 || |
5346 | 979 | isScalarAfterVectorization(I, VF)435 ) |
5347 | 595 | return false; |
5348 | 384 | |
5349 | 384 | // If the instruction is scalar with predication, it will be analyzed |
5350 | 384 | // separately. We ignore it within the context of PredInst. |
5351 | 384 | if (isScalarWithPredication(I)) |
5352 | 151 | return false; |
5353 | 233 | |
5354 | 233 | // If any of the instruction's operands are uniform after vectorization, |
5355 | 233 | // the instruction cannot be scalarized. This prevents, for example, a |
5356 | 233 | // masked load from being scalarized. |
5357 | 233 | // |
5358 | 233 | // We assume we will only emit a value for lane zero of an instruction |
5359 | 233 | // marked uniform after vectorization, rather than VF identical values. |
5360 | 233 | // Thus, if we scalarize an instruction that uses a uniform, we would |
5361 | 233 | // create uses of values corresponding to the lanes we aren't emitting code |
5362 | 233 | // for. This behavior can be changed by allowing getScalarValue to clone |
5363 | 233 | // the lane zero values for uniforms rather than asserting. |
5364 | 233 | for (Use &U : I->operands()) |
5365 | 430 | if (auto *J = dyn_cast<Instruction>(U.get())) |
5366 | 384 | if (isUniformAfterVectorization(J, VF)) |
5367 | 6 | return false; |
5368 | 233 | |
5369 | 233 | // Otherwise, we can scalarize the instruction. |
5370 | 233 | return true227 ; |
5371 | 233 | }; |
5372 | 355 | |
5373 | 355 | // Compute the expected cost discount from scalarizing the entire expression |
5374 | 355 | // feeding the predicated instruction. We currently only consider expressions |
5375 | 355 | // that are single-use instruction chains. |
5376 | 355 | Worklist.push_back(PredInst); |
5377 | 937 | while (!Worklist.empty()) { |
5378 | 582 | Instruction *I = Worklist.pop_back_val(); |
5379 | 582 | |
5380 | 582 | // If we've already analyzed the instruction, there's nothing to do. |
5381 | 582 | if (ScalarCosts.find(I) != ScalarCosts.end()) |
5382 | 0 | continue; |
5383 | 582 | |
5384 | 582 | // Compute the cost of the vector instruction. Note that this cost already |
5385 | 582 | // includes the scalarization overhead of the predicated instruction. |
5386 | 582 | unsigned VectorCost = getInstructionCost(I, VF).first; |
5387 | 582 | |
5388 | 582 | // Compute the cost of the scalarized instruction. This cost is the cost of |
5389 | 582 | // the instruction as if it wasn't if-converted and instead remained in the |
5390 | 582 | // predicated block. We will scale this cost by block probability after |
5391 | 582 | // computing the scalarization overhead. |
5392 | 582 | unsigned ScalarCost = VF * getInstructionCost(I, 1).first; |
5393 | 582 | |
5394 | 582 | // Compute the scalarization overhead of needed insertelement instructions |
5395 | 582 | // and phi nodes. |
5396 | 582 | if (isScalarWithPredication(I) && !I->getType()->isVoidTy()355 ) { |
5397 | 76 | ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), |
5398 | 76 | true, false); |
5399 | 76 | ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); |
5400 | 76 | } |
5401 | 582 | |
5402 | 582 | // Compute the scalarization overhead of needed extractelement |
5403 | 582 | // instructions. For each of the instruction's operands, if the operand can |
5404 | 582 | // be scalarized, add it to the worklist; otherwise, account for the |
5405 | 582 | // overhead. |
5406 | 582 | for (Use &U : I->operands()) |
5407 | 1.13k | if (auto *J = dyn_cast<Instruction>(U.get())) { |
5408 | 979 | assert(VectorType::isValidElementType(J->getType()) && |
5409 | 979 | "Instruction has non-scalar type"); |
5410 | 979 | if (canBeScalarized(J)) |
5411 | 227 | Worklist.push_back(J); |
5412 | 752 | else if (needsExtract(J, VF)) |
5413 | 384 | ScalarCost += TTI.getScalarizationOverhead( |
5414 | 384 | ToVectorTy(J->getType(),VF), false, true); |
5415 | 979 | } |
5416 | 582 | |
5417 | 582 | // Scale the total scalar cost by block probability. |
5418 | 582 | ScalarCost /= getReciprocalPredBlockProb(); |
5419 | 582 | |
5420 | 582 | // Compute the discount. A non-negative discount means the vector version |
5421 | 582 | // of the instruction costs more, and scalarizing would be beneficial. |
5422 | 582 | Discount += VectorCost - ScalarCost; |
5423 | 582 | ScalarCosts[I] = ScalarCost; |
5424 | 582 | } |
5425 | 355 | |
5426 | 355 | return Discount; |
5427 | 355 | } |
5428 | | |
5429 | | LoopVectorizationCostModel::VectorizationCostTy |
5430 | 51.5k | LoopVectorizationCostModel::expectedCost(unsigned VF) { |
5431 | 51.5k | VectorizationCostTy Cost; |
5432 | 51.5k | |
5433 | 51.5k | // For each block. |
5434 | 56.8k | for (BasicBlock *BB : TheLoop->blocks()) { |
5435 | 56.8k | VectorizationCostTy BlockCost; |
5436 | 56.8k | |
5437 | 56.8k | // For each instruction in the old loop. |
5438 | 631k | for (Instruction &I : BB->instructionsWithoutDebug()) { |
5439 | 631k | // Skip ignored values. |
5440 | 631k | if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || |
5441 | 631k | (631k VF > 1631k && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()388k )) |
5442 | 69 | continue; |
5443 | 631k | |
5444 | 631k | VectorizationCostTy C = getInstructionCost(&I, VF); |
5445 | 631k | |
5446 | 631k | // Check if we should override the cost. |
5447 | 631k | if (ForceTargetInstructionCost.getNumOccurrences() > 0) |
5448 | 10 | C.first = ForceTargetInstructionCost; |
5449 | 631k | |
5450 | 631k | BlockCost.first += C.first; |
5451 | 631k | BlockCost.second |= C.second; |
5452 | 631k | LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first |
5453 | 631k | << " for VF " << VF << " For instruction: " << I |
5454 | 631k | << '\n'); |
5455 | 631k | } |
5456 | 56.8k | |
5457 | 56.8k | // If we are vectorizing a predicated block, it will have been |
5458 | 56.8k | // if-converted. This means that the block's instructions (aside from |
5459 | 56.8k | // stores and instructions that may divide by zero) will now be |
5460 | 56.8k | // unconditionally executed. For the scalar case, we may not always execute |
5461 | 56.8k | // the predicated block. Thus, scale the block's cost by the probability of |
5462 | 56.8k | // executing it. |
5463 | 56.8k | if (VF == 1 && blockNeedsPredication(BB)21.1k ) |
5464 | 1.28k | BlockCost.first /= getReciprocalPredBlockProb(); |
5465 | 56.8k | |
5466 | 56.8k | Cost.first += BlockCost.first; |
5467 | 56.8k | Cost.second |= BlockCost.second; |
5468 | 56.8k | } |
5469 | 51.5k | |
5470 | 51.5k | return Cost; |
5471 | 51.5k | } |
5472 | | |
5473 | | /// Gets Address Access SCEV after verifying that the access pattern |
5474 | | /// is loop invariant except the induction variable dependence. |
5475 | | /// |
5476 | | /// This SCEV can be sent to the Target in order to estimate the address |
5477 | | /// calculation cost. |
5478 | | static const SCEV *getAddressAccessSCEV( |
5479 | | Value *Ptr, |
5480 | | LoopVectorizationLegality *Legal, |
5481 | | PredicatedScalarEvolution &PSE, |
5482 | 16.5k | const Loop *TheLoop) { |
5483 | 16.5k | |
5484 | 16.5k | auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); |
5485 | 16.5k | if (!Gep) |
5486 | 1.27k | return nullptr; |
5487 | 15.2k | |
5488 | 15.2k | // We are looking for a gep with all loop invariant indices except for one |
5489 | 15.2k | // which should be an induction variable. |
5490 | 15.2k | auto SE = PSE.getSE(); |
5491 | 15.2k | unsigned NumOperands = Gep->getNumOperands(); |
5492 | 42.6k | for (unsigned i = 1; i < NumOperands; ++i27.4k ) { |
5493 | 33.7k | Value *Opd = Gep->getOperand(i); |
5494 | 33.7k | if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && |
5495 | 33.7k | !Legal->isInductionVariable(Opd)12.3k ) |
5496 | 6.33k | return nullptr; |
5497 | 33.7k | } |
5498 | 15.2k | |
5499 | 15.2k | // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. |
5500 | 15.2k | return PSE.getSCEV(Ptr)8.92k ; |
5501 | 15.2k | } |
5502 | | |
5503 | 7.42k | static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { |
5504 | 7.42k | return Legal->hasStride(I->getOperand(0)) || |
5505 | 7.42k | Legal->hasStride(I->getOperand(1)); |
5506 | 7.42k | } |
5507 | | |
5508 | | unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, |
5509 | 16.5k | unsigned VF) { |
5510 | 16.5k | assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); |
5511 | 16.5k | Type *ValTy = getMemInstValueType(I); |
5512 | 16.5k | auto SE = PSE.getSE(); |
5513 | 16.5k | |
5514 | 16.5k | unsigned Alignment = getLoadStoreAlignment(I); |
5515 | 16.5k | unsigned AS = getLoadStoreAddressSpace(I); |
5516 | 16.5k | Value *Ptr = getLoadStorePointerOperand(I); |
5517 | 16.5k | Type *PtrTy = ToVectorTy(Ptr->getType(), VF); |
5518 | 16.5k | |
5519 | 16.5k | // Figure out whether the access is strided and get the stride value |
5520 | 16.5k | // if it's known in compile time |
5521 | 16.5k | const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); |
5522 | 16.5k | |
5523 | 16.5k | // Get the cost of the scalar memory instruction and address computation. |
5524 | 16.5k | unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); |
5525 | 16.5k | |
5526 | 16.5k | // Don't pass *I here, since it is scalar but will actually be part of a |
5527 | 16.5k | // vectorized loop where the user of it is a vectorized instruction. |
5528 | 16.5k | Cost += VF * |
5529 | 16.5k | TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, |
5530 | 16.5k | AS); |
5531 | 16.5k | |
5532 | 16.5k | // Get the overhead of the extractelement and insertelement instructions |
5533 | 16.5k | // we might create due to scalarization. |
5534 | 16.5k | Cost += getScalarizationOverhead(I, VF); |
5535 | 16.5k | |
5536 | 16.5k | // If we have a predicated store, it may not be executed for each vector |
5537 | 16.5k | // lane. Scale the cost by the probability of executing the predicated |
5538 | 16.5k | // block. |
5539 | 16.5k | if (isPredicatedInst(I)) { |
5540 | 2.34k | Cost /= getReciprocalPredBlockProb(); |
5541 | 2.34k | |
5542 | 2.34k | if (useEmulatedMaskMemRefHack(I)) |
5543 | 1.84k | // Artificially setting to a high enough value to practically disable |
5544 | 1.84k | // vectorization with such operations. |
5545 | 1.84k | Cost = 3000000; |
5546 | 2.34k | } |
5547 | 16.5k | |
5548 | 16.5k | return Cost; |
5549 | 16.5k | } |
5550 | | |
5551 | | unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, |
5552 | 43.4k | unsigned VF) { |
5553 | 43.4k | Type *ValTy = getMemInstValueType(I); |
5554 | 43.4k | Type *VectorTy = ToVectorTy(ValTy, VF); |
5555 | 43.4k | unsigned Alignment = getLoadStoreAlignment(I); |
5556 | 43.4k | Value *Ptr = getLoadStorePointerOperand(I); |
5557 | 43.4k | unsigned AS = getLoadStoreAddressSpace(I); |
5558 | 43.4k | int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); |
5559 | 43.4k | |
5560 | 43.4k | assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && |
5561 | 43.4k | "Stride should be 1 or -1 for consecutive memory access"); |
5562 | 43.4k | unsigned Cost = 0; |
5563 | 43.4k | if (Legal->isMaskRequired(I)) |
5564 | 185 | Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); |
5565 | 43.2k | else |
5566 | 43.2k | Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); |
5567 | 43.4k | |
5568 | 43.4k | bool Reverse = ConsecutiveStride < 0; |
5569 | 43.4k | if (Reverse) |
5570 | 2.98k | Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); |
5571 | 43.4k | return Cost; |
5572 | 43.4k | } |
5573 | | |
5574 | | unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, |
5575 | 636 | unsigned VF) { |
5576 | 636 | Type *ValTy = getMemInstValueType(I); |
5577 | 636 | Type *VectorTy = ToVectorTy(ValTy, VF); |
5578 | 636 | unsigned Alignment = getLoadStoreAlignment(I); |
5579 | 636 | unsigned AS = getLoadStoreAddressSpace(I); |
5580 | 636 | if (isa<LoadInst>(I)) { |
5581 | 621 | return TTI.getAddressComputationCost(ValTy) + |
5582 | 621 | TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + |
5583 | 621 | TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); |
5584 | 621 | } |
5585 | 15 | StoreInst *SI = cast<StoreInst>(I); |
5586 | 15 | |
5587 | 15 | bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); |
5588 | 15 | return TTI.getAddressComputationCost(ValTy) + |
5589 | 15 | TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + |
5590 | 15 | (isLoopInvariantStoreValue ? 08 : TTI.getVectorInstrCost( |
5591 | 7 | Instruction::ExtractElement, |
5592 | 7 | VectorTy, VF - 1)); |
5593 | 15 | } |
5594 | | |
5595 | | unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, |
5596 | 132 | unsigned VF) { |
5597 | 132 | Type *ValTy = getMemInstValueType(I); |
5598 | 132 | Type *VectorTy = ToVectorTy(ValTy, VF); |
5599 | 132 | unsigned Alignment = getLoadStoreAlignment(I); |
5600 | 132 | Value *Ptr = getLoadStorePointerOperand(I); |
5601 | 132 | |
5602 | 132 | return TTI.getAddressComputationCost(VectorTy) + |
5603 | 132 | TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, |
5604 | 132 | Legal->isMaskRequired(I), Alignment); |
5605 | 132 | } |
5606 | | |
5607 | | unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, |
5608 | 2.26k | unsigned VF) { |
5609 | 2.26k | Type *ValTy = getMemInstValueType(I); |
5610 | 2.26k | Type *VectorTy = ToVectorTy(ValTy, VF); |
5611 | 2.26k | unsigned AS = getLoadStoreAddressSpace(I); |
5612 | 2.26k | |
5613 | 2.26k | auto Group = getInterleavedAccessGroup(I); |
5614 | 2.26k | assert(Group && "Fail to get an interleaved access group."); |
5615 | 2.26k | |
5616 | 2.26k | unsigned InterleaveFactor = Group->getFactor(); |
5617 | 2.26k | Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); |
5618 | 2.26k | |
5619 | 2.26k | // Holds the indices of existing members in an interleaved load group. |
5620 | 2.26k | // An interleaved store group doesn't need this as it doesn't allow gaps. |
5621 | 2.26k | SmallVector<unsigned, 4> Indices; |
5622 | 2.26k | if (isa<LoadInst>(I)) { |
5623 | 5.49k | for (unsigned i = 0; i < InterleaveFactor; i++4.17k ) |
5624 | 4.17k | if (Group->getMember(i)) |
5625 | 3.29k | Indices.push_back(i); |
5626 | 1.32k | } |
5627 | 2.26k | |
5628 | 2.26k | // Calculate the cost of the whole interleaved group. |
5629 | 2.26k | bool UseMaskForGaps = |
5630 | 2.26k | Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed342 ; |
5631 | 2.26k | unsigned Cost = TTI.getInterleavedMemoryOpCost( |
5632 | 2.26k | I->getOpcode(), WideVecTy, Group->getFactor(), Indices, |
5633 | 2.26k | Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); |
5634 | 2.26k | |
5635 | 2.26k | if (Group->isReverse()) { |
5636 | 298 | // TODO: Add support for reversed masked interleaved access. |
5637 | 298 | assert(!Legal->isMaskRequired(I) && |
5638 | 298 | "Reverse masked interleaved access not supported."); |
5639 | 298 | Cost += Group->getNumMembers() * |
5640 | 298 | TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); |
5641 | 298 | } |
5642 | 2.26k | return Cost; |
5643 | 2.26k | } |
5644 | | |
5645 | | unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, |
5646 | 103k | unsigned VF) { |
5647 | 103k | // Calculate scalar cost only. Vectorization cost should be ready at this |
5648 | 103k | // moment. |
5649 | 103k | if (VF == 1) { |
5650 | 39.4k | Type *ValTy = getMemInstValueType(I); |
5651 | 39.4k | unsigned Alignment = getLoadStoreAlignment(I); |
5652 | 39.4k | unsigned AS = getLoadStoreAddressSpace(I); |
5653 | 39.4k | |
5654 | 39.4k | return TTI.getAddressComputationCost(ValTy) + |
5655 | 39.4k | TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); |
5656 | 39.4k | } |
5657 | 64.1k | return getWideningCost(I, VF); |
5658 | 64.1k | } |
5659 | | |
5660 | | LoopVectorizationCostModel::VectorizationCostTy |
5661 | 632k | LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { |
5662 | 632k | // If we know that this instruction will remain uniform, check the cost of |
5663 | 632k | // the scalar version. |
5664 | 632k | if (isUniformAfterVectorization(I, VF)) |
5665 | 373k | VF = 1; |
5666 | 632k | |
5667 | 632k | if (VF > 1 && isProfitableToScalarize(I, VF)259k ) |
5668 | 447 | return VectorizationCostTy(InstsToScalarize[VF][I], false); |
5669 | 632k | |
5670 | 632k | // Forced scalars do not have any scalarization overhead. |
5671 | 632k | auto ForcedScalar = ForcedScalars.find(VF); |
5672 | 632k | if (VF > 1 && ForcedScalar != ForcedScalars.end()259k ) { |
5673 | 123 | auto InstSet = ForcedScalar->second; |
5674 | 123 | if (InstSet.find(I) != InstSet.end()) |
5675 | 9 | return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); |
5676 | 632k | } |
5677 | 632k | |
5678 | 632k | Type *VectorTy; |
5679 | 632k | unsigned C = getInstructionCost(I, VF, VectorTy); |
5680 | 632k | |
5681 | 632k | bool TypeNotScalarized = |
5682 | 632k | VF > 1 && VectorTy->isVectorTy()259k && TTI.getNumberOfParts(VectorTy) < VF191k ; |
5683 | 632k | return VectorizationCostTy(C, TypeNotScalarized); |
5684 | 632k | } |
5685 | | |
5686 | | unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, |
5687 | 18.2k | unsigned VF) { |
5688 | 18.2k | |
5689 | 18.2k | if (VF == 1) |
5690 | 3 | return 0; |
5691 | 18.2k | |
5692 | 18.2k | unsigned Cost = 0; |
5693 | 18.2k | Type *RetTy = ToVectorTy(I->getType(), VF); |
5694 | 18.2k | if (!RetTy->isVoidTy() && |
5695 | 18.2k | (11.7k !isa<LoadInst>(I)11.7k || !TTI.supportsEfficientVectorElementLoadStore()10.1k )) |
5696 | 11.7k | Cost += TTI.getScalarizationOverhead(RetTy, true, false); |
5697 | 18.2k | |
5698 | 18.2k | // Some targets keep addresses scalar. |
5699 | 18.2k | if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()10.1k ) |
5700 | 12 | return Cost; |
5701 | 18.2k | |
5702 | 18.2k | // Some targets support efficient element stores. |
5703 | 18.2k | if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()6.40k ) |
5704 | 2 | return Cost; |
5705 | 18.1k | |
5706 | 18.1k | // Collect operands to consider. |
5707 | 18.1k | CallInst *CI = dyn_cast<CallInst>(I); |
5708 | 18.1k | Instruction::op_range Ops = CI ? CI->arg_operands()1.57k : I->operands()16.6k ; |
5709 | 18.1k | |
5710 | 18.1k | // Skip operands that do not require extraction/scalarization and do not incur |
5711 | 18.1k | // any overhead. |
5712 | 18.1k | return Cost + TTI.getOperandsScalarizationOverhead( |
5713 | 18.1k | filterExtractingOperands(Ops, VF), VF); |
5714 | 18.1k | } |
5715 | | |
5716 | 32.3k | void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { |
5717 | 32.3k | if (VF == 1) |
5718 | 0 | return; |
5719 | 32.3k | NumPredStores = 0; |
5720 | 35.8k | for (BasicBlock *BB : TheLoop->blocks()) { |
5721 | 35.8k | // For each instruction in the old loop. |
5722 | 389k | for (Instruction &I : *BB) { |
5723 | 389k | Value *Ptr = getLoadStorePointerOperand(&I); |
5724 | 389k | if (!Ptr) |
5725 | 324k | continue; |
5726 | 64.2k | |
5727 | 64.2k | // TODO: We should generate better code and update the cost model for |
5728 | 64.2k | // predicated uniform stores. Today they are treated as any other |
5729 | 64.2k | // predicated store (see added test cases in |
5730 | 64.2k | // invariant-store-vectorization.ll). |
5731 | 64.2k | if (isa<StoreInst>(&I) && isScalarWithPredication(&I)32.6k ) |
5732 | 1.03k | NumPredStores++; |
5733 | 64.2k | |
5734 | 64.2k | if (Legal->isUniform(Ptr) && |
5735 | 64.2k | // Conditional loads and stores should be scalarized and predicated. |
5736 | 64.2k | // isScalarWithPredication cannot be used here since masked |
5737 | 64.2k | // gather/scatters are not considered scalar with predication. |
5738 | 64.2k | !Legal->blockNeedsPredication(I.getParent())842 ) { |
5739 | 636 | // TODO: Avoid replicating loads and stores instead of |
5740 | 636 | // relying on instcombine to remove them. |
5741 | 636 | // Load: Scalar load + broadcast |
5742 | 636 | // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract |
5743 | 636 | unsigned Cost = getUniformMemOpCost(&I, VF); |
5744 | 636 | setWideningDecision(&I, VF, CM_Scalarize, Cost); |
5745 | 636 | continue; |
5746 | 636 | } |
5747 | 63.6k | |
5748 | 63.6k | // We assume that widening is the best solution when possible. |
5749 | 63.6k | if (memoryInstructionCanBeWidened(&I, VF)) { |
5750 | 43.4k | unsigned Cost = getConsecutiveMemOpCost(&I, VF); |
5751 | 43.4k | int ConsecutiveStride = |
5752 | 43.4k | Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); |
5753 | 43.4k | assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && |
5754 | 43.4k | "Expected consecutive stride."); |
5755 | 43.4k | InstWidening Decision = |
5756 | 43.4k | ConsecutiveStride == 1 ? CM_Widen40.4k : CM_Widen_Reverse2.98k ; |
5757 | 43.4k | setWideningDecision(&I, VF, Decision, Cost); |
5758 | 43.4k | continue; |
5759 | 43.4k | } |
5760 | 20.1k | |
5761 | 20.1k | // Choose between Interleaving, Gather/Scatter or Scalarization. |
5762 | 20.1k | unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); |
5763 | 20.1k | unsigned NumAccesses = 1; |
5764 | 20.1k | if (isAccessInterleaved(&I)) { |
5765 | 5.90k | auto Group = getInterleavedAccessGroup(&I); |
5766 | 5.90k | assert(Group && "Fail to get an interleaved access group."); |
5767 | 5.90k | |
5768 | 5.90k | // Make one decision for the whole group. |
5769 | 5.90k | if (getWideningDecision(&I, VF) != CM_Unknown) |
5770 | 3.64k | continue; |
5771 | 2.26k | |
5772 | 2.26k | NumAccesses = Group->getNumMembers(); |
5773 | 2.26k | if (interleavedAccessCanBeWidened(&I, VF)) |
5774 | 2.26k | InterleaveCost = getInterleaveGroupCost(&I, VF); |
5775 | 2.26k | } |
5776 | 20.1k | |
5777 | 20.1k | unsigned GatherScatterCost = |
5778 | 16.5k | isLegalGatherOrScatter(&I) |
5779 | 16.5k | ? getGatherScatterCost(&I, VF) * NumAccesses132 |
5780 | 16.5k | : std::numeric_limits<unsigned>::max()16.4k ; |
5781 | 16.5k | |
5782 | 16.5k | unsigned ScalarizationCost = |
5783 | 16.5k | getMemInstScalarizationCost(&I, VF) * NumAccesses; |
5784 | 16.5k | |
5785 | 16.5k | // Choose better solution for the current VF, |
5786 | 16.5k | // write down this decision and use it during vectorization. |
5787 | 16.5k | unsigned Cost; |
5788 | 16.5k | InstWidening Decision; |
5789 | 16.5k | if (InterleaveCost <= GatherScatterCost && |
5790 | 16.5k | InterleaveCost < ScalarizationCost16.4k ) { |
5791 | 2.20k | Decision = CM_Interleave; |
5792 | 2.20k | Cost = InterleaveCost; |
5793 | 14.3k | } else if (GatherScatterCost < ScalarizationCost) { |
5794 | 105 | Decision = CM_GatherScatter; |
5795 | 105 | Cost = GatherScatterCost; |
5796 | 14.2k | } else { |
5797 | 14.2k | Decision = CM_Scalarize; |
5798 | 14.2k | Cost = ScalarizationCost; |
5799 | 14.2k | } |
5800 | 16.5k | // If the instructions belongs to an interleave group, the whole group |
5801 | 16.5k | // receives the same decision. The whole group receives the cost, but |
5802 | 16.5k | // the cost will actually be assigned to one instruction. |
5803 | 16.5k | if (auto Group = getInterleavedAccessGroup(&I)) |
5804 | 2.26k | setWideningDecision(Group, VF, Decision, Cost); |
5805 | 14.2k | else |
5806 | 14.2k | setWideningDecision(&I, VF, Decision, Cost); |
5807 | 16.5k | } |
5808 | 35.8k | } |
5809 | 32.3k | |
5810 | 32.3k | // Make sure that any load of address and any other address computation |
5811 | 32.3k | // remains scalar unless there is gather/scatter support. This avoids |
5812 | 32.3k | // inevitable extracts into address registers, and also has the benefit of |
5813 | 32.3k | // activating LSR more, since that pass can't optimize vectorized |
5814 | 32.3k | // addresses. |
5815 | 32.3k | if (TTI.prefersVectorizedAddressing()) |
5816 | 32.3k | return; |
5817 | 17 | |
5818 | 17 | // Start with all scalar pointer uses. |
5819 | 17 | SmallPtrSet<Instruction *, 8> AddrDefs; |
5820 | 17 | for (BasicBlock *BB : TheLoop->blocks()) |
5821 | 227 | for (Instruction &I : *BB)19 { |
5822 | 227 | Instruction *PtrDef = |
5823 | 227 | dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); |
5824 | 227 | if (PtrDef && TheLoop->contains(PtrDef)66 && |
5825 | 227 | getWideningDecision(&I, VF) != CM_GatherScatter66 ) |
5826 | 66 | AddrDefs.insert(PtrDef); |
5827 | 227 | } |
5828 | 17 | |
5829 | 17 | // Add all instructions used to generate the addresses. |
5830 | 17 | SmallVector<Instruction *, 4> Worklist; |
5831 | 17 | for (auto *I : AddrDefs) |
5832 | 55 | Worklist.push_back(I); |
5833 | 78 | while (!Worklist.empty()) { |
5834 | 61 | Instruction *I = Worklist.pop_back_val(); |
5835 | 61 | for (auto &Op : I->operands()) |
5836 | 121 | if (auto *InstOp = dyn_cast<Instruction>(Op)) |
5837 | 61 | if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp)60 && |
5838 | 61 | AddrDefs.insert(InstOp).second12 ) |
5839 | 6 | Worklist.push_back(InstOp); |
5840 | 61 | } |
5841 | 17 | |
5842 | 61 | for (auto *I : AddrDefs) { |
5843 | 61 | if (isa<LoadInst>(I)) { |
5844 | 1 | // Setting the desired widening decision should ideally be handled in |
5845 | 1 | // by cost functions, but since this involves the task of finding out |
5846 | 1 | // if the loaded register is involved in an address computation, it is |
5847 | 1 | // instead changed here when we know this is the case. |
5848 | 1 | InstWidening Decision = getWideningDecision(I, VF); |
5849 | 1 | if (Decision == CM_Widen || Decision == CM_Widen_Reverse0 ) |
5850 | 1 | // Scalarize a widened load of address. |
5851 | 1 | setWideningDecision(I, VF, CM_Scalarize, |
5852 | 1 | (VF * getMemoryInstructionCost(I, 1))); |
5853 | 0 | else if (auto Group = getInterleavedAccessGroup(I)) { |
5854 | 0 | // Scalarize an interleave group of address loads. |
5855 | 0 | for (unsigned I = 0; I < Group->getFactor(); ++I) { |
5856 | 0 | if (Instruction *Member = Group->getMember(I)) |
5857 | 0 | setWideningDecision(Member, VF, CM_Scalarize, |
5858 | 0 | (VF * getMemoryInstructionCost(Member, 1))); |
5859 | 0 | } |
5860 | 0 | } |
5861 | 1 | } else |
5862 | 60 | // Make sure I gets scalarized and a cost estimate without |
5863 | 60 | // scalarization overhead. |
5864 | 60 | ForcedScalars[VF].insert(I); |
5865 | 61 | } |
5866 | 17 | } |
5867 | | |
5868 | | unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, |
5869 | | unsigned VF, |
5870 | 632k | Type *&VectorTy) { |
5871 | 632k | Type *RetTy = I->getType(); |
5872 | 632k | if (canTruncateToMinimalBitwidth(I, VF)) |
5873 | 618 | RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); |
5874 | 632k | VectorTy = isScalarAfterVectorization(I, VF) ? RetTy389k : ToVectorTy(RetTy, VF)242k ; |
5875 | 632k | auto SE = PSE.getSE(); |
5876 | 632k | |
5877 | 632k | // TODO: We need to estimate the cost of intrinsic calls. |
5878 | 632k | switch (I->getOpcode()) { |
5879 | 632k | case Instruction::GetElementPtr: |
5880 | 98.5k | // We mark this instruction as zero-cost because the cost of GEPs in |
5881 | 98.5k | // vectorized code depends on whether the corresponding memory instruction |
5882 | 98.5k | // is scalarized or not. Therefore, we handle GEPs with the memory |
5883 | 98.5k | // instruction cost. |
5884 | 98.5k | return 0; |
5885 | 632k | case Instruction::Br: { |
5886 | 56.8k | // In cases of scalarized and predicated instructions, there will be VF |
5887 | 56.8k | // predicated blocks in the vectorized loop. Each branch around these |
5888 | 56.8k | // blocks requires also an extract of its vector compare i1 element. |
5889 | 56.8k | bool ScalarPredicatedBB = false; |
5890 | 56.8k | BranchInst *BI = cast<BranchInst>(I); |
5891 | 56.8k | if (VF > 1 && BI->isConditional()35.7k && |
5892 | 56.8k | (33.9k PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != |
5893 | 33.9k | PredicatedBBsAfterVectorization.end() || |
5894 | 33.9k | PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != |
5895 | 33.4k | PredicatedBBsAfterVectorization.end())) |
5896 | 926 | ScalarPredicatedBB = true; |
5897 | 56.8k | |
5898 | 56.8k | if (ScalarPredicatedBB) { |
5899 | 926 | // Return cost for branches around scalarized and predicated blocks. |
5900 | 926 | Type *Vec_i1Ty = |
5901 | 926 | VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); |
5902 | 926 | return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + |
5903 | 926 | (TTI.getCFInstrCost(Instruction::Br) * VF)); |
5904 | 55.9k | } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 14.41k ) |
5905 | 53.4k | // The back-edge branch will remain, as will all scalar branches. |
5906 | 53.4k | return TTI.getCFInstrCost(Instruction::Br); |
5907 | 2.51k | else |
5908 | 2.51k | // This branch will be eliminated by if-conversion. |
5909 | 2.51k | return 0; |
5910 | 0 | // Note: We currently assume zero cost for an unconditional branch inside |
5911 | 0 | // a predicated block since it will become a fall-through, although we |
5912 | 0 | // may decide in the future to call TTI for all branches. |
5913 | 0 | } |
5914 | 71.8k | case Instruction::PHI: { |
5915 | 71.8k | auto *Phi = cast<PHINode>(I); |
5916 | 71.8k | |
5917 | 71.8k | // First-order recurrences are replaced by vector shuffles inside the loop. |
5918 | 71.8k | // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. |
5919 | 71.8k | if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)21.7k ) |
5920 | 125 | return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, |
5921 | 125 | VectorTy, VF - 1, VectorType::get(RetTy, 1)); |
5922 | 71.7k | |
5923 | 71.7k | // Phi nodes in non-header blocks (not inductions, reductions, etc.) are |
5924 | 71.7k | // converted into select instructions. We require N - 1 selects per phi |
5925 | 71.7k | // node, where N is the number of incoming values. |
5926 | 71.7k | if (VF > 1 && Phi->getParent() != TheLoop->getHeader()21.6k ) |
5927 | 867 | return (Phi->getNumIncomingValues() - 1) * |
5928 | 867 | TTI.getCmpSelInstrCost( |
5929 | 867 | Instruction::Select, ToVectorTy(Phi->getType(), VF), |
5930 | 867 | ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); |
5931 | 70.9k | |
5932 | 70.9k | return TTI.getCFInstrCost(Instruction::PHI); |
5933 | 70.9k | } |
5934 | 70.9k | case Instruction::UDiv: |
5935 | 1.14k | case Instruction::SDiv: |
5936 | 1.14k | case Instruction::URem: |
5937 | 1.14k | case Instruction::SRem: |
5938 | 1.14k | // If we have a predicated instruction, it may not be executed for each |
5939 | 1.14k | // vector lane. Get the scalarization cost and scale this amount by the |
5940 | 1.14k | // probability of executing the predicated block. If the instruction is not |
5941 | 1.14k | // predicated, we fall through to the next case. |
5942 | 1.14k | if (VF > 1 && isScalarWithPredication(I)722 ) { |
5943 | 86 | unsigned Cost = 0; |
5944 | 86 | |
5945 | 86 | // These instructions have a non-void type, so account for the phi nodes |
5946 | 86 | // that we will create. This cost is likely to be zero. The phi node |
5947 | 86 | // cost, if any, should be scaled by the block probability because it |
5948 | 86 | // models a copy at the end of each predicated block. |
5949 | 86 | Cost += VF * TTI.getCFInstrCost(Instruction::PHI); |
5950 | 86 | |
5951 | 86 | // The cost of the non-predicated instruction. |
5952 | 86 | Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); |
5953 | 86 | |
5954 | 86 | // The cost of insertelement and extractelement instructions needed for |
5955 | 86 | // scalarization. |
5956 | 86 | Cost += getScalarizationOverhead(I, VF); |
5957 | 86 | |
5958 | 86 | // Scale the cost by the probability of executing the predicated blocks. |
5959 | 86 | // This assumes the predicated block for each vector lane is equally |
5960 | 86 | // likely. |
5961 | 86 | return Cost / getReciprocalPredBlockProb(); |
5962 | 86 | } |
5963 | 1.05k | LLVM_FALLTHROUGH; |
5964 | 166k | case Instruction::Add: |
5965 | 166k | case Instruction::FAdd: |
5966 | 166k | case Instruction::Sub: |
5967 | 166k | case Instruction::FSub: |
5968 | 166k | case Instruction::Mul: |
5969 | 166k | case Instruction::FMul: |
5970 | 166k | case Instruction::FDiv: |
5971 | 166k | case Instruction::FRem: |
5972 | 166k | case Instruction::Shl: |
5973 | 166k | case Instruction::LShr: |
5974 | 166k | case Instruction::AShr: |
5975 | 166k | case Instruction::And: |
5976 | 166k | case Instruction::Or: |
5977 | 166k | case Instruction::Xor: { |
5978 | 166k | // Since we will replace the stride by 1 the multiplication should go away. |
5979 | 166k | if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)7.42k ) |
5980 | 64 | return 0; |
5981 | 166k | // Certain instructions can be cheaper to vectorize if they have a constant |
5982 | 166k | // second vector operand. One example of this are shifts on x86. |
5983 | 166k | Value *Op2 = I->getOperand(1); |
5984 | 166k | TargetTransformInfo::OperandValueProperties Op2VP; |
5985 | 166k | TargetTransformInfo::OperandValueKind Op2VK = |
5986 | 166k | TTI.getOperandInfo(Op2, Op2VP); |
5987 | 166k | if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)93.1k ) |
5988 | 4.71k | Op2VK = TargetTransformInfo::OK_UniformValue; |
5989 | 166k | |
5990 | 166k | SmallVector<const Value *, 4> Operands(I->operand_values()); |
5991 | 166k | unsigned N = isScalarAfterVectorization(I, VF) ? VF86.3k : 179.6k ; |
5992 | 166k | return N * TTI.getArithmeticInstrCost( |
5993 | 166k | I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, |
5994 | 166k | Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands); |
5995 | 166k | } |
5996 | 166k | case Instruction::FNeg: { |
5997 | 1 | unsigned N = isScalarAfterVectorization(I, VF) ? VF0 : 1; |
5998 | 1 | return N * TTI.getArithmeticInstrCost( |
5999 | 1 | I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, |
6000 | 1 | TargetTransformInfo::OK_AnyValue, |
6001 | 1 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, |
6002 | 1 | I->getOperand(0)); |
6003 | 166k | } |
6004 | 166k | case Instruction::Select: { |
6005 | 3.97k | SelectInst *SI = cast<SelectInst>(I); |
6006 | 3.97k | const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); |
6007 | 3.97k | bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); |
6008 | 3.97k | Type *CondTy = SI->getCondition()->getType(); |
6009 | 3.97k | if (!ScalarCond) |
6010 | 3.97k | CondTy = VectorType::get(CondTy, VF); |
6011 | 3.97k | |
6012 | 3.97k | return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); |
6013 | 166k | } |
6014 | 166k | case Instruction::ICmp: |
6015 | 58.9k | case Instruction::FCmp: { |
6016 | 58.9k | Type *ValTy = I->getOperand(0)->getType(); |
6017 | 58.9k | Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); |
6018 | 58.9k | if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) |
6019 | 0 | ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); |
6020 | 58.9k | VectorTy = ToVectorTy(ValTy, VF); |
6021 | 58.9k | return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); |
6022 | 58.9k | } |
6023 | 103k | case Instruction::Store: |
6024 | 103k | case Instruction::Load: { |
6025 | 103k | unsigned Width = VF; |
6026 | 103k | if (Width > 1) { |
6027 | 64.1k | InstWidening Decision = getWideningDecision(I, Width); |
6028 | 64.1k | assert(Decision != CM_Unknown && |
6029 | 64.1k | "CM decision should be taken at this point"); |
6030 | 64.1k | if (Decision == CM_Scalarize) |
6031 | 14.9k | Width = 1; |
6032 | 64.1k | } |
6033 | 103k | VectorTy = ToVectorTy(getMemInstValueType(I), Width); |
6034 | 103k | return getMemoryInstructionCost(I, VF); |
6035 | 103k | } |
6036 | 103k | case Instruction::ZExt: |
6037 | 71.2k | case Instruction::SExt: |
6038 | 71.2k | case Instruction::FPToUI: |
6039 | 71.2k | case Instruction::FPToSI: |
6040 | 71.2k | case Instruction::FPExt: |
6041 | 71.2k | case Instruction::PtrToInt: |
6042 | 71.2k | case Instruction::IntToPtr: |
6043 | 71.2k | case Instruction::SIToFP: |
6044 | 71.2k | case Instruction::UIToFP: |
6045 | 71.2k | case Instruction::Trunc: |
6046 | 71.2k | case Instruction::FPTrunc: |
6047 | 71.2k | case Instruction::BitCast: { |
6048 | 71.2k | // We optimize the truncation of induction variables having constant |
6049 | 71.2k | // integer steps. The cost of these truncations is the same as the scalar |
6050 | 71.2k | // operation. |
6051 | 71.2k | if (isOptimizableIVTruncate(I, VF)) { |
6052 | 5.40k | auto *Trunc = cast<TruncInst>(I); |
6053 | 5.40k | return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), |
6054 | 5.40k | Trunc->getSrcTy(), Trunc); |
6055 | 5.40k | } |
6056 | 65.8k | |
6057 | 65.8k | Type *SrcScalarTy = I->getOperand(0)->getType(); |
6058 | 65.8k | Type *SrcVecTy = |
6059 | 65.8k | VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF)32.0k : SrcScalarTy33.8k ; |
6060 | 65.8k | if (canTruncateToMinimalBitwidth(I, VF)) { |
6061 | 275 | // This cast is going to be shrunk. This may remove the cast or it might |
6062 | 275 | // turn it into slightly different cast. For example, if MinBW == 16, |
6063 | 275 | // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". |
6064 | 275 | // |
6065 | 275 | // Calculate the modified src and dest types. |
6066 | 275 | Type *MinVecTy = VectorTy; |
6067 | 275 | if (I->getOpcode() == Instruction::Trunc) { |
6068 | 99 | SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); |
6069 | 99 | VectorTy = |
6070 | 99 | largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); |
6071 | 176 | } else if (I->getOpcode() == Instruction::ZExt || |
6072 | 176 | I->getOpcode() == Instruction::SExt3 ) { |
6073 | 176 | SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); |
6074 | 176 | VectorTy = |
6075 | 176 | smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); |
6076 | 176 | } |
6077 | 275 | } |
6078 | 65.8k | |
6079 | 65.8k | unsigned N = isScalarAfterVectorization(I, VF) ? VF33.8k : 132.0k ; |
6080 | 65.8k | return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); |
6081 | 65.8k | } |
6082 | 65.8k | case Instruction::Call: { |
6083 | 989 | bool NeedToScalarize; |
6084 | 989 | CallInst *CI = cast<CallInst>(I); |
6085 | 989 | unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); |
6086 | 989 | if (getVectorIntrinsicIDForCall(CI, TLI)) |
6087 | 825 | return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); |
6088 | 164 | return CallCost; |
6089 | 164 | } |
6090 | 164 | default: |
6091 | 6 | // The cost of executing VF copies of the scalar instruction. This opcode |
6092 | 6 | // is unknown. Assume that it is the same as 'mul'. |
6093 | 6 | return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + |
6094 | 6 | getScalarizationOverhead(I, VF); |
6095 | 632k | } // end of switch. |
6096 | 632k | } |
6097 | | |
6098 | | char LoopVectorize::ID = 0; |
6099 | | |
6100 | | static const char lv_name[] = "Loop Vectorization"; |
6101 | | |
6102 | 48.9k | INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) |
6103 | 48.9k | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) |
6104 | 48.9k | INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) |
6105 | 48.9k | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) |
6106 | 48.9k | INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) |
6107 | 48.9k | INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) |
6108 | 48.9k | INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) |
6109 | 48.9k | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) |
6110 | 48.9k | INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) |
6111 | 48.9k | INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) |
6112 | 48.9k | INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) |
6113 | 48.9k | INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) |
6114 | 48.9k | INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) |
6115 | 48.9k | INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) |
6116 | 48.9k | INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) |
6117 | | |
6118 | | namespace llvm { |
6119 | | |
6120 | 0 | Pass *createLoopVectorizePass() { return new LoopVectorize(); } |
6121 | | |
6122 | | Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, |
6123 | 13.4k | bool VectorizeOnlyWhenForced) { |
6124 | 13.4k | return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); |
6125 | 13.4k | } |
6126 | | |
6127 | | } // end namespace llvm |
6128 | | |
6129 | 627 | bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { |
6130 | 627 | // Check if the pointer operand of a load or store instruction is |
6131 | 627 | // consecutive. |
6132 | 627 | if (auto *Ptr = getLoadStorePointerOperand(Inst)) |
6133 | 627 | return Legal->isConsecutivePtr(Ptr); |
6134 | 0 | return false; |
6135 | 0 | } |
6136 | | |
6137 | 19.9k | void LoopVectorizationCostModel::collectValuesToIgnore() { |
6138 | 19.9k | // Ignore ephemeral values. |
6139 | 19.9k | CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); |
6140 | 19.9k | |
6141 | 19.9k | // Ignore type-promoting instructions we identified during reduction |
6142 | 19.9k | // detection. |
6143 | 19.9k | for (auto &Reduction : *Legal->getReductionVars()) { |
6144 | 3.44k | RecurrenceDescriptor &RedDes = Reduction.second; |
6145 | 3.44k | SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); |
6146 | 3.44k | VecValuesToIgnore.insert(Casts.begin(), Casts.end()); |
6147 | 3.44k | } |
6148 | 19.9k | // Ignore type-casting instructions we identified during induction |
6149 | 19.9k | // detection. |
6150 | 23.8k | for (auto &Induction : *Legal->getInductionVars()) { |
6151 | 23.8k | InductionDescriptor &IndDes = Induction.second; |
6152 | 23.8k | const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); |
6153 | 23.8k | VecValuesToIgnore.insert(Casts.begin(), Casts.end()); |
6154 | 23.8k | } |
6155 | 19.9k | } |
6156 | | |
6157 | | // TODO: we could return a pair of values that specify the max VF and |
6158 | | // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of |
6159 | | // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment |
6160 | | // doesn't have a cost model that can choose which plan to execute if |
6161 | | // more than one is generated. |
6162 | | static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, |
6163 | 5 | LoopVectorizationCostModel &CM) { |
6164 | 5 | unsigned WidestType; |
6165 | 5 | std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); |
6166 | 5 | return WidestVectorRegBits / WidestType; |
6167 | 5 | } |
6168 | | |
6169 | | VectorizationFactor |
6170 | | LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, |
6171 | 7 | unsigned UserVF) { |
6172 | 7 | unsigned VF = UserVF; |
6173 | 7 | // Outer loop handling: They may require CFG and instruction level |
6174 | 7 | // transformations before even evaluating whether vectorization is profitable. |
6175 | 7 | // Since we cannot modify the incoming IR, we need to build VPlan upfront in |
6176 | 7 | // the vectorization pipeline. |
6177 | 7 | if (!OrigLoop->empty()) { |
6178 | 7 | // If the user doesn't provide a vectorization factor, determine a |
6179 | 7 | // reasonable one. |
6180 | 7 | if (!UserVF) { |
6181 | 5 | VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); |
6182 | 5 | LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); |
6183 | 5 | |
6184 | 5 | // Make sure we have a VF > 1 for stress testing. |
6185 | 5 | if (VPlanBuildStressTest && VF < 20 ) { |
6186 | 0 | LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " |
6187 | 0 | << "overriding computed VF.\n"); |
6188 | 0 | VF = 4; |
6189 | 0 | } |
6190 | 5 | } |
6191 | 7 | assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); |
6192 | 7 | assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); |
6193 | 7 | LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF |
6194 | 7 | << " to build VPlans.\n"); |
6195 | 7 | buildVPlans(VF, VF); |
6196 | 7 | |
6197 | 7 | // For VPlan build stress testing, we bail out after VPlan construction. |
6198 | 7 | if (VPlanBuildStressTest) |
6199 | 0 | return VectorizationFactor::Disabled(); |
6200 | 7 | |
6201 | 7 | return {VF, 0}; |
6202 | 7 | } |
6203 | 0 | |
6204 | 0 | LLVM_DEBUG( |
6205 | 0 | dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " |
6206 | 0 | "VPlan-native path.\n"); |
6207 | 0 | return VectorizationFactor::Disabled(); |
6208 | 0 | } |
6209 | | |
6210 | | Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize, |
6211 | 19.9k | unsigned UserVF) { |
6212 | 19.9k | assert(OrigLoop->empty() && "Inner loop expected."); |
6213 | 19.9k | Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize); |
6214 | 19.9k | if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. |
6215 | 103 | return None; |
6216 | 19.8k | |
6217 | 19.8k | // Invalidate interleave groups if all blocks of loop will be predicated. |
6218 | 19.8k | if (CM.blockNeedsPredication(OrigLoop->getHeader()) && |
6219 | 19.8k | !useMaskedInterleavedAccesses(*TTI)26 ) { |
6220 | 21 | LLVM_DEBUG( |
6221 | 21 | dbgs() |
6222 | 21 | << "LV: Invalidate all interleaved groups due to fold-tail by masking " |
6223 | 21 | "which requires masked-interleaved support.\n"); |
6224 | 21 | CM.InterleaveInfo.reset(); |
6225 | 21 | } |
6226 | 19.8k | |
6227 | 19.8k | if (UserVF) { |
6228 | 649 | LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); |
6229 | 649 | assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); |
6230 | 649 | // Collect the instructions (and their associated costs) that will be more |
6231 | 649 | // profitable to scalarize. |
6232 | 649 | CM.selectUserVectorizationFactor(UserVF); |
6233 | 649 | buildVPlansWithVPRecipes(UserVF, UserVF); |
6234 | 649 | LLVM_DEBUG(printPlans(dbgs())); |
6235 | 649 | return {{UserVF, 0}}; |
6236 | 649 | } |
6237 | 19.2k | |
6238 | 19.2k | unsigned MaxVF = MaybeMaxVF.getValue(); |
6239 | 19.2k | assert(MaxVF != 0 && "MaxVF is zero."); |
6240 | 19.2k | |
6241 | 70.1k | for (unsigned VF = 1; VF <= MaxVF; VF *= 250.9k ) { |
6242 | 50.9k | // Collect Uniform and Scalar instructions after vectorization with VF. |
6243 | 50.9k | CM.collectUniformsAndScalars(VF); |
6244 | 50.9k | |
6245 | 50.9k | // Collect the instructions (and their associated costs) that will be more |
6246 | 50.9k | // profitable to scalarize. |
6247 | 50.9k | if (VF > 1) |
6248 | 31.7k | CM.collectInstsToScalarize(VF); |
6249 | 50.9k | } |
6250 | 19.2k | |
6251 | 19.2k | buildVPlansWithVPRecipes(1, MaxVF); |
6252 | 19.2k | LLVM_DEBUG(printPlans(dbgs())); |
6253 | 19.2k | if (MaxVF == 1) |
6254 | 15 | return VectorizationFactor::Disabled(); |
6255 | 19.1k | |
6256 | 19.1k | // Select the optimal vectorization factor. |
6257 | 19.1k | return CM.selectVectorizationFactor(MaxVF); |
6258 | 19.1k | } |
6259 | | |
6260 | 17.0k | void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { |
6261 | 17.0k | LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF |
6262 | 17.0k | << '\n'); |
6263 | 17.0k | BestVF = VF; |
6264 | 17.0k | BestUF = UF; |
6265 | 17.0k | |
6266 | 32.9k | erase_if(VPlans, [VF](const VPlanPtr &Plan) { |
6267 | 32.9k | return !Plan->hasVF(VF); |
6268 | 32.9k | }); |
6269 | 17.0k | assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); |
6270 | 17.0k | } |
6271 | | |
6272 | | void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, |
6273 | 17.0k | DominatorTree *DT) { |
6274 | 17.0k | // Perform the actual loop transformation. |
6275 | 17.0k | |
6276 | 17.0k | // 1. Create a new empty loop. Unlink the old loop and connect the new one. |
6277 | 17.0k | VPCallbackILV CallbackILV(ILV); |
6278 | 17.0k | |
6279 | 17.0k | VPTransformState State{BestVF, BestUF, LI, |
6280 | 17.0k | DT, ILV.Builder, ILV.VectorLoopValueMap, |
6281 | 17.0k | &ILV, CallbackILV}; |
6282 | 17.0k | State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); |
6283 | 17.0k | State.TripCount = ILV.getOrCreateTripCount(nullptr); |
6284 | 17.0k | |
6285 | 17.0k | //===------------------------------------------------===// |
6286 | 17.0k | // |
6287 | 17.0k | // Notice: any optimization or new instruction that go |
6288 | 17.0k | // into the code below should also be implemented in |
6289 | 17.0k | // the cost-model. |
6290 | 17.0k | // |
6291 | 17.0k | //===------------------------------------------------===// |
6292 | 17.0k | |
6293 | 17.0k | // 2. Copy and widen instructions from the old loop into the new loop. |
6294 | 17.0k | assert(VPlans.size() == 1 && "Not a single VPlan to execute."); |
6295 | 17.0k | VPlans.front()->execute(&State); |
6296 | 17.0k | |
6297 | 17.0k | // 3. Fix the vectorized code: take care of header phi's, live-outs, |
6298 | 17.0k | // predication, updating analyses. |
6299 | 17.0k | ILV.fixVectorizedLoop(); |
6300 | 17.0k | } |
6301 | | |
6302 | | void LoopVectorizationPlanner::collectTriviallyDeadInstructions( |
6303 | 19.8k | SmallPtrSetImpl<Instruction *> &DeadInstructions) { |
6304 | 19.8k | BasicBlock *Latch = OrigLoop->getLoopLatch(); |
6305 | 19.8k | |
6306 | 19.8k | // We create new control-flow for the vectorized loop, so the original |
6307 | 19.8k | // condition will be dead after vectorization if it's only used by the |
6308 | 19.8k | // branch. |
6309 | 19.8k | auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); |
6310 | 19.8k | if (Cmp && Cmp->hasOneUse()) |
6311 | 19.8k | DeadInstructions.insert(Cmp); |
6312 | 19.8k | |
6313 | 19.8k | // We create new "steps" for induction variable updates to which the original |
6314 | 19.8k | // induction variables map. An original update instruction will be dead if |
6315 | 19.8k | // all its users except the induction variable are dead. |
6316 | 23.7k | for (auto &Induction : *Legal->getInductionVars()) { |
6317 | 23.7k | PHINode *Ind = Induction.first; |
6318 | 23.7k | auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); |
6319 | 49.7k | if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool 23.7k { |
6320 | 49.7k | return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != |
6321 | 28.1k | DeadInstructions.end(); |
6322 | 49.7k | })) |
6323 | 14.3k | DeadInstructions.insert(IndUpdate); |
6324 | 23.7k | |
6325 | 23.7k | // We record as "Dead" also the type-casting instructions we had identified |
6326 | 23.7k | // during induction analysis. We don't need any handling for them in the |
6327 | 23.7k | // vectorized loop because we have proven that, under a proper runtime |
6328 | 23.7k | // test guarding the vectorized loop, the value of the phi, and the casted |
6329 | 23.7k | // value of the phi, are the same. The last instruction in this casting chain |
6330 | 23.7k | // will get its scalar/vector/widened def from the scalar/vector/widened def |
6331 | 23.7k | // of the respective phi node. Any other casts in the induction def-use chain |
6332 | 23.7k | // have no other uses outside the phi update chain, and will be ignored. |
6333 | 23.7k | InductionDescriptor &IndDes = Induction.second; |
6334 | 23.7k | const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); |
6335 | 23.7k | DeadInstructions.insert(Casts.begin(), Casts.end()); |
6336 | 23.7k | } |
6337 | 19.8k | } |
6338 | | |
6339 | 0 | Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } |
6340 | | |
6341 | 1.84k | Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } |
6342 | | |
6343 | | Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, |
6344 | 3.64k | Instruction::BinaryOps BinOp) { |
6345 | 3.64k | // When unrolling and the VF is 1, we only need to add a simple scalar. |
6346 | 3.64k | Type *Ty = Val->getType(); |
6347 | 3.64k | assert(!Ty->isVectorTy() && "Val must be a scalar"); |
6348 | 3.64k | |
6349 | 3.64k | if (Ty->isFloatingPointTy()) { |
6350 | 12 | Constant *C = ConstantFP::get(Ty, (double)StartIdx); |
6351 | 12 | |
6352 | 12 | // Floating point operations had to be 'fast' to enable the unrolling. |
6353 | 12 | Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); |
6354 | 12 | return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); |
6355 | 12 | } |
6356 | 3.63k | Constant *C = ConstantInt::get(Ty, StartIdx); |
6357 | 3.63k | return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); |
6358 | 3.63k | } |
6359 | | |
6360 | 12.6k | static void AddRuntimeUnrollDisableMetaData(Loop *L) { |
6361 | 12.6k | SmallVector<Metadata *, 4> MDs; |
6362 | 12.6k | // Reserve first location for self reference to the LoopID metadata node. |
6363 | 12.6k | MDs.push_back(nullptr); |
6364 | 12.6k | bool IsUnrollMetadata = false; |
6365 | 12.6k | MDNode *LoopID = L->getLoopID(); |
6366 | 12.6k | if (LoopID) { |
6367 | 273 | // First find existing loop unrolling disable metadata. |
6368 | 802 | for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i529 ) { |
6369 | 529 | auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); |
6370 | 529 | if (MD) { |
6371 | 529 | const auto *S = dyn_cast<MDString>(MD->getOperand(0)); |
6372 | 529 | IsUnrollMetadata = |
6373 | 529 | S && S->getString().startswith("llvm.loop.unroll.disable")46 ; |
6374 | 529 | } |
6375 | 529 | MDs.push_back(LoopID->getOperand(i)); |
6376 | 529 | } |
6377 | 273 | } |
6378 | 12.6k | |
6379 | 12.6k | if (!IsUnrollMetadata) { |
6380 | 12.6k | // Add runtime unroll disable metadata. |
6381 | 12.6k | LLVMContext &Context = L->getHeader()->getContext(); |
6382 | 12.6k | SmallVector<Metadata *, 1> DisableOperands; |
6383 | 12.6k | DisableOperands.push_back( |
6384 | 12.6k | MDString::get(Context, "llvm.loop.unroll.runtime.disable")); |
6385 | 12.6k | MDNode *DisableNode = MDNode::get(Context, DisableOperands); |
6386 | 12.6k | MDs.push_back(DisableNode); |
6387 | 12.6k | MDNode *NewLoopID = MDNode::get(Context, MDs); |
6388 | 12.6k | // Set operand 0 to refer to the loop id itself. |
6389 | 12.6k | NewLoopID->replaceOperandWith(0, NewLoopID); |
6390 | 12.6k | L->setLoopID(NewLoopID); |
6391 | 12.6k | } |
6392 | 12.6k | } |
6393 | | |
6394 | | bool LoopVectorizationPlanner::getDecisionAndClampRange( |
6395 | 1.11M | const std::function<bool(unsigned)> &Predicate, VFRange &Range) { |
6396 | 1.11M | assert(Range.End > Range.Start && "Trying to test an empty VF range."); |
6397 | 1.11M | bool PredicateAtRangeStart = Predicate(Range.Start); |
6398 | 1.11M | |
6399 | 1.43M | for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2322k ) |
6400 | 340k | if (Predicate(TmpVF) != PredicateAtRangeStart) { |
6401 | 18.6k | Range.End = TmpVF; |
6402 | 18.6k | break; |
6403 | 18.6k | } |
6404 | 1.11M | |
6405 | 1.11M | return PredicateAtRangeStart; |
6406 | 1.11M | } |
6407 | | |
6408 | | /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, |
6409 | | /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range |
6410 | | /// of VF's starting at a given VF and extending it as much as possible. Each |
6411 | | /// vectorization decision can potentially shorten this sub-range during |
6412 | | /// buildVPlan(). |
6413 | 7 | void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { |
6414 | 14 | for (unsigned VF = MinVF; VF < MaxVF + 1;) { |
6415 | 7 | VFRange SubRange = {VF, MaxVF + 1}; |
6416 | 7 | VPlans.push_back(buildVPlan(SubRange)); |
6417 | 7 | VF = SubRange.End; |
6418 | 7 | } |
6419 | 7 | } |
6420 | | |
6421 | | VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, |
6422 | 5.79k | VPlanPtr &Plan) { |
6423 | 5.79k | assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); |
6424 | 5.79k | |
6425 | 5.79k | // Look for cached value. |
6426 | 5.79k | std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); |
6427 | 5.79k | EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); |
6428 | 5.79k | if (ECEntryIt != EdgeMaskCache.end()) |
6429 | 1.17k | return ECEntryIt->second; |
6430 | 4.62k | |
6431 | 4.62k | VPValue *SrcMask = createBlockInMask(Src, Plan); |
6432 | 4.62k | |
6433 | 4.62k | // The terminator has to be a branch inst! |
6434 | 4.62k | BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); |
6435 | 4.62k | assert(BI && "Unexpected terminator found"); |
6436 | 4.62k | |
6437 | 4.62k | if (!BI->isConditional()) |
6438 | 1.31k | return EdgeMaskCache[Edge] = SrcMask; |
6439 | 3.30k | |
6440 | 3.30k | VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); |
6441 | 3.30k | assert(EdgeMask && "No Edge Mask found for condition"); |
6442 | 3.30k | |
6443 | 3.30k | if (BI->getSuccessor(0) != Dst) |
6444 | 1.51k | EdgeMask = Builder.createNot(EdgeMask); |
6445 | 3.30k | |
6446 | 3.30k | if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. |
6447 | 1.54k | EdgeMask = Builder.createAnd(EdgeMask, SrcMask); |
6448 | 3.30k | |
6449 | 3.30k | return EdgeMaskCache[Edge] = EdgeMask; |
6450 | 3.30k | } |
6451 | | |
6452 | 7.52k | VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { |
6453 | 7.52k | assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); |
6454 | 7.52k | |
6455 | 7.52k | // Look for cached value. |
6456 | 7.52k | BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); |
6457 | 7.52k | if (BCEntryIt != BlockMaskCache.end()) |
6458 | 3.62k | return BCEntryIt->second; |
6459 | 3.89k | |
6460 | 3.89k | // All-one mask is modelled as no-mask following the convention for masked |
6461 | 3.89k | // load/store/gather/scatter. Initialize BlockMask to no-mask. |
6462 | 3.89k | VPValue *BlockMask = nullptr; |
6463 | 3.89k | |
6464 | 3.89k | if (OrigLoop->getHeader() == BB) { |
6465 | 1.11k | if (!CM.blockNeedsPredication(BB)) |
6466 | 1.07k | return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. |
6467 | 35 | |
6468 | 35 | // Introduce the early-exit compare IV <= BTC to form header block mask. |
6469 | 35 | // This is used instead of IV < TC because TC may wrap, unlike BTC. |
6470 | 35 | VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); |
6471 | 35 | VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); |
6472 | 35 | BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); |
6473 | 35 | return BlockMaskCache[BB] = BlockMask; |
6474 | 35 | } |
6475 | 2.78k | |
6476 | 2.78k | // This is the block mask. We OR all incoming edges. |
6477 | 3.17k | for (auto *Predecessor : predecessors(BB))2.78k { |
6478 | 3.17k | VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); |
6479 | 3.17k | if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. |
6480 | 0 | return BlockMaskCache[BB] = EdgeMask; |
6481 | 3.17k | |
6482 | 3.17k | if (!BlockMask) { // BlockMask has its initialized nullptr value. |
6483 | 2.78k | BlockMask = EdgeMask; |
6484 | 2.78k | continue; |
6485 | 2.78k | } |
6486 | 392 | |
6487 | 392 | BlockMask = Builder.createOr(BlockMask, EdgeMask); |
6488 | 392 | } |
6489 | 2.78k | |
6490 | 2.78k | return BlockMaskCache[BB] = BlockMask; |
6491 | 2.78k | } |
6492 | | |
6493 | | VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, |
6494 | | VFRange &Range, |
6495 | 380k | VPlanPtr &Plan) { |
6496 | 380k | const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I); |
6497 | 380k | if (!IG) |
6498 | 374k | return nullptr; |
6499 | 5.28k | |
6500 | 5.28k | // Now check if IG is relevant for VF's in the given range. |
6501 | 5.28k | auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { |
6502 | 6.61k | return [=](unsigned VF) -> bool { |
6503 | 6.61k | return (VF >= 2 && // Query is illegal for VF == 1 |
6504 | 6.61k | CM.getWideningDecision(I, VF) == |
6505 | 2.92k | LoopVectorizationCostModel::CM_Interleave); |
6506 | 6.61k | }; |
6507 | 5.28k | }; |
6508 | 5.28k | if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range)) |
6509 | 3.80k | return nullptr; |
6510 | 1.48k | |
6511 | 1.48k | // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) |
6512 | 1.48k | // range. If it's the primary member of the IG construct a VPInterleaveRecipe. |
6513 | 1.48k | // Otherwise, it's an adjunct member of the IG, do not construct any Recipe. |
6514 | 1.48k | assert(I == IG->getInsertPos() && |
6515 | 1.48k | "Generating a recipe for an adjunct member of an interleave group"); |
6516 | 1.48k | |
6517 | 1.48k | VPValue *Mask = nullptr; |
6518 | 1.48k | if (Legal->isMaskRequired(I)) |
6519 | 11 | Mask = createBlockInMask(I->getParent(), Plan); |
6520 | 1.48k | |
6521 | 1.48k | return new VPInterleaveRecipe(IG, Mask); |
6522 | 1.48k | } |
6523 | | |
6524 | | VPWidenMemoryInstructionRecipe * |
6525 | | VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, |
6526 | 378k | VPlanPtr &Plan) { |
6527 | 378k | if (!isa<LoadInst>(I) && !isa<StoreInst>(I)342k ) |
6528 | 302k | return nullptr; |
6529 | 75.8k | |
6530 | 105k | auto willWiden = [&](unsigned VF) -> bool 75.8k { |
6531 | 105k | if (VF == 1) |
6532 | 39.1k | return false; |
6533 | 66.4k | if (CM.isScalarAfterVectorization(I, VF) || |
6534 | 66.4k | CM.isProfitableToScalarize(I, VF)) |
6535 | 258 | return false; |
6536 | 66.1k | LoopVectorizationCostModel::InstWidening Decision = |
6537 | 66.1k | CM.getWideningDecision(I, VF); |
6538 | 66.1k | assert(Decision != LoopVectorizationCostModel::CM_Unknown && |
6539 | 66.1k | "CM decision should be taken at this point."); |
6540 | 66.1k | assert(Decision != LoopVectorizationCostModel::CM_Interleave && |
6541 | 66.1k | "Interleave memory opportunity should be caught earlier."); |
6542 | 66.1k | return Decision != LoopVectorizationCostModel::CM_Scalarize; |
6543 | 66.1k | }; |
6544 | 75.8k | |
6545 | 75.8k | if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) |
6546 | 48.1k | return nullptr; |
6547 | 27.6k | |
6548 | 27.6k | VPValue *Mask = nullptr; |
6549 | 27.6k | if (Legal->isMaskRequired(I)) |
6550 | 106 | Mask = createBlockInMask(I->getParent(), Plan); |
6551 | 27.6k | |
6552 | 27.6k | return new VPWidenMemoryInstructionRecipe(*I, Mask); |
6553 | 27.6k | } |
6554 | | |
6555 | | VPWidenIntOrFpInductionRecipe * |
6556 | 351k | VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { |
6557 | 351k | if (PHINode *Phi = dyn_cast<PHINode>(I)) { |
6558 | 53.8k | // Check if this is an integer or fp induction. If so, build the recipe that |
6559 | 53.8k | // produces its scalar and vector values. |
6560 | 53.8k | InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); |
6561 | 53.8k | if (II.getKind() == InductionDescriptor::IK_IntInduction || |
6562 | 53.8k | II.getKind() == InductionDescriptor::IK_FpInduction16.0k ) |
6563 | 37.7k | return new VPWidenIntOrFpInductionRecipe(Phi); |
6564 | 16.0k | |
6565 | 16.0k | return nullptr; |
6566 | 16.0k | } |
6567 | 297k | |
6568 | 297k | // Optimize the special case where the source is a constant integer |
6569 | 297k | // induction variable. Notice that we can only optimize the 'trunc' case |
6570 | 297k | // because (a) FP conversions lose precision, (b) sext/zext may wrap, and |
6571 | 297k | // (c) other casts depend on pointer size. |
6572 | 297k | |
6573 | 297k | // Determine whether \p K is a truncation based on an induction variable that |
6574 | 297k | // can be optimized. |
6575 | 297k | auto isOptimizableIVTruncate = |
6576 | 297k | [&](Instruction *K) -> std::function<bool(unsigned)> { |
6577 | 21.3k | return |
6578 | 28.4k | [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; |
6579 | 21.3k | }; |
6580 | 297k | |
6581 | 297k | if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( |
6582 | 21.3k | isOptimizableIVTruncate(I), Range)) |
6583 | 5.01k | return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), |
6584 | 5.01k | cast<TruncInst>(I)); |
6585 | 292k | return nullptr; |
6586 | 292k | } |
6587 | | |
6588 | 308k | VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { |
6589 | 308k | PHINode *Phi = dyn_cast<PHINode>(I); |
6590 | 308k | if (!Phi || Phi->getParent() == OrigLoop->getHeader()16.0k ) |
6591 | 307k | return nullptr; |
6592 | 1.08k | |
6593 | 1.08k | // We know that all PHIs in non-header blocks are converted into selects, so |
6594 | 1.08k | // we don't have to worry about the insertion order and we can just use the |
6595 | 1.08k | // builder. At this point we generate the predication tree. There may be |
6596 | 1.08k | // duplications since this is a simple recursive scan, but future |
6597 | 1.08k | // optimizations will clean it up. |
6598 | 1.08k | |
6599 | 1.08k | SmallVector<VPValue *, 2> Masks; |
6600 | 1.08k | unsigned NumIncoming = Phi->getNumIncomingValues(); |
6601 | 3.71k | for (unsigned In = 0; In < NumIncoming; In++2.62k ) { |
6602 | 2.62k | VPValue *EdgeMask = |
6603 | 2.62k | createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); |
6604 | 2.62k | assert((EdgeMask || NumIncoming == 1) && |
6605 | 2.62k | "Multiple predecessors with one having a full mask"); |
6606 | 2.62k | if (EdgeMask) |
6607 | 2.62k | Masks.push_back(EdgeMask); |
6608 | 2.62k | } |
6609 | 1.08k | return new VPBlendRecipe(Phi, Masks); |
6610 | 1.08k | } |
6611 | | |
6612 | | bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, |
6613 | 292k | VFRange &Range) { |
6614 | 292k | |
6615 | 292k | bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( |
6616 | 394k | [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); |
6617 | 292k | |
6618 | 292k | if (IsPredicated) |
6619 | 2.78k | return false; |
6620 | 289k | |
6621 | 289k | auto IsVectorizableOpcode = [](unsigned Opcode) { |
6622 | 289k | switch (Opcode) { |
6623 | 289k | case Instruction::Add: |
6624 | 289k | case Instruction::And: |
6625 | 289k | case Instruction::AShr: |
6626 | 289k | case Instruction::BitCast: |
6627 | 289k | case Instruction::Br: |
6628 | 289k | case Instruction::Call: |
6629 | 289k | case Instruction::FAdd: |
6630 | 289k | case Instruction::FCmp: |
6631 | 289k | case Instruction::FDiv: |
6632 | 289k | case Instruction::FMul: |
6633 | 289k | case Instruction::FNeg: |
6634 | 289k | case Instruction::FPExt: |
6635 | 289k | case Instruction::FPToSI: |
6636 | 289k | case Instruction::FPToUI: |
6637 | 289k | case Instruction::FPTrunc: |
6638 | 289k | case Instruction::FRem: |
6639 | 289k | case Instruction::FSub: |
6640 | 289k | case Instruction::GetElementPtr: |
6641 | 289k | case Instruction::ICmp: |
6642 | 289k | case Instruction::IntToPtr: |
6643 | 289k | case Instruction::Load: |
6644 | 289k | case Instruction::LShr: |
6645 | 289k | case Instruction::Mul: |
6646 | 289k | case Instruction::Or: |
6647 | 289k | case Instruction::PHI: |
6648 | 289k | case Instruction::PtrToInt: |
6649 | 289k | case Instruction::SDiv: |
6650 | 289k | case Instruction::Select: |
6651 | 289k | case Instruction::SExt: |
6652 | 289k | case Instruction::Shl: |
6653 | 289k | case Instruction::SIToFP: |
6654 | 289k | case Instruction::SRem: |
6655 | 289k | case Instruction::Store: |
6656 | 289k | case Instruction::Sub: |
6657 | 289k | case Instruction::Trunc: |
6658 | 289k | case Instruction::UDiv: |
6659 | 289k | case Instruction::UIToFP: |
6660 | 289k | case Instruction::URem: |
6661 | 289k | case Instruction::Xor: |
6662 | 289k | case Instruction::ZExt: |
6663 | 289k | return true; |
6664 | 6 | } |
6665 | 6 | return false; |
6666 | 6 | }; |
6667 | 289k | |
6668 | 289k | if (!IsVectorizableOpcode(I->getOpcode())) |
6669 | 6 | return false; |
6670 | 289k | |
6671 | 289k | if (CallInst *CI = dyn_cast<CallInst>(I)) { |
6672 | 850 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
6673 | 850 | if (ID && (686 ID == Intrinsic::assume686 || ID == Intrinsic::lifetime_end676 || |
6674 | 686 | ID == Intrinsic::lifetime_start671 || ID == Intrinsic::sideeffect668 )) |
6675 | 19 | return false; |
6676 | 289k | } |
6677 | 289k | |
6678 | 387k | auto willWiden = [&](unsigned VF) -> bool 289k { |
6679 | 387k | if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || |
6680 | 387k | CM.isProfitableToScalarize(I, VF)137k )) |
6681 | 249k | return false; |
6682 | 137k | if (CallInst *CI = dyn_cast<CallInst>(I)) { |
6683 | 653 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
6684 | 653 | // The following case may be scalarized depending on the VF. |
6685 | 653 | // The flag shows whether we use Intrinsic or a usual Call for vectorized |
6686 | 653 | // version of the instruction. |
6687 | 653 | // Is it beneficial to perform intrinsic call compared to lib call? |
6688 | 653 | bool NeedToScalarize; |
6689 | 653 | unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); |
6690 | 653 | bool UseVectorIntrinsic = |
6691 | 653 | ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost516 ; |
6692 | 653 | return UseVectorIntrinsic || !NeedToScalarize188 ; |
6693 | 653 | } |
6694 | 137k | if (isa<LoadInst>(I) || isa<StoreInst>(I)128k ) { |
6695 | 12.8k | assert(CM.getWideningDecision(I, VF) == |
6696 | 12.8k | LoopVectorizationCostModel::CM_Scalarize && |
6697 | 12.8k | "Memory widening decisions should have been taken care by now"); |
6698 | 12.8k | return false; |
6699 | 12.8k | } |
6700 | 124k | return true; |
6701 | 124k | }; |
6702 | 289k | |
6703 | 289k | if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) |
6704 | 210k | return false; |
6705 | 79.0k | |
6706 | 79.0k | // Success: widen this instruction. We optimize the common case where |
6707 | 79.0k | // consecutive instructions can be represented by a single recipe. |
6708 | 79.0k | if (!VPBB->empty()) { |
6709 | 77.6k | VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); |
6710 | 77.6k | if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)53.5k ) |
6711 | 53.3k | return true; |
6712 | 25.7k | } |
6713 | 25.7k | |
6714 | 25.7k | VPBB->appendRecipe(new VPWidenRecipe(I)); |
6715 | 25.7k | return true; |
6716 | 25.7k | } |
6717 | | |
6718 | | VPBasicBlock *VPRecipeBuilder::handleReplication( |
6719 | | Instruction *I, VFRange &Range, VPBasicBlock *VPBB, |
6720 | | DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, |
6721 | 213k | VPlanPtr &Plan) { |
6722 | 213k | bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( |
6723 | 265k | [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, |
6724 | 213k | Range); |
6725 | 213k | |
6726 | 213k | bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( |
6727 | 264k | [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); |
6728 | 213k | |
6729 | 213k | auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); |
6730 | 213k | |
6731 | 213k | // Find if I uses a predicated instruction. If so, it will use its scalar |
6732 | 213k | // value. Avoid hoisting the insert-element which packs the scalar value into |
6733 | 213k | // a vector value, as that happens iff all users use the vector value. |
6734 | 213k | for (auto &Op : I->operands()) |
6735 | 436k | if (auto *PredInst = dyn_cast<Instruction>(Op)) |
6736 | 290k | if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) |
6737 | 1.08k | PredInst2Recipe[PredInst]->setAlsoPack(false); |
6738 | 213k | |
6739 | 213k | // Finalize the recipe for Instr, first if it is not predicated. |
6740 | 213k | if (!IsPredicated) { |
6741 | 210k | LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); |
6742 | 210k | VPBB->appendRecipe(Recipe); |
6743 | 210k | return VPBB; |
6744 | 210k | } |
6745 | 2.78k | LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); |
6746 | 2.78k | assert(VPBB->getSuccessors().empty() && |
6747 | 2.78k | "VPBB has successors when handling predicated replication."); |
6748 | 2.78k | // Record predicated instructions for above packing optimizations. |
6749 | 2.78k | PredInst2Recipe[I] = Recipe; |
6750 | 2.78k | VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); |
6751 | 2.78k | VPBlockUtils::insertBlockAfter(Region, VPBB); |
6752 | 2.78k | auto *RegSucc = new VPBasicBlock(); |
6753 | 2.78k | VPBlockUtils::insertBlockAfter(RegSucc, Region); |
6754 | 2.78k | return RegSucc; |
6755 | 2.78k | } |
6756 | | |
6757 | | VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, |
6758 | | VPRecipeBase *PredRecipe, |
6759 | 2.78k | VPlanPtr &Plan) { |
6760 | 2.78k | // Instructions marked for predication are replicated and placed under an |
6761 | 2.78k | // if-then construct to prevent side-effects. |
6762 | 2.78k | |
6763 | 2.78k | // Generate recipes to compute the block mask for this region. |
6764 | 2.78k | VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); |
6765 | 2.78k | |
6766 | 2.78k | // Build the triangular if-then region. |
6767 | 2.78k | std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); |
6768 | 2.78k | assert(Instr->getParent() && "Predicated instruction not in any basic block"); |
6769 | 2.78k | auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); |
6770 | 2.78k | auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); |
6771 | 2.78k | auto *PHIRecipe = |
6772 | 2.78k | Instr->getType()->isVoidTy() ? nullptr1.13k : new VPPredInstPHIRecipe(Instr)1.65k ; |
6773 | 2.78k | auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); |
6774 | 2.78k | auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); |
6775 | 2.78k | VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); |
6776 | 2.78k | |
6777 | 2.78k | // Note: first set Entry as region entry and then connect successors starting |
6778 | 2.78k | // from it in order, to propagate the "parent" of each VPBasicBlock. |
6779 | 2.78k | VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); |
6780 | 2.78k | VPBlockUtils::connectBlocks(Pred, Exit); |
6781 | 2.78k | |
6782 | 2.78k | return Region; |
6783 | 2.78k | } |
6784 | | |
6785 | | bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, |
6786 | 380k | VPlanPtr &Plan, VPBasicBlock *VPBB) { |
6787 | 380k | VPRecipeBase *Recipe = nullptr; |
6788 | 380k | // Check if Instr should belong to an interleave memory recipe, or already |
6789 | 380k | // does. In the latter case Instr is irrelevant. |
6790 | 380k | if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { |
6791 | 1.48k | VPBB->appendRecipe(Recipe); |
6792 | 1.48k | return true; |
6793 | 1.48k | } |
6794 | 378k | |
6795 | 378k | // Check if Instr is a memory operation that should be widened. |
6796 | 378k | if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { |
6797 | 27.6k | VPBB->appendRecipe(Recipe); |
6798 | 27.6k | return true; |
6799 | 27.6k | } |
6800 | 351k | |
6801 | 351k | // Check if Instr should form some PHI recipe. |
6802 | 351k | if ((Recipe = tryToOptimizeInduction(Instr, Range))) { |
6803 | 42.8k | VPBB->appendRecipe(Recipe); |
6804 | 42.8k | return true; |
6805 | 42.8k | } |
6806 | 308k | if ((Recipe = tryToBlend(Instr, Plan))) { |
6807 | 1.08k | VPBB->appendRecipe(Recipe); |
6808 | 1.08k | return true; |
6809 | 1.08k | } |
6810 | 307k | if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { |
6811 | 14.9k | VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); |
6812 | 14.9k | return true; |
6813 | 14.9k | } |
6814 | 292k | |
6815 | 292k | // Check if Instr is to be widened by a general VPWidenRecipe, after |
6816 | 292k | // having first checked for specific widening recipes that deal with |
6817 | 292k | // Interleave Groups, Inductions and Phi nodes. |
6818 | 292k | if (tryToWiden(Instr, VPBB, Range)) |
6819 | 79.0k | return true; |
6820 | 213k | |
6821 | 213k | return false; |
6822 | 213k | } |
6823 | | |
6824 | | void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, |
6825 | 19.8k | unsigned MaxVF) { |
6826 | 19.8k | assert(OrigLoop->empty() && "Inner loop expected."); |
6827 | 19.8k | |
6828 | 19.8k | // Collect conditions feeding internal conditional branches; they need to be |
6829 | 19.8k | // represented in VPlan for it to model masking. |
6830 | 19.8k | SmallPtrSet<Value *, 1> NeedDef; |
6831 | 19.8k | |
6832 | 19.8k | auto *Latch = OrigLoop->getLoopLatch(); |
6833 | 21.9k | for (BasicBlock *BB : OrigLoop->blocks()) { |
6834 | 21.9k | if (BB == Latch) |
6835 | 19.8k | continue; |
6836 | 2.07k | BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); |
6837 | 2.07k | if (Branch && Branch->isConditional()) |
6838 | 1.01k | NeedDef.insert(Branch->getCondition()); |
6839 | 2.07k | } |
6840 | 19.8k | |
6841 | 19.8k | // If the tail is to be folded by masking, the primary induction variable |
6842 | 19.8k | // needs to be represented in VPlan for it to model early-exit masking. |
6843 | 19.8k | if (CM.foldTailByMasking()) |
6844 | 26 | NeedDef.insert(Legal->getPrimaryInduction()); |
6845 | 19.8k | |
6846 | 19.8k | // Collect instructions from the original loop that will become trivially dead |
6847 | 19.8k | // in the vectorized loop. We don't need to vectorize these instructions. For |
6848 | 19.8k | // example, original induction update instructions can become dead because we |
6849 | 19.8k | // separately emit induction "steps" when generating code for the new loop. |
6850 | 19.8k | // Similarly, we create a new latch condition when setting up the structure |
6851 | 19.8k | // of the new loop, so the old one can become dead. |
6852 | 19.8k | SmallPtrSet<Instruction *, 4> DeadInstructions; |
6853 | 19.8k | collectTriviallyDeadInstructions(DeadInstructions); |
6854 | 19.8k | |
6855 | 58.3k | for (unsigned VF = MinVF; VF < MaxVF + 1;) { |
6856 | 38.5k | VFRange SubRange = {VF, MaxVF + 1}; |
6857 | 38.5k | VPlans.push_back( |
6858 | 38.5k | buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); |
6859 | 38.5k | VF = SubRange.End; |
6860 | 38.5k | } |
6861 | 19.8k | } |
6862 | | |
6863 | | VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( |
6864 | | VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, |
6865 | 38.5k | SmallPtrSetImpl<Instruction *> &DeadInstructions) { |
6866 | 38.5k | // Hold a mapping from predicated instructions to their recipes, in order to |
6867 | 38.5k | // fix their AlsoPack behavior if a user is determined to replicate and use a |
6868 | 38.5k | // scalar instead of vector value. |
6869 | 38.5k | DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; |
6870 | 38.5k | |
6871 | 38.5k | DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); |
6872 | 38.5k | DenseMap<Instruction *, Instruction *> SinkAfterInverse; |
6873 | 38.5k | |
6874 | 38.5k | // Create a dummy pre-entry VPBasicBlock to start building the VPlan. |
6875 | 38.5k | VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); |
6876 | 38.5k | auto Plan = llvm::make_unique<VPlan>(VPBB); |
6877 | 38.5k | |
6878 | 38.5k | VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); |
6879 | 38.5k | // Represent values that will have defs inside VPlan. |
6880 | 38.5k | for (Value *V : NeedDef) |
6881 | 1.96k | Plan->addVPValue(V); |
6882 | 38.5k | |
6883 | 38.5k | // Scan the body of the loop in a topological order to visit each basic block |
6884 | 38.5k | // after having visited its predecessor basic blocks. |
6885 | 38.5k | LoopBlocksDFS DFS(OrigLoop); |
6886 | 38.5k | DFS.perform(LI); |
6887 | 38.5k | |
6888 | 42.4k | for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { |
6889 | 42.4k | // Relevant instructions from basic block BB will be grouped into VPRecipe |
6890 | 42.4k | // ingredients and fill a new VPBasicBlock. |
6891 | 42.4k | unsigned VPBBsForBB = 0; |
6892 | 42.4k | auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); |
6893 | 42.4k | VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); |
6894 | 42.4k | VPBB = FirstVPBBForBB; |
6895 | 42.4k | Builder.setInsertPoint(VPBB); |
6896 | 42.4k | |
6897 | 42.4k | std::vector<Instruction *> Ingredients; |
6898 | 42.4k | |
6899 | 42.4k | // Organize the ingredients to vectorize from current basic block in the |
6900 | 42.4k | // right order. |
6901 | 491k | for (Instruction &I : BB->instructionsWithoutDebug()) { |
6902 | 491k | Instruction *Instr = &I; |
6903 | 491k | |
6904 | 491k | // First filter out irrelevant instructions, to ensure no recipes are |
6905 | 491k | // built for them. |
6906 | 491k | if (isa<BranchInst>(Instr) || |
6907 | 491k | DeadInstructions.find(Instr) != DeadInstructions.end()448k ) |
6908 | 108k | continue; |
6909 | 382k | |
6910 | 382k | // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct |
6911 | 382k | // member of the IG, do not construct any Recipe for it. |
6912 | 382k | const InterleaveGroup<Instruction> *IG = |
6913 | 382k | CM.getInterleavedAccessGroup(Instr); |
6914 | 382k | if (IG && Instr != IG->getInsertPos()7.54k && |
6915 | 382k | Range.Start >= 24.56k && // Query is illegal for VF == 1 |
6916 | 382k | CM.getWideningDecision(Instr, Range.Start) == |
6917 | 2.32k | LoopVectorizationCostModel::CM_Interleave) { |
6918 | 2.26k | auto SinkCandidate = SinkAfterInverse.find(Instr); |
6919 | 2.26k | if (SinkCandidate != SinkAfterInverse.end()) |
6920 | 1 | Ingredients.push_back(SinkCandidate->second); |
6921 | 2.26k | continue; |
6922 | 2.26k | } |
6923 | 380k | |
6924 | 380k | // Move instructions to handle first-order recurrences, step 1: avoid |
6925 | 380k | // handling this instruction until after we've handled the instruction it |
6926 | 380k | // should follow. |
6927 | 380k | auto SAIt = SinkAfter.find(Instr); |
6928 | 380k | if (SAIt != SinkAfter.end()) { |
6929 | 31 | LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" |
6930 | 31 | << *SAIt->second |
6931 | 31 | << " to vectorize a 1st order recurrence.\n"); |
6932 | 31 | SinkAfterInverse[SAIt->second] = Instr; |
6933 | 31 | continue; |
6934 | 31 | } |
6935 | 380k | |
6936 | 380k | Ingredients.push_back(Instr); |
6937 | 380k | |
6938 | 380k | // Move instructions to handle first-order recurrences, step 2: push the |
6939 | 380k | // instruction to be sunk at its insertion point. |
6940 | 380k | auto SAInvIt = SinkAfterInverse.find(Instr); |
6941 | 380k | if (SAInvIt != SinkAfterInverse.end()) |
6942 | 30 | Ingredients.push_back(SAInvIt->second); |
6943 | 380k | } |
6944 | 42.4k | |
6945 | 42.4k | // Introduce each ingredient into VPlan. |
6946 | 380k | for (Instruction *Instr : Ingredients) { |
6947 | 380k | if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) |
6948 | 167k | continue; |
6949 | 213k | |
6950 | 213k | // Otherwise, if all widening options failed, Instruction is to be |
6951 | 213k | // replicated. This may create a successor for VPBB. |
6952 | 213k | VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( |
6953 | 213k | Instr, Range, VPBB, PredInst2Recipe, Plan); |
6954 | 213k | if (NextVPBB != VPBB) { |
6955 | 2.78k | VPBB = NextVPBB; |
6956 | 2.78k | VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)121 |
6957 | 2.78k | : ""2.66k ); |
6958 | 2.78k | } |
6959 | 213k | } |
6960 | 42.4k | } |
6961 | 38.5k | |
6962 | 38.5k | // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks |
6963 | 38.5k | // may also be empty, such as the last one VPBB, reflecting original |
6964 | 38.5k | // basic-blocks with no recipes. |
6965 | 38.5k | VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); |
6966 | 38.5k | assert(PreEntry->empty() && "Expecting empty pre-entry block."); |
6967 | 38.5k | VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); |
6968 | 38.5k | VPBlockUtils::disconnectBlocks(PreEntry, Entry); |
6969 | 38.5k | delete PreEntry; |
6970 | 38.5k | |
6971 | 38.5k | std::string PlanName; |
6972 | 38.5k | raw_string_ostream RSO(PlanName); |
6973 | 38.5k | unsigned VF = Range.Start; |
6974 | 38.5k | Plan->addVF(VF); |
6975 | 38.5k | RSO << "Initial VPlan for VF={" << VF; |
6976 | 51.6k | for (VF *= 2; VF < Range.End; VF *= 213.0k ) { |
6977 | 13.0k | Plan->addVF(VF); |
6978 | 13.0k | RSO << "," << VF; |
6979 | 13.0k | } |
6980 | 38.5k | RSO << "},UF>=1"; |
6981 | 38.5k | RSO.flush(); |
6982 | 38.5k | Plan->setName(PlanName); |
6983 | 38.5k | |
6984 | 38.5k | return Plan; |
6985 | 38.5k | } |
6986 | | |
6987 | 7 | VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { |
6988 | 7 | // Outer loop handling: They may require CFG and instruction level |
6989 | 7 | // transformations before even evaluating whether vectorization is profitable. |
6990 | 7 | // Since we cannot modify the incoming IR, we need to build VPlan upfront in |
6991 | 7 | // the vectorization pipeline. |
6992 | 7 | assert(!OrigLoop->empty()); |
6993 | 7 | assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); |
6994 | 7 | |
6995 | 7 | // Create new empty VPlan |
6996 | 7 | auto Plan = llvm::make_unique<VPlan>(); |
6997 | 7 | |
6998 | 7 | // Build hierarchical CFG |
6999 | 7 | VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); |
7000 | 7 | HCFGBuilder.buildHierarchicalCFG(); |
7001 | 7 | |
7002 | 14 | for (unsigned VF = Range.Start; VF < Range.End; VF *= 27 ) |
7003 | 7 | Plan->addVF(VF); |
7004 | 7 | |
7005 | 7 | if (EnableVPlanPredication) { |
7006 | 0 | VPlanPredicator VPP(*Plan); |
7007 | 0 | VPP.predicate(); |
7008 | 0 |
|
7009 | 0 | // Avoid running transformation to recipes until masked code generation in |
7010 | 0 | // VPlan-native path is in place. |
7011 | 0 | return Plan; |
7012 | 0 | } |
7013 | 7 | |
7014 | 7 | SmallPtrSet<Instruction *, 1> DeadInstructions; |
7015 | 7 | VPlanHCFGTransforms::VPInstructionsToVPRecipes( |
7016 | 7 | Plan, Legal->getInductionVars(), DeadInstructions); |
7017 | 7 | |
7018 | 7 | return Plan; |
7019 | 7 | } |
7020 | | |
7021 | | Value* LoopVectorizationPlanner::VPCallbackILV:: |
7022 | 1.34k | getOrCreateVectorValues(Value *V, unsigned Part) { |
7023 | 1.34k | return ILV.getOrCreateVectorValue(V, Part); |
7024 | 1.34k | } |
7025 | | |
7026 | 0 | void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { |
7027 | 0 | O << " +\n" |
7028 | 0 | << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; |
7029 | 0 | IG->getInsertPos()->printAsOperand(O, false); |
7030 | 0 | if (User) { |
7031 | 0 | O << ", "; |
7032 | 0 | User->getOperand(0)->printAsOperand(O); |
7033 | 0 | } |
7034 | 0 | O << "\\l\""; |
7035 | 0 | for (unsigned i = 0; i < IG->getFactor(); ++i) |
7036 | 0 | if (Instruction *I = IG->getMember(i)) |
7037 | 0 | O << " +\n" |
7038 | 0 | << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; |
7039 | 0 | } |
7040 | | |
7041 | 14.2k | void VPWidenRecipe::execute(VPTransformState &State) { |
7042 | 14.2k | for (auto &Instr : make_range(Begin, End)) |
7043 | 50.4k | State.ILV->widenInstruction(Instr); |
7044 | 14.2k | } |
7045 | | |
7046 | 19.2k | void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { |
7047 | 19.2k | assert(!State.Instance && "Int or FP induction being replicated."); |
7048 | 19.2k | State.ILV->widenIntOrFpInduction(IV, Trunc); |
7049 | 19.2k | } |
7050 | | |
7051 | 5.05k | void VPWidenPHIRecipe::execute(VPTransformState &State) { |
7052 | 5.05k | State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); |
7053 | 5.05k | } |
7054 | | |
7055 | 168 | void VPBlendRecipe::execute(VPTransformState &State) { |
7056 | 168 | State.ILV->setDebugLocFromInst(State.Builder, Phi); |
7057 | 168 | // We know that all PHIs in non-header blocks are converted into |
7058 | 168 | // selects, so we don't have to worry about the insertion order and we |
7059 | 168 | // can just use the builder. |
7060 | 168 | // At this point we generate the predication tree. There may be |
7061 | 168 | // duplications since this is a simple recursive scan, but future |
7062 | 168 | // optimizations will clean it up. |
7063 | 168 | |
7064 | 168 | unsigned NumIncoming = Phi->getNumIncomingValues(); |
7065 | 168 | |
7066 | 168 | assert((User || NumIncoming == 1) && |
7067 | 168 | "Multiple predecessors with predecessors having a full mask"); |
7068 | 168 | // Generate a sequence of selects of the form: |
7069 | 168 | // SELECT(Mask3, In3, |
7070 | 168 | // SELECT(Mask2, In2, |
7071 | 168 | // ( ...))) |
7072 | 168 | InnerLoopVectorizer::VectorParts Entry(State.UF); |
7073 | 536 | for (unsigned In = 0; In < NumIncoming; ++In368 ) { |
7074 | 980 | for (unsigned Part = 0; Part < State.UF; ++Part612 ) { |
7075 | 612 | // We might have single edge PHIs (blocks) - use an identity |
7076 | 612 | // 'select' for the first PHI operand. |
7077 | 612 | Value *In0 = |
7078 | 612 | State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); |
7079 | 612 | if (In == 0) |
7080 | 277 | Entry[Part] = In0; // Initialize with the first incoming value. |
7081 | 335 | else { |
7082 | 335 | // Select between the current value and the previous incoming edge |
7083 | 335 | // based on the incoming mask. |
7084 | 335 | Value *Cond = State.get(User->getOperand(In), Part); |
7085 | 335 | Entry[Part] = |
7086 | 335 | State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); |
7087 | 335 | } |
7088 | 612 | } |
7089 | 368 | } |
7090 | 445 | for (unsigned Part = 0; Part < State.UF; ++Part277 ) |
7091 | 277 | State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); |
7092 | 168 | } |
7093 | | |
7094 | 669 | void VPInterleaveRecipe::execute(VPTransformState &State) { |
7095 | 669 | assert(!State.Instance && "Interleave group being replicated."); |
7096 | 669 | if (!User) |
7097 | 658 | return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); |
7098 | 11 | |
7099 | 11 | // Last (and currently only) operand is a mask. |
7100 | 11 | InnerLoopVectorizer::VectorParts MaskValues(State.UF); |
7101 | 11 | VPValue *Mask = User->getOperand(User->getNumOperands() - 1); |
7102 | 22 | for (unsigned Part = 0; Part < State.UF; ++Part11 ) |
7103 | 11 | MaskValues[Part] = State.get(Mask, Part); |
7104 | 11 | State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); |
7105 | 11 | } |
7106 | | |
7107 | 37.2k | void VPReplicateRecipe::execute(VPTransformState &State) { |
7108 | 37.2k | if (State.Instance) { // Generate a single instance. |
7109 | 948 | State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated); |
7110 | 948 | // Insert scalar instance packing it into a vector. |
7111 | 948 | if (AlsoPack && State.VF > 1222 ) { |
7112 | 182 | // If we're constructing lane 0, initialize to start from undef. |
7113 | 182 | if (State.Instance->Lane == 0) { |
7114 | 46 | Value *Undef = |
7115 | 46 | UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); |
7116 | 46 | State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); |
7117 | 46 | } |
7118 | 182 | State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); |
7119 | 182 | } |
7120 | 948 | return; |
7121 | 948 | } |
7122 | 36.2k | |
7123 | 36.2k | // Generate scalar instances for all VF lanes of all UF parts, unless the |
7124 | 36.2k | // instruction is uniform inwhich case generate only the first lane for each |
7125 | 36.2k | // of the UF parts. |
7126 | 36.2k | unsigned EndLane = IsUniform ? 135.6k : State.VF651 ; |
7127 | 104k | for (unsigned Part = 0; Part < State.UF; ++Part68.3k ) |
7128 | 139k | for (unsigned Lane = 0; 68.3k Lane < EndLane; ++Lane70.7k ) |
7129 | 70.7k | State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated); |
7130 | 36.2k | } |
7131 | | |
7132 | 948 | void VPBranchOnMaskRecipe::execute(VPTransformState &State) { |
7133 | 948 | assert(State.Instance && "Branch on Mask works only on single instance."); |
7134 | 948 | |
7135 | 948 | unsigned Part = State.Instance->Part; |
7136 | 948 | unsigned Lane = State.Instance->Lane; |
7137 | 948 | |
7138 | 948 | Value *ConditionBit = nullptr; |
7139 | 948 | if (!User) // Block in mask is all-one. |
7140 | 0 | ConditionBit = State.Builder.getTrue(); |
7141 | 948 | else { |
7142 | 948 | VPValue *BlockInMask = User->getOperand(0); |
7143 | 948 | ConditionBit = State.get(BlockInMask, Part); |
7144 | 948 | if (ConditionBit->getType()->isVectorTy()) |
7145 | 560 | ConditionBit = State.Builder.CreateExtractElement( |
7146 | 560 | ConditionBit, State.Builder.getInt32(Lane)); |
7147 | 948 | } |
7148 | 948 | |
7149 | 948 | // Replace the temporary unreachable terminator with a new conditional branch, |
7150 | 948 | // whose two destinations will be set later when they are created. |
7151 | 948 | auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); |
7152 | 948 | assert(isa<UnreachableInst>(CurrentTerminator) && |
7153 | 948 | "Expected to replace unreachable terminator with conditional branch."); |
7154 | 948 | auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); |
7155 | 948 | CondBr->setSuccessor(0, nullptr); |
7156 | 948 | ReplaceInstWithInst(CurrentTerminator, CondBr); |
7157 | 948 | } |
7158 | | |
7159 | 512 | void VPPredInstPHIRecipe::execute(VPTransformState &State) { |
7160 | 512 | assert(State.Instance && "Predicated instruction PHI works per instance."); |
7161 | 512 | Instruction *ScalarPredInst = cast<Instruction>( |
7162 | 512 | State.ValueMap.getScalarValue(PredInst, *State.Instance)); |
7163 | 512 | BasicBlock *PredicatedBB = ScalarPredInst->getParent(); |
7164 | 512 | BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); |
7165 | 512 | assert(PredicatingBB && "Predicated block has no single predecessor."); |
7166 | 512 | |
7167 | 512 | // By current pack/unpack logic we need to generate only a single phi node: if |
7168 | 512 | // a vector value for the predicated instruction exists at this point it means |
7169 | 512 | // the instruction has vector users only, and a phi for the vector value is |
7170 | 512 | // needed. In this case the recipe of the predicated instruction is marked to |
7171 | 512 | // also do that packing, thereby "hoisting" the insert-element sequence. |
7172 | 512 | // Otherwise, a phi node for the scalar value is needed. |
7173 | 512 | unsigned Part = State.Instance->Part; |
7174 | 512 | if (State.ValueMap.hasVectorValue(PredInst, Part)) { |
7175 | 182 | Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); |
7176 | 182 | InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); |
7177 | 182 | PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); |
7178 | 182 | VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. |
7179 | 182 | VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. |
7180 | 182 | State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. |
7181 | 330 | } else { |
7182 | 330 | Type *PredInstType = PredInst->getType(); |
7183 | 330 | PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); |
7184 | 330 | Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); |
7185 | 330 | Phi->addIncoming(ScalarPredInst, PredicatedBB); |
7186 | 330 | State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); |
7187 | 330 | } |
7188 | 512 | } |
7189 | | |
7190 | 21.6k | void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { |
7191 | 21.6k | if (!User) |
7192 | 21.5k | return State.ILV->vectorizeMemoryInstruction(&Instr); |
7193 | 95 | |
7194 | 95 | // Last (and currently only) operand is a mask. |
7195 | 95 | InnerLoopVectorizer::VectorParts MaskValues(State.UF); |
7196 | 95 | VPValue *Mask = User->getOperand(User->getNumOperands() - 1); |
7197 | 316 | for (unsigned Part = 0; Part < State.UF; ++Part221 ) |
7198 | 221 | MaskValues[Part] = State.get(Mask, Part); |
7199 | 95 | State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); |
7200 | 95 | } |
7201 | | |
7202 | | // Process the loop in the VPlan-native vectorization path. This path builds |
7203 | | // VPlan upfront in the vectorization pipeline, which allows to apply |
7204 | | // VPlan-to-VPlan transformations from the very beginning without modifying the |
7205 | | // input LLVM IR. |
7206 | | static bool processLoopInVPlanNativePath( |
7207 | | Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, |
7208 | | LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, |
7209 | | TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, |
7210 | | OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, |
7211 | 7 | ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { |
7212 | 7 | |
7213 | 7 | assert(EnableVPlanNativePath && "VPlan-native path is disabled."); |
7214 | 7 | Function *F = L->getHeader()->getParent(); |
7215 | 7 | InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); |
7216 | 7 | LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, |
7217 | 7 | &Hints, IAI); |
7218 | 7 | // Use the planner for outer loop vectorization. |
7219 | 7 | // TODO: CM is not used at this point inside the planner. Turn CM into an |
7220 | 7 | // optional argument if we don't need it in the future. |
7221 | 7 | LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM); |
7222 | 7 | |
7223 | 7 | // Get user vectorization factor. |
7224 | 7 | const unsigned UserVF = Hints.getWidth(); |
7225 | 7 | |
7226 | 7 | // Check the function attributes and profiles to find out if this function |
7227 | 7 | // should be optimized for size. |
7228 | 7 | bool OptForSize = |
7229 | 7 | Hints.getForce() != LoopVectorizeHints::FK_Enabled && |
7230 | 7 | (0 F->hasOptSize()0 || |
7231 | 0 | llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); |
7232 | 7 | |
7233 | 7 | // Plan how to best vectorize, return the best VF and its cost. |
7234 | 7 | const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); |
7235 | 7 | |
7236 | 7 | // If we are stress testing VPlan builds, do not attempt to generate vector |
7237 | 7 | // code. Masked vector code generation support will follow soon. |
7238 | 7 | // Also, do not attempt to vectorize if no vector code will be produced. |
7239 | 7 | if (VPlanBuildStressTest || EnableVPlanPredication || |
7240 | 7 | VectorizationFactor::Disabled() == VF) |
7241 | 0 | return false; |
7242 | 7 | |
7243 | 7 | LVP.setBestPlan(VF.Width, 1); |
7244 | 7 | |
7245 | 7 | InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, |
7246 | 7 | &CM); |
7247 | 7 | LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" |
7248 | 7 | << L->getHeader()->getParent()->getName() << "\"\n"); |
7249 | 7 | LVP.executePlan(LB, DT); |
7250 | 7 | |
7251 | 7 | // Mark the loop as already vectorized to avoid vectorizing again. |
7252 | 7 | Hints.setAlreadyVectorized(); |
7253 | 7 | |
7254 | 7 | LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); |
7255 | 7 | return true; |
7256 | 7 | } |
7257 | | |
7258 | 146k | bool LoopVectorizePass::processLoop(Loop *L) { |
7259 | 146k | assert((EnableVPlanNativePath || L->empty()) && |
7260 | 146k | "VPlan-native path is not enabled. Only process inner loops."); |
7261 | 146k | |
7262 | | #ifndef NDEBUG |
7263 | | const std::string DebugLocStr = getDebugLocString(L); |
7264 | | #endif /* NDEBUG */ |
7265 | | |
7266 | 146k | LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" |
7267 | 146k | << L->getHeader()->getParent()->getName() << "\" from " |
7268 | 146k | << DebugLocStr << "\n"); |
7269 | 146k | |
7270 | 146k | LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); |
7271 | 146k | |
7272 | 146k | LLVM_DEBUG( |
7273 | 146k | dbgs() << "LV: Loop hints:" |
7274 | 146k | << " force=" |
7275 | 146k | << (Hints.getForce() == LoopVectorizeHints::FK_Disabled |
7276 | 146k | ? "disabled" |
7277 | 146k | : (Hints.getForce() == LoopVectorizeHints::FK_Enabled |
7278 | 146k | ? "enabled" |
7279 | 146k | : "?")) |
7280 | 146k | << " width=" << Hints.getWidth() |
7281 | 146k | << " unroll=" << Hints.getInterleave() << "\n"); |
7282 | 146k | |
7283 | 146k | // Function containing loop |
7284 | 146k | Function *F = L->getHeader()->getParent(); |
7285 | 146k | |
7286 | 146k | // Looking at the diagnostic output is the only way to determine if a loop |
7287 | 146k | // was vectorized (other than looking at the IR or machine code), so it |
7288 | 146k | // is important to generate an optimization remark for each loop. Most of |
7289 | 146k | // these messages are generated as OptimizationRemarkAnalysis. Remarks |
7290 | 146k | // generated as OptimizationRemark and OptimizationRemarkMissed are |
7291 | 146k | // less verbose reporting vectorized loops and unvectorized loops that may |
7292 | 146k | // benefit from vectorization, respectively. |
7293 | 146k | |
7294 | 146k | if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { |
7295 | 5.70k | LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); |
7296 | 5.70k | return false; |
7297 | 5.70k | } |
7298 | 141k | |
7299 | 141k | PredicatedScalarEvolution PSE(*SE, *L); |
7300 | 141k | |
7301 | 141k | // Check if it is legal to vectorize the loop. |
7302 | 141k | LoopVectorizationRequirements Requirements(*ORE); |
7303 | 141k | LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, |
7304 | 141k | &Requirements, &Hints, DB, AC); |
7305 | 141k | if (!LVL.canVectorize(EnableVPlanNativePath)) { |
7306 | 121k | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); |
7307 | 121k | Hints.emitRemarkWithHints(); |
7308 | 121k | return false; |
7309 | 121k | } |
7310 | 19.9k | |
7311 | 19.9k | // Check the function attributes and profiles to find out if this function |
7312 | 19.9k | // should be optimized for size. |
7313 | 19.9k | bool OptForSize = |
7314 | 19.9k | Hints.getForce() != LoopVectorizeHints::FK_Enabled && |
7315 | 19.9k | (19.9k F->hasOptSize()19.9k || |
7316 | 19.9k | llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)19.9k ); |
7317 | 19.9k | |
7318 | 19.9k | // Entrance to the VPlan-native vectorization path. Outer loops are processed |
7319 | 19.9k | // here. They may require CFG and instruction level transformations before |
7320 | 19.9k | // even evaluating whether vectorization is profitable. Since we cannot modify |
7321 | 19.9k | // the incoming IR, we need to build VPlan upfront in the vectorization |
7322 | 19.9k | // pipeline. |
7323 | 19.9k | if (!L->empty()) |
7324 | 7 | return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, |
7325 | 7 | ORE, BFI, PSI, Hints); |
7326 | 19.9k | |
7327 | 19.9k | assert(L->empty() && "Inner loop expected."); |
7328 | 19.9k | // Check the loop for a trip count threshold: vectorize loops with a tiny trip |
7329 | 19.9k | // count by optimizing for size, to minimize overheads. |
7330 | 19.9k | // Prefer constant trip counts over profile data, over upper bound estimate. |
7331 | 19.9k | unsigned ExpectedTC = 0; |
7332 | 19.9k | bool HasExpectedTC = false; |
7333 | 19.9k | if (const SCEVConstant *ConstExits = |
7334 | 11.1k | dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) { |
7335 | 11.1k | const APInt &ExitsCount = ConstExits->getAPInt(); |
7336 | 11.1k | // We are interested in small values for ExpectedTC. Skip over those that |
7337 | 11.1k | // can't fit an unsigned. |
7338 | 11.1k | if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) { |
7339 | 11.1k | ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1; |
7340 | 11.1k | HasExpectedTC = true; |
7341 | 11.1k | } |
7342 | 11.1k | } |
7343 | 19.9k | // ExpectedTC may be large because it's bound by a variable. Check |
7344 | 19.9k | // profiling information to validate we should vectorize. |
7345 | 19.9k | if (!HasExpectedTC && LoopVectorizeWithBlockFrequency8.82k ) { |
7346 | 8.82k | auto EstimatedTC = getLoopEstimatedTripCount(L); |
7347 | 8.82k | if (EstimatedTC) { |
7348 | 11 | ExpectedTC = *EstimatedTC; |
7349 | 11 | HasExpectedTC = true; |
7350 | 11 | } |
7351 | 8.82k | } |
7352 | 19.9k | if (!HasExpectedTC) { |
7353 | 8.81k | ExpectedTC = SE->getSmallConstantMaxTripCount(L); |
7354 | 8.81k | HasExpectedTC = (ExpectedTC > 0); |
7355 | 8.81k | } |
7356 | 19.9k | |
7357 | 19.9k | if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold13.3k ) { |
7358 | 184 | LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " |
7359 | 184 | << "This loop is worth vectorizing only if no scalar " |
7360 | 184 | << "iteration overheads are incurred."); |
7361 | 184 | if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) |
7362 | 184 | LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); |
7363 | 184 | else { |
7364 | 182 | LLVM_DEBUG(dbgs() << "\n"); |
7365 | 182 | // Loops with a very small trip count are considered for vectorization |
7366 | 182 | // under OptForSize, thereby making sure the cost of their loop body is |
7367 | 182 | // dominant, free of runtime guards and scalar iteration overheads. |
7368 | 182 | OptForSize = true; |
7369 | 182 | } |
7370 | 184 | } |
7371 | 19.9k | |
7372 | 19.9k | // Check the function attributes to see if implicit floats are allowed. |
7373 | 19.9k | // FIXME: This check doesn't seem possibly correct -- what if the loop is |
7374 | 19.9k | // an integer loop and the vector instructions selected are purely integer |
7375 | 19.9k | // vector instructions? |
7376 | 19.9k | if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { |
7377 | 1 | LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" |
7378 | 1 | "attribute is used.\n"); |
7379 | 1 | ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), |
7380 | 1 | "NoImplicitFloat", L) |
7381 | 1 | << "loop not vectorized due to NoImplicitFloat attribute"); |
7382 | 1 | Hints.emitRemarkWithHints(); |
7383 | 1 | return false; |
7384 | 1 | } |
7385 | 19.9k | |
7386 | 19.9k | // Check if the target supports potentially unsafe FP vectorization. |
7387 | 19.9k | // FIXME: Add a check for the type of safety issue (denormal, signaling) |
7388 | 19.9k | // for the target we're vectorizing for, to make sure none of the |
7389 | 19.9k | // additional fp-math flags can help. |
7390 | 19.9k | if (Hints.isPotentiallyUnsafe() && |
7391 | 19.9k | TTI->isFPVectorizationPotentiallyUnsafe()10.7k ) { |
7392 | 0 | LLVM_DEBUG( |
7393 | 0 | dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); |
7394 | 0 | ORE->emit( |
7395 | 0 | createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) |
7396 | 0 | << "loop not vectorized due to unsafe FP support."); |
7397 | 0 | Hints.emitRemarkWithHints(); |
7398 | 0 | return false; |
7399 | 0 | } |
7400 | 19.9k | |
7401 | 19.9k | bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); |
7402 | 19.9k | InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); |
7403 | 19.9k | |
7404 | 19.9k | // If an override option has been passed in for interleaved accesses, use it. |
7405 | 19.9k | if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) |
7406 | 72 | UseInterleaved = EnableInterleavedMemAccesses; |
7407 | 19.9k | |
7408 | 19.9k | // Analyze interleaved memory accesses. |
7409 | 19.9k | if (UseInterleaved) { |
7410 | 19.4k | IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); |
7411 | 19.4k | } |
7412 | 19.9k | |
7413 | 19.9k | // Use the cost model. |
7414 | 19.9k | LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, |
7415 | 19.9k | &Hints, IAI); |
7416 | 19.9k | CM.collectValuesToIgnore(); |
7417 | 19.9k | |
7418 | 19.9k | // Use the planner for vectorization. |
7419 | 19.9k | LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); |
7420 | 19.9k | |
7421 | 19.9k | // Get user vectorization factor. |
7422 | 19.9k | unsigned UserVF = Hints.getWidth(); |
7423 | 19.9k | |
7424 | 19.9k | // Plan how to best vectorize, return the best VF and its cost. |
7425 | 19.9k | Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF); |
7426 | 19.9k | |
7427 | 19.9k | VectorizationFactor VF = VectorizationFactor::Disabled(); |
7428 | 19.9k | unsigned IC = 1; |
7429 | 19.9k | unsigned UserIC = Hints.getInterleave(); |
7430 | 19.9k | |
7431 | 19.9k | if (MaybeVF) { |
7432 | 19.8k | VF = *MaybeVF; |
7433 | 19.8k | // Select the interleave count. |
7434 | 19.8k | IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); |
7435 | 19.8k | } |
7436 | 19.9k | |
7437 | 19.9k | // Identify the diagnostic messages that should be produced. |
7438 | 19.9k | std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; |
7439 | 19.9k | bool VectorizeLoop = true, InterleaveLoop = true; |
7440 | 19.9k | if (Requirements.doesNotMeet(F, L, Hints)) { |
7441 | 1.49k | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " |
7442 | 1.49k | "requirements.\n"); |
7443 | 1.49k | Hints.emitRemarkWithHints(); |
7444 | 1.49k | return false; |
7445 | 1.49k | } |
7446 | 18.4k | |
7447 | 18.4k | if (VF.Width == 1) { |
7448 | 3.15k | LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); |
7449 | 3.15k | VecDiagMsg = std::make_pair( |
7450 | 3.15k | "VectorizationNotBeneficial", |
7451 | 3.15k | "the cost-model indicates that vectorization is not beneficial"); |
7452 | 3.15k | VectorizeLoop = false; |
7453 | 3.15k | } |
7454 | 18.4k | |
7455 | 18.4k | if (!MaybeVF && UserIC > 171 ) { |
7456 | 1 | // Tell the user interleaving was avoided up-front, despite being explicitly |
7457 | 1 | // requested. |
7458 | 1 | LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " |
7459 | 1 | "interleaving should be avoided up front\n"); |
7460 | 1 | IntDiagMsg = std::make_pair( |
7461 | 1 | "InterleavingAvoided", |
7462 | 1 | "Ignoring UserIC, because interleaving was avoided up front"); |
7463 | 1 | InterleaveLoop = false; |
7464 | 18.4k | } else if (IC == 1 && UserIC <= 12.96k ) { |
7465 | 2.81k | // Tell the user interleaving is not beneficial. |
7466 | 2.81k | LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); |
7467 | 2.81k | IntDiagMsg = std::make_pair( |
7468 | 2.81k | "InterleavingNotBeneficial", |
7469 | 2.81k | "the cost-model indicates that interleaving is not beneficial"); |
7470 | 2.81k | InterleaveLoop = false; |
7471 | 2.81k | if (UserIC == 1) { |
7472 | 411 | IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; |
7473 | 411 | IntDiagMsg.second += |
7474 | 411 | " and is explicitly disabled or interleave count is set to 1"; |
7475 | 411 | } |
7476 | 15.6k | } else if (IC > 1 && UserIC == 115.4k ) { |
7477 | 118 | // Tell the user interleaving is beneficial, but it explicitly disabled. |
7478 | 118 | LLVM_DEBUG( |
7479 | 118 | dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); |
7480 | 118 | IntDiagMsg = std::make_pair( |
7481 | 118 | "InterleavingBeneficialButDisabled", |
7482 | 118 | "the cost-model indicates that interleaving is beneficial " |
7483 | 118 | "but is explicitly disabled or interleave count is set to 1"); |
7484 | 118 | InterleaveLoop = false; |
7485 | 118 | } |
7486 | 18.4k | |
7487 | 18.4k | // Override IC if user provided an interleave count. |
7488 | 18.4k | IC = UserIC > 0 ? UserIC685 : IC17.7k ; |
7489 | 18.4k | |
7490 | 18.4k | // Emit diagnostic messages, if any. |
7491 | 18.4k | const char *VAPassName = Hints.vectorizeAnalysisPassName(); |
7492 | 18.4k | if (!VectorizeLoop && !InterleaveLoop3.15k ) { |
7493 | 1.40k | // Do not vectorize or interleaving the loop. |
7494 | 1.40k | ORE->emit([&]() { |
7495 | 5 | return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, |
7496 | 5 | L->getStartLoc(), L->getHeader()) |
7497 | 5 | << VecDiagMsg.second; |
7498 | 5 | }); |
7499 | 1.40k | ORE->emit([&]() { |
7500 | 5 | return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, |
7501 | 5 | L->getStartLoc(), L->getHeader()) |
7502 | 5 | << IntDiagMsg.second; |
7503 | 5 | }); |
7504 | 1.40k | return false; |
7505 | 17.0k | } else if (!VectorizeLoop && InterleaveLoop1.74k ) { |
7506 | 1.74k | LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); |
7507 | 1.74k | ORE->emit([&]() { |
7508 | 2 | return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, |
7509 | 2 | L->getStartLoc(), L->getHeader()) |
7510 | 2 | << VecDiagMsg.second; |
7511 | 2 | }); |
7512 | 15.3k | } else if (VectorizeLoop && !InterleaveLoop) { |
7513 | 1.52k | LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width |
7514 | 1.52k | << ") in " << DebugLocStr << '\n'); |
7515 | 1.52k | ORE->emit([&]() { |
7516 | 5 | return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, |
7517 | 5 | L->getStartLoc(), L->getHeader()) |
7518 | 5 | << IntDiagMsg.second; |
7519 | 5 | }); |
7520 | 13.7k | } else if (VectorizeLoop && InterleaveLoop) { |
7521 | 13.7k | LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width |
7522 | 13.7k | << ") in " << DebugLocStr << '\n'); |
7523 | 13.7k | LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); |
7524 | 13.7k | } |
7525 | 18.4k | |
7526 | 18.4k | LVP.setBestPlan(VF.Width, IC); |
7527 | 17.0k | |
7528 | 17.0k | using namespace ore; |
7529 | 17.0k | bool DisableRuntimeUnroll = false; |
7530 | 17.0k | MDNode *OrigLoopID = L->getLoopID(); |
7531 | 17.0k | |
7532 | 17.0k | if (!VectorizeLoop) { |
7533 | 1.74k | assert(IC > 1 && "interleave count should not be 1 or 0"); |
7534 | 1.74k | // If we decided that it is not legal to vectorize the loop, then |
7535 | 1.74k | // interleave it. |
7536 | 1.74k | InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, |
7537 | 1.74k | &CM); |
7538 | 1.74k | LVP.executePlan(Unroller, DT); |
7539 | 1.74k | |
7540 | 1.74k | ORE->emit([&]() { |
7541 | 2 | return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), |
7542 | 2 | L->getHeader()) |
7543 | 2 | << "interleaved loop (interleaved count: " |
7544 | 2 | << NV("InterleaveCount", IC) << ")"; |
7545 | 2 | }); |
7546 | 15.3k | } else { |
7547 | 15.3k | // If we decided that it is *legal* to vectorize the loop, then do it. |
7548 | 15.3k | InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, |
7549 | 15.3k | &LVL, &CM); |
7550 | 15.3k | LVP.executePlan(LB, DT); |
7551 | 15.3k | ++LoopsVectorized; |
7552 | 15.3k | |
7553 | 15.3k | // Add metadata to disable runtime unrolling a scalar loop when there are |
7554 | 15.3k | // no runtime checks about strides and memory. A scalar loop that is |
7555 | 15.3k | // rarely used is not worth unrolling. |
7556 | 15.3k | if (!LB.areSafetyChecksAdded()) |
7557 | 12.6k | DisableRuntimeUnroll = true; |
7558 | 15.3k | |
7559 | 15.3k | // Report the vectorization decision. |
7560 | 15.3k | ORE->emit([&]() { |
7561 | 12 | return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), |
7562 | 12 | L->getHeader()) |
7563 | 12 | << "vectorized loop (vectorization width: " |
7564 | 12 | << NV("VectorizationFactor", VF.Width) |
7565 | 12 | << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; |
7566 | 12 | }); |
7567 | 15.3k | } |
7568 | 17.0k | |
7569 | 17.0k | Optional<MDNode *> RemainderLoopID = |
7570 | 17.0k | makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, |
7571 | 17.0k | LLVMLoopVectorizeFollowupEpilogue}); |
7572 | 17.0k | if (RemainderLoopID.hasValue()) { |
7573 | 1 | L->setLoopID(RemainderLoopID.getValue()); |
7574 | 17.0k | } else { |
7575 | 17.0k | if (DisableRuntimeUnroll) |
7576 | 12.6k | AddRuntimeUnrollDisableMetaData(L); |
7577 | 17.0k | |
7578 | 17.0k | // Mark the loop as already vectorized to avoid vectorizing again. |
7579 | 17.0k | Hints.setAlreadyVectorized(); |
7580 | 17.0k | } |
7581 | 17.0k | |
7582 | 17.0k | LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); |
7583 | 17.0k | return true; |
7584 | 18.4k | } |
7585 | | |
7586 | | bool LoopVectorizePass::runImpl( |
7587 | | Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, |
7588 | | DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, |
7589 | | DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, |
7590 | | std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, |
7591 | 280k | OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { |
7592 | 280k | SE = &SE_; |
7593 | 280k | LI = &LI_; |
7594 | 280k | TTI = &TTI_; |
7595 | 280k | DT = &DT_; |
7596 | 280k | BFI = &BFI_; |
7597 | 280k | TLI = TLI_; |
7598 | 280k | AA = &AA_; |
7599 | 280k | AC = &AC_; |
7600 | 280k | GetLAA = &GetLAA_; |
7601 | 280k | DB = &DB_; |
7602 | 280k | ORE = &ORE_; |
7603 | 280k | PSI = PSI_; |
7604 | 280k | |
7605 | 280k | // Don't attempt if |
7606 | 280k | // 1. the target claims to have no vector registers, and |
7607 | 280k | // 2. interleaving won't help ILP. |
7608 | 280k | // |
7609 | 280k | // The second condition is necessary because, even if the target has no |
7610 | 280k | // vector registers, loop vectorization may still enable scalar |
7611 | 280k | // interleaving. |
7612 | 280k | if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 22.24k ) |
7613 | 2.17k | return false; |
7614 | 278k | |
7615 | 278k | bool Changed = false; |
7616 | 278k | |
7617 | 278k | // The vectorizer requires loops to be in simplified form. |
7618 | 278k | // Since simplification may add new inner loops, it has to run before the |
7619 | 278k | // legality and profitability checks. This means running the loop vectorizer |
7620 | 278k | // will simplify all loops, regardless of whether anything end up being |
7621 | 278k | // vectorized. |
7622 | 278k | for (auto &L : *LI) |
7623 | 129k | Changed |= |
7624 | 129k | simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); |
7625 | 278k | |
7626 | 278k | // Build up a worklist of inner-loops to vectorize. This is necessary as |
7627 | 278k | // the act of vectorizing or partially unrolling a loop creates new loops |
7628 | 278k | // and can invalidate iterators across the loops. |
7629 | 278k | SmallVector<Loop *, 8> Worklist; |
7630 | 278k | |
7631 | 278k | for (Loop *L : *LI) |
7632 | 129k | collectSupportedLoops(*L, LI, ORE, Worklist); |
7633 | 278k | |
7634 | 278k | LoopsAnalyzed += Worklist.size(); |
7635 | 278k | |
7636 | 278k | // Now walk the identified inner loops. |
7637 | 425k | while (!Worklist.empty()) { |
7638 | 146k | Loop *L = Worklist.pop_back_val(); |
7639 | 146k | |
7640 | 146k | // For the inner loops we actually process, form LCSSA to simplify the |
7641 | 146k | // transform. |
7642 | 146k | Changed |= formLCSSARecursively(*L, *DT, LI, SE); |
7643 | 146k | |
7644 | 146k | Changed |= processLoop(L); |
7645 | 146k | } |
7646 | 278k | |
7647 | 278k | // Process each loop nest in the function. |
7648 | 278k | return Changed; |
7649 | 278k | } |
7650 | | |
7651 | | PreservedAnalyses LoopVectorizePass::run(Function &F, |
7652 | 887 | FunctionAnalysisManager &AM) { |
7653 | 887 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); |
7654 | 887 | auto &LI = AM.getResult<LoopAnalysis>(F); |
7655 | 887 | auto &TTI = AM.getResult<TargetIRAnalysis>(F); |
7656 | 887 | auto &DT = AM.getResult<DominatorTreeAnalysis>(F); |
7657 | 887 | auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); |
7658 | 887 | auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); |
7659 | 887 | auto &AA = AM.getResult<AAManager>(F); |
7660 | 887 | auto &AC = AM.getResult<AssumptionAnalysis>(F); |
7661 | 887 | auto &DB = AM.getResult<DemandedBitsAnalysis>(F); |
7662 | 887 | auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); |
7663 | 887 | MemorySSA *MSSA = EnableMSSALoopDependency |
7664 | 887 | ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()0 |
7665 | 887 | : nullptr; |
7666 | 887 | |
7667 | 887 | auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); |
7668 | 887 | std::function<const LoopAccessInfo &(Loop &)> GetLAA = |
7669 | 887 | [&](Loop &L) -> const LoopAccessInfo & { |
7670 | 23 | LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; |
7671 | 23 | return LAM.getResult<LoopAccessAnalysis>(L, AR); |
7672 | 23 | }; |
7673 | 887 | const ModuleAnalysisManager &MAM = |
7674 | 887 | AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); |
7675 | 887 | ProfileSummaryInfo *PSI = |
7676 | 887 | MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); |
7677 | 887 | bool Changed = |
7678 | 887 | runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); |
7679 | 887 | if (!Changed) |
7680 | 868 | return PreservedAnalyses::all(); |
7681 | 19 | PreservedAnalyses PA; |
7682 | 19 | |
7683 | 19 | // We currently do not preserve loopinfo/dominator analyses with outer loop |
7684 | 19 | // vectorization. Until this is addressed, mark these analyses as preserved |
7685 | 19 | // only for non-VPlan-native path. |
7686 | 19 | // TODO: Preserve Loop and Dominator analyses for VPlan-native path. |
7687 | 19 | if (!EnableVPlanNativePath) { |
7688 | 19 | PA.preserve<LoopAnalysis>(); |
7689 | 19 | PA.preserve<DominatorTreeAnalysis>(); |
7690 | 19 | } |
7691 | 19 | PA.preserve<BasicAA>(); |
7692 | 19 | PA.preserve<GlobalsAA>(); |
7693 | 19 | return PA; |
7694 | 19 | } |