/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- LoopVectorizationLegality.cpp --------------------------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This file provides loop vectorization legality analysis. Original code |
10 | | // resided in LoopVectorize.cpp for a long time. |
11 | | // |
12 | | // At this point, it is implemented as a utility class, not as an analysis |
13 | | // pass. It should be easy to create an analysis pass around it if there |
14 | | // is a need (but D45420 needs to happen first). |
15 | | // |
16 | | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
17 | | #include "llvm/Analysis/VectorUtils.h" |
18 | | #include "llvm/IR/IntrinsicInst.h" |
19 | | |
20 | | using namespace llvm; |
21 | | |
22 | 574k | #define LV_NAME "loop-vectorize" |
23 | 423k | #define DEBUG_TYPE LV_NAME |
24 | | |
25 | | extern cl::opt<bool> EnableVPlanPredication; |
26 | | |
27 | | static cl::opt<bool> |
28 | | EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, |
29 | | cl::desc("Enable if-conversion during vectorization.")); |
30 | | |
31 | | static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( |
32 | | "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, |
33 | | cl::desc("The maximum allowed number of runtime memory checks with a " |
34 | | "vectorize(enable) pragma.")); |
35 | | |
36 | | static cl::opt<unsigned> VectorizeSCEVCheckThreshold( |
37 | | "vectorize-scev-check-threshold", cl::init(16), cl::Hidden, |
38 | | cl::desc("The maximum number of SCEV checks allowed.")); |
39 | | |
40 | | static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( |
41 | | "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, |
42 | | cl::desc("The maximum number of SCEV checks allowed with a " |
43 | | "vectorize(enable) pragma")); |
44 | | |
45 | | /// Maximum vectorization interleave count. |
46 | | static const unsigned MaxInterleaveFactor = 16; |
47 | | |
48 | | namespace llvm { |
49 | | |
50 | | #ifndef NDEBUG |
51 | | static void debugVectorizationFailure(const StringRef DebugMsg, |
52 | | Instruction *I) { |
53 | | dbgs() << "LV: Not vectorizing: " << DebugMsg; |
54 | | if (I != nullptr) |
55 | | dbgs() << " " << *I; |
56 | | else |
57 | | dbgs() << '.'; |
58 | | dbgs() << '\n'; |
59 | | } |
60 | | #endif |
61 | | |
62 | | OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName, |
63 | | StringRef RemarkName, |
64 | | Loop *TheLoop, |
65 | 114k | Instruction *I) { |
66 | 114k | Value *CodeRegion = TheLoop->getHeader(); |
67 | 114k | DebugLoc DL = TheLoop->getStartLoc(); |
68 | 114k | |
69 | 114k | if (I) { |
70 | 67.4k | CodeRegion = I->getParent(); |
71 | 67.4k | // If there is no debug location attached to the instruction, revert back to |
72 | 67.4k | // using the loop's. |
73 | 67.4k | if (I->getDebugLoc()) |
74 | 2.55k | DL = I->getDebugLoc(); |
75 | 67.4k | } |
76 | 114k | |
77 | 114k | OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); |
78 | 114k | R << "loop not vectorized: "; |
79 | 114k | return R; |
80 | 114k | } |
81 | | |
82 | 11.2k | bool LoopVectorizeHints::Hint::validate(unsigned Val) { |
83 | 11.2k | switch (Kind) { |
84 | 11.2k | case HK_WIDTH: |
85 | 5.59k | return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; |
86 | 11.2k | case HK_UNROLL: |
87 | 5.56k | return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; |
88 | 11.2k | case HK_FORCE: |
89 | 104 | return (Val <= 1); |
90 | 11.2k | case HK_ISVECTORIZED: |
91 | 1 | return (Val == 0 || Val == 1); |
92 | 0 | } |
93 | 0 | return false; |
94 | 0 | } |
95 | | |
96 | | LoopVectorizeHints::LoopVectorizeHints(const Loop *L, |
97 | | bool InterleaveOnlyWhenForced, |
98 | | OptimizationRemarkEmitter &ORE) |
99 | | : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), |
100 | | Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), |
101 | | Force("vectorize.enable", FK_Undefined, HK_FORCE), |
102 | 163k | IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { |
103 | 163k | // Populate values with existing loop metadata. |
104 | 163k | getHintsFromMetadata(); |
105 | 163k | |
106 | 163k | // force-vector-interleave overrides DisableInterleaving. |
107 | 163k | if (VectorizerParams::isInterleaveForced()) |
108 | 1.48k | Interleave.Value = VectorizerParams::VectorizationInterleave; |
109 | 163k | |
110 | 163k | if (IsVectorized.Value != 1) |
111 | 163k | // If the vectorization width and interleaving count are both 1 then |
112 | 163k | // consider the loop to have been already vectorized because there's |
113 | 163k | // nothing more that we can do. |
114 | 163k | IsVectorized.Value = Width.Value == 1 && Interleave.Value == 15.64k ; |
115 | 163k | LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs() |
116 | 163k | << "LV: Interleaving disabled by the pass manager\n"); |
117 | 163k | } |
118 | | |
119 | 34.1k | void LoopVectorizeHints::setAlreadyVectorized() { |
120 | 34.1k | LLVMContext &Context = TheLoop->getHeader()->getContext(); |
121 | 34.1k | |
122 | 34.1k | MDNode *IsVectorizedMD = MDNode::get( |
123 | 34.1k | Context, |
124 | 34.1k | {MDString::get(Context, "llvm.loop.isvectorized"), |
125 | 34.1k | ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))}); |
126 | 34.1k | MDNode *LoopID = TheLoop->getLoopID(); |
127 | 34.1k | MDNode *NewLoopID = |
128 | 34.1k | makePostTransformationMetadata(Context, LoopID, |
129 | 34.1k | {Twine(Prefix(), "vectorize.").str(), |
130 | 34.1k | Twine(Prefix(), "interleave.").str()}, |
131 | 34.1k | {IsVectorizedMD}); |
132 | 34.1k | TheLoop->setLoopID(NewLoopID); |
133 | 34.1k | |
134 | 34.1k | // Update internal cache. |
135 | 34.1k | IsVectorized.Value = 1; |
136 | 34.1k | } |
137 | | |
138 | | bool LoopVectorizeHints::allowVectorization( |
139 | 146k | Function *F, Loop *L, bool VectorizeOnlyWhenForced) const { |
140 | 146k | if (getForce() == LoopVectorizeHints::FK_Disabled) { |
141 | 14 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); |
142 | 14 | emitRemarkWithHints(); |
143 | 14 | return false; |
144 | 14 | } |
145 | 146k | |
146 | 146k | if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled136 ) { |
147 | 118 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); |
148 | 118 | emitRemarkWithHints(); |
149 | 118 | return false; |
150 | 118 | } |
151 | 146k | |
152 | 146k | if (getIsVectorized() == 1) { |
153 | 5.56k | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); |
154 | 5.56k | // FIXME: Add interleave.disable metadata. This will allow |
155 | 5.56k | // vectorize.disable to be used without disabling the pass and errors |
156 | 5.56k | // to differentiate between disabled vectorization and a width of 1. |
157 | 5.56k | ORE.emit([&]() { |
158 | 6 | return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(), |
159 | 6 | "AllDisabled", L->getStartLoc(), |
160 | 6 | L->getHeader()) |
161 | 6 | << "loop not vectorized: vectorization and interleaving are " |
162 | 6 | "explicitly disabled, or the loop has already been " |
163 | 6 | "vectorized"; |
164 | 6 | }); |
165 | 5.56k | return false; |
166 | 5.56k | } |
167 | 141k | |
168 | 141k | return true; |
169 | 141k | } |
170 | | |
171 | 122k | void LoopVectorizeHints::emitRemarkWithHints() const { |
172 | 122k | using namespace ore; |
173 | 122k | |
174 | 122k | ORE.emit([&]() { |
175 | 53 | if (Force.Value == LoopVectorizeHints::FK_Disabled) |
176 | 1 | return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled", |
177 | 1 | TheLoop->getStartLoc(), |
178 | 1 | TheLoop->getHeader()) |
179 | 1 | << "loop not vectorized: vectorization is explicitly disabled"; |
180 | 52 | else { |
181 | 52 | OptimizationRemarkMissed R(LV_NAME, "MissedDetails", |
182 | 52 | TheLoop->getStartLoc(), TheLoop->getHeader()); |
183 | 52 | R << "loop not vectorized"; |
184 | 52 | if (Force.Value == LoopVectorizeHints::FK_Enabled) { |
185 | 5 | R << " (Force=" << NV("Force", true); |
186 | 5 | if (Width.Value != 0) |
187 | 1 | R << ", Vector Width=" << NV("VectorWidth", Width.Value); |
188 | 5 | if (Interleave.Value != 0) |
189 | 0 | R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value); |
190 | 5 | R << ")"; |
191 | 5 | } |
192 | 52 | return R; |
193 | 52 | } |
194 | 53 | }); |
195 | 122k | } |
196 | | |
197 | 152k | const char *LoopVectorizeHints::vectorizeAnalysisPassName() const { |
198 | 152k | if (getWidth() == 1) |
199 | 83 | return LV_NAME; |
200 | 152k | if (getForce() == LoopVectorizeHints::FK_Disabled) |
201 | 0 | return LV_NAME; |
202 | 152k | if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0152k ) |
203 | 150k | return LV_NAME; |
204 | 1.39k | return OptimizationRemarkAnalysis::AlwaysPrint; |
205 | 1.39k | } |
206 | | |
207 | 163k | void LoopVectorizeHints::getHintsFromMetadata() { |
208 | 163k | MDNode *LoopID = TheLoop->getLoopID(); |
209 | 163k | if (!LoopID) |
210 | 147k | return; |
211 | 16.8k | |
212 | 16.8k | // First operand should refer to the loop id itself. |
213 | 16.8k | assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); |
214 | 16.8k | assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); |
215 | 16.8k | |
216 | 50.4k | for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i33.6k ) { |
217 | 33.6k | const MDString *S = nullptr; |
218 | 33.6k | SmallVector<Metadata *, 4> Args; |
219 | 33.6k | |
220 | 33.6k | // The expected hint is either a MDString or a MDNode with the first |
221 | 33.6k | // operand a MDString. |
222 | 33.6k | if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { |
223 | 33.6k | if (!MD || MD->getNumOperands() == 0) |
224 | 0 | continue; |
225 | 33.6k | S = dyn_cast<MDString>(MD->getOperand(0)); |
226 | 57.2k | for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i23.6k ) |
227 | 23.6k | Args.push_back(MD->getOperand(i)); |
228 | 33.6k | } else { |
229 | 0 | S = dyn_cast<MDString>(LoopID->getOperand(i)); |
230 | 0 | assert(Args.size() == 0 && "too many arguments for MDString"); |
231 | 0 | } |
232 | 33.6k | |
233 | 33.6k | if (!S) |
234 | 22.3k | continue; |
235 | 11.3k | |
236 | 11.3k | // Check if the hint starts with the loop metadata prefix. |
237 | 11.3k | StringRef Name = S->getString(); |
238 | 11.3k | if (Args.size() == 1) |
239 | 11.2k | setHint(Name, Args[0]); |
240 | 11.3k | } |
241 | 16.8k | } |
242 | | |
243 | 11.2k | void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { |
244 | 11.2k | if (!Name.startswith(Prefix())) |
245 | 0 | return; |
246 | 11.2k | Name = Name.substr(Prefix().size(), StringRef::npos); |
247 | 11.2k | |
248 | 11.2k | const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); |
249 | 11.2k | if (!C) |
250 | 22 | return; |
251 | 11.2k | unsigned Val = C->getZExtValue(); |
252 | 11.2k | |
253 | 11.2k | Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; |
254 | 17.0k | for (auto H : Hints) { |
255 | 17.0k | if (Name == H->Name) { |
256 | 11.2k | if (H->validate(Val)) |
257 | 11.2k | H->Value = Val; |
258 | 11.2k | else |
259 | 11.2k | LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); |
260 | 11.2k | break; |
261 | 11.2k | } |
262 | 17.0k | } |
263 | 11.2k | } |
264 | | |
265 | | bool LoopVectorizationRequirements::doesNotMeet( |
266 | 19.9k | Function *F, Loop *L, const LoopVectorizeHints &Hints) { |
267 | 19.9k | const char *PassName = Hints.vectorizeAnalysisPassName(); |
268 | 19.9k | bool Failed = false; |
269 | 19.9k | if (UnsafeAlgebraInst && !Hints.allowReordering()1.35k ) { |
270 | 1.35k | ORE.emit([&]() { |
271 | 4 | return OptimizationRemarkAnalysisFPCommute( |
272 | 4 | PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(), |
273 | 4 | UnsafeAlgebraInst->getParent()) |
274 | 4 | << "loop not vectorized: cannot prove it is safe to reorder " |
275 | 4 | "floating-point operations"; |
276 | 4 | }); |
277 | 1.35k | Failed = true; |
278 | 1.35k | } |
279 | 19.9k | |
280 | 19.9k | // Test if runtime memcheck thresholds are exceeded. |
281 | 19.9k | bool PragmaThresholdReached = |
282 | 19.9k | NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; |
283 | 19.9k | bool ThresholdReached = |
284 | 19.9k | NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; |
285 | 19.9k | if ((ThresholdReached && !Hints.allowReordering()144 ) || |
286 | 19.9k | PragmaThresholdReached19.8k ) { |
287 | 142 | ORE.emit([&]() { |
288 | 5 | return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", |
289 | 5 | L->getStartLoc(), |
290 | 5 | L->getHeader()) |
291 | 5 | << "loop not vectorized: cannot prove it is safe to reorder " |
292 | 5 | "memory operations"; |
293 | 5 | }); |
294 | 142 | LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); |
295 | 142 | Failed = true; |
296 | 142 | } |
297 | 19.9k | |
298 | 19.9k | return Failed; |
299 | 19.9k | } |
300 | | |
301 | | // Return true if the inner loop \p Lp is uniform with regard to the outer loop |
302 | | // \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes |
303 | | // executing the inner loop will execute the same iterations). This check is |
304 | | // very constrained for now but it will be relaxed in the future. \p Lp is |
305 | | // considered uniform if it meets all the following conditions: |
306 | | // 1) it has a canonical IV (starting from 0 and with stride 1), |
307 | | // 2) its latch terminator is a conditional branch and, |
308 | | // 3) its latch condition is a compare instruction whose operands are the |
309 | | // canonical IV and an OuterLp invariant. |
310 | | // This check doesn't take into account the uniformity of other conditions not |
311 | | // related to the loop latch because they don't affect the loop uniformity. |
312 | | // |
313 | | // NOTE: We decided to keep all these checks and its associated documentation |
314 | | // together so that we can easily have a picture of the current supported loop |
315 | | // nests. However, some of the current checks don't depend on \p OuterLp and |
316 | | // would be redundantly executed for each \p Lp if we invoked this function for |
317 | | // different candidate outer loops. This is not the case for now because we |
318 | | // don't currently have the infrastructure to evaluate multiple candidate outer |
319 | | // loops and \p OuterLp will be a fixed parameter while we only support explicit |
320 | | // outer loop vectorization. It's also very likely that these checks go away |
321 | | // before introducing the aforementioned infrastructure. However, if this is not |
322 | | // the case, we should move the \p OuterLp independent checks to a separate |
323 | | // function that is only executed once for each \p Lp. |
324 | 14 | static bool isUniformLoop(Loop *Lp, Loop *OuterLp) { |
325 | 14 | assert(Lp->getLoopLatch() && "Expected loop with a single latch."); |
326 | 14 | |
327 | 14 | // If Lp is the outer loop, it's uniform by definition. |
328 | 14 | if (Lp == OuterLp) |
329 | 7 | return true; |
330 | 7 | assert(OuterLp->contains(Lp) && "OuterLp must contain Lp."); |
331 | 7 | |
332 | 7 | // 1. |
333 | 7 | PHINode *IV = Lp->getCanonicalInductionVariable(); |
334 | 7 | if (!IV) { |
335 | 0 | LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n"); |
336 | 0 | return false; |
337 | 0 | } |
338 | 7 | |
339 | 7 | // 2. |
340 | 7 | BasicBlock *Latch = Lp->getLoopLatch(); |
341 | 7 | auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator()); |
342 | 7 | if (!LatchBr || LatchBr->isUnconditional()) { |
343 | 0 | LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n"); |
344 | 0 | return false; |
345 | 0 | } |
346 | 7 | |
347 | 7 | // 3. |
348 | 7 | auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition()); |
349 | 7 | if (!LatchCmp) { |
350 | 0 | LLVM_DEBUG( |
351 | 0 | dbgs() << "LV: Loop latch condition is not a compare instruction.\n"); |
352 | 0 | return false; |
353 | 0 | } |
354 | 7 | |
355 | 7 | Value *CondOp0 = LatchCmp->getOperand(0); |
356 | 7 | Value *CondOp1 = LatchCmp->getOperand(1); |
357 | 7 | Value *IVUpdate = IV->getIncomingValueForBlock(Latch); |
358 | 7 | if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) && |
359 | 7 | !(0 CondOp1 == IVUpdate0 && OuterLp->isLoopInvariant(CondOp0)0 )) { |
360 | 0 | LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n"); |
361 | 0 | return false; |
362 | 0 | } |
363 | 7 | |
364 | 7 | return true; |
365 | 7 | } |
366 | | |
367 | | // Return true if \p Lp and all its nested loops are uniform with regard to \p |
368 | | // OuterLp. |
369 | 14 | static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) { |
370 | 14 | if (!isUniformLoop(Lp, OuterLp)) |
371 | 0 | return false; |
372 | 14 | |
373 | 14 | // Check if nested loops are uniform. |
374 | 14 | for (Loop *SubLp : *Lp) |
375 | 7 | if (!isUniformLoopNest(SubLp, OuterLp)) |
376 | 0 | return false; |
377 | 14 | |
378 | 14 | return true; |
379 | 14 | } |
380 | | |
381 | | /// Check whether it is safe to if-convert this phi node. |
382 | | /// |
383 | | /// Phi nodes with constant expressions that can trap are not safe to if |
384 | | /// convert. |
385 | 4.37k | static bool canIfConvertPHINodes(BasicBlock *BB) { |
386 | 4.62k | for (PHINode &Phi : BB->phis()) { |
387 | 4.62k | for (Value *V : Phi.incoming_values()) |
388 | 11.5k | if (auto *C = dyn_cast<Constant>(V)) |
389 | 1.14k | if (C->canTrap()) |
390 | 0 | return false; |
391 | 4.62k | } |
392 | 4.37k | return true; |
393 | 4.37k | } |
394 | | |
395 | 72.7k | static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { |
396 | 72.7k | if (Ty->isPointerTy()) |
397 | 11.6k | return DL.getIntPtrType(Ty); |
398 | 61.0k | |
399 | 61.0k | // It is possible that char's or short's overflow when we ask for the loop's |
400 | 61.0k | // trip count, work around this by changing the type size. |
401 | 61.0k | if (Ty->getScalarSizeInBits() < 32) |
402 | 66 | return Type::getInt32Ty(Ty->getContext()); |
403 | 60.9k | |
404 | 60.9k | return Ty; |
405 | 60.9k | } |
406 | | |
407 | 7.01k | static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { |
408 | 7.01k | Ty0 = convertPointerToIntegerType(DL, Ty0); |
409 | 7.01k | Ty1 = convertPointerToIntegerType(DL, Ty1); |
410 | 7.01k | if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) |
411 | 284 | return Ty0; |
412 | 6.73k | return Ty1; |
413 | 6.73k | } |
414 | | |
415 | | /// Check that the instruction has outside loop users and is not an |
416 | | /// identified reduction variable. |
417 | | static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, |
418 | 448k | SmallPtrSetImpl<Value *> &AllowedExit) { |
419 | 448k | // Reductions, Inductions and non-header phis are allowed to have exit users. All |
420 | 448k | // other instructions must not have external users. |
421 | 448k | if (!AllowedExit.count(Inst)) |
422 | 409k | // Check that all of the users of the loop are inside the BB. |
423 | 409k | for (User *U : Inst->users()) { |
424 | 370k | Instruction *UI = cast<Instruction>(U); |
425 | 370k | // This user may be a reduction exit value. |
426 | 370k | if (!TheLoop->contains(UI)) { |
427 | 666 | LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n'); |
428 | 666 | return true; |
429 | 666 | } |
430 | 370k | } |
431 | 448k | return false447k ; |
432 | 448k | } |
433 | | |
434 | 159k | int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { |
435 | 159k | const ValueToValueMap &Strides = |
436 | 159k | getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap()0 ; |
437 | 159k | |
438 | 159k | int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false); |
439 | 159k | if (Stride == 1 || Stride == -131.7k ) |
440 | 136k | return Stride; |
441 | 22.6k | return 0; |
442 | 22.6k | } |
443 | | |
444 | 157k | bool LoopVectorizationLegality::isUniform(Value *V) { |
445 | 157k | return LAI->isUniform(V); |
446 | 157k | } |
447 | | |
448 | | void LoopVectorizationLegality::reportVectorizationFailure( |
449 | | const StringRef DebugMsg, const StringRef OREMsg, |
450 | 113k | const StringRef ORETag, Instruction *I) const { |
451 | 113k | LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); |
452 | 113k | ORE->emit(createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), |
453 | 113k | ORETag, TheLoop, I) << OREMsg); |
454 | 113k | } |
455 | | |
456 | 7 | bool LoopVectorizationLegality::canVectorizeOuterLoop() { |
457 | 7 | assert(!TheLoop->empty() && "We are not vectorizing an outer loop."); |
458 | 7 | // Store the result and return it at the end instead of exiting early, in case |
459 | 7 | // allowExtraAnalysis is used to report multiple reasons for not vectorizing. |
460 | 7 | bool Result = true; |
461 | 7 | bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); |
462 | 7 | |
463 | 23 | for (BasicBlock *BB : TheLoop->blocks()) { |
464 | 23 | // Check whether the BB terminator is a BranchInst. Any other terminator is |
465 | 23 | // not supported yet. |
466 | 23 | auto *Br = dyn_cast<BranchInst>(BB->getTerminator()); |
467 | 23 | if (!Br) { |
468 | 0 | reportVectorizationFailure("Unsupported basic block terminator", |
469 | 0 | "loop control flow is not understood by vectorizer", |
470 | 0 | "CFGNotUnderstood"); |
471 | 0 | if (DoExtraAnalysis) |
472 | 0 | Result = false; |
473 | 0 | else |
474 | 0 | return false; |
475 | 23 | } |
476 | 23 | |
477 | 23 | // Check whether the BranchInst is a supported one. Only unconditional |
478 | 23 | // branches, conditional branches with an outer loop invariant condition or |
479 | 23 | // backedges are supported. |
480 | 23 | // FIXME: We skip these checks when VPlan predication is enabled as we |
481 | 23 | // want to allow divergent branches. This whole check will be removed |
482 | 23 | // once VPlan predication is on by default. |
483 | 23 | if (!EnableVPlanPredication && Br && Br->isConditional() && |
484 | 23 | !TheLoop->isLoopInvariant(Br->getCondition())15 && |
485 | 23 | !LI->isLoopHeader(Br->getSuccessor(0))14 && |
486 | 23 | !LI->isLoopHeader(Br->getSuccessor(1))14 ) { |
487 | 0 | reportVectorizationFailure("Unsupported conditional branch", |
488 | 0 | "loop control flow is not understood by vectorizer", |
489 | 0 | "CFGNotUnderstood"); |
490 | 0 | if (DoExtraAnalysis) |
491 | 0 | Result = false; |
492 | 0 | else |
493 | 0 | return false; |
494 | 0 | } |
495 | 23 | } |
496 | 7 | |
497 | 7 | // Check whether inner loops are uniform. At this point, we only support |
498 | 7 | // simple outer loops scenarios with uniform nested loops. |
499 | 7 | if (!isUniformLoopNest(TheLoop /*loop nest*/, |
500 | 7 | TheLoop /*context outer loop*/)) { |
501 | 0 | reportVectorizationFailure("Outer loop contains divergent loops", |
502 | 0 | "loop control flow is not understood by vectorizer", |
503 | 0 | "CFGNotUnderstood"); |
504 | 0 | if (DoExtraAnalysis) |
505 | 0 | Result = false; |
506 | 0 | else |
507 | 0 | return false; |
508 | 7 | } |
509 | 7 | |
510 | 7 | // Check whether we are able to set up outer loop induction. |
511 | 7 | if (!setupOuterLoopInductions()) { |
512 | 0 | reportVectorizationFailure("Unsupported outer loop Phi(s)", |
513 | 0 | "Unsupported outer loop Phi(s)", |
514 | 0 | "UnsupportedPhi"); |
515 | 0 | if (DoExtraAnalysis) |
516 | 0 | Result = false; |
517 | 0 | else |
518 | 0 | return false; |
519 | 7 | } |
520 | 7 | |
521 | 7 | return Result; |
522 | 7 | } |
523 | | |
524 | | void LoopVectorizationLegality::addInductionPhi( |
525 | | PHINode *Phi, const InductionDescriptor &ID, |
526 | 65.7k | SmallPtrSetImpl<Value *> &AllowedExit) { |
527 | 65.7k | Inductions[Phi] = ID; |
528 | 65.7k | |
529 | 65.7k | // In case this induction also comes with casts that we know we can ignore |
530 | 65.7k | // in the vectorized loop body, record them here. All casts could be recorded |
531 | 65.7k | // here for ignoring, but suffices to record only the first (as it is the |
532 | 65.7k | // only one that may bw used outside the cast sequence). |
533 | 65.7k | const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); |
534 | 65.7k | if (!Casts.empty()) |
535 | 15 | InductionCastsToIgnore.insert(*Casts.begin()); |
536 | 65.7k | |
537 | 65.7k | Type *PhiTy = Phi->getType(); |
538 | 65.7k | const DataLayout &DL = Phi->getModule()->getDataLayout(); |
539 | 65.7k | |
540 | 65.7k | // Get the widest type. |
541 | 65.7k | if (!PhiTy->isFloatingPointTy()) { |
542 | 65.7k | if (!WidestIndTy) |
543 | 58.7k | WidestIndTy = convertPointerToIntegerType(DL, PhiTy); |
544 | 7.01k | else |
545 | 7.01k | WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); |
546 | 65.7k | } |
547 | 65.7k | |
548 | 65.7k | // Int inductions are special because we only allow one IV. |
549 | 65.7k | if (ID.getKind() == InductionDescriptor::IK_IntInduction && |
550 | 65.7k | ID.getConstIntStepValue()54.0k && ID.getConstIntStepValue()->isOne()53.6k && |
551 | 65.7k | isa<Constant>(ID.getStartValue())46.9k && |
552 | 65.7k | cast<Constant>(ID.getStartValue())->isNullValue()44.8k ) { |
553 | 41.9k | |
554 | 41.9k | // Use the phi node with the widest type as induction. Use the last |
555 | 41.9k | // one if there are multiple (no good reason for doing this other |
556 | 41.9k | // than it is expedient). We've checked that it begins at zero and |
557 | 41.9k | // steps by one, so this is a canonical induction variable. |
558 | 41.9k | if (!PrimaryInduction || PhiTy == WidestIndTy140 ) |
559 | 41.8k | PrimaryInduction = Phi; |
560 | 41.9k | } |
561 | 65.7k | |
562 | 65.7k | // Both the PHI node itself, and the "post-increment" value feeding |
563 | 65.7k | // back into the PHI node may have external users. |
564 | 65.7k | // We can allow those uses, except if the SCEVs we have for them rely |
565 | 65.7k | // on predicates that only hold within the loop, since allowing the exit |
566 | 65.7k | // currently means re-using this SCEV outside the loop (see PR33706 for more |
567 | 65.7k | // details). |
568 | 65.7k | if (PSE.getUnionPredicate().isAlwaysTrue()) { |
569 | 65.6k | AllowedExit.insert(Phi); |
570 | 65.6k | AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch())); |
571 | 65.6k | } |
572 | 65.7k | |
573 | 65.7k | LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n"); |
574 | 65.7k | } |
575 | | |
576 | 7 | bool LoopVectorizationLegality::setupOuterLoopInductions() { |
577 | 7 | BasicBlock *Header = TheLoop->getHeader(); |
578 | 7 | |
579 | 7 | // Returns true if a given Phi is a supported induction. |
580 | 7 | auto isSupportedPhi = [&](PHINode &Phi) -> bool { |
581 | 7 | InductionDescriptor ID; |
582 | 7 | if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) && |
583 | 7 | ID.getKind() == InductionDescriptor::IK_IntInduction) { |
584 | 7 | addInductionPhi(&Phi, ID, AllowedExit); |
585 | 7 | return true; |
586 | 7 | } else { |
587 | 0 | // Bail out for any Phi in the outer loop header that is not a supported |
588 | 0 | // induction. |
589 | 0 | LLVM_DEBUG( |
590 | 0 | dbgs() |
591 | 0 | << "LV: Found unsupported PHI for outer loop vectorization.\n"); |
592 | 0 | return false; |
593 | 0 | } |
594 | 7 | }; |
595 | 7 | |
596 | 7 | if (llvm::all_of(Header->phis(), isSupportedPhi)) |
597 | 7 | return true; |
598 | 0 | else |
599 | 0 | return false; |
600 | 7 | } |
601 | | |
602 | 92.7k | bool LoopVectorizationLegality::canVectorizeInstrs() { |
603 | 92.7k | BasicBlock *Header = TheLoop->getHeader(); |
604 | 92.7k | |
605 | 92.7k | // Look for the attribute signaling the absence of NaNs. |
606 | 92.7k | Function &F = *Header->getParent(); |
607 | 92.7k | HasFunNoNaNAttr = |
608 | 92.7k | F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; |
609 | 92.7k | |
610 | 92.7k | // For each block in the loop. |
611 | 98.7k | for (BasicBlock *BB : TheLoop->blocks()) { |
612 | 98.7k | // Scan the instructions in the block and look for hazards. |
613 | 582k | for (Instruction &I : *BB) { |
614 | 582k | if (auto *Phi = dyn_cast<PHINode>(&I)) { |
615 | 108k | Type *PhiTy = Phi->getType(); |
616 | 108k | // Check that this PHI type is allowed. |
617 | 108k | if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy()33.7k && |
618 | 108k | !PhiTy->isPointerTy()28.2k ) { |
619 | 4 | reportVectorizationFailure("Found a non-int non-pointer PHI", |
620 | 4 | "loop control flow is not understood by vectorizer", |
621 | 4 | "CFGNotUnderstood"); |
622 | 4 | return false; |
623 | 4 | } |
624 | 108k | |
625 | 108k | // If this PHINode is not in the header block, then we know that we |
626 | 108k | // can convert it to select during if-conversion. No need to check if |
627 | 108k | // the PHIs in this block are induction or reduction variables. |
628 | 108k | if (BB != Header) { |
629 | 1.64k | // Non-header phi nodes that have outside uses can be vectorized. Add |
630 | 1.64k | // them to the list of allowed exits. |
631 | 1.64k | // Unsafe cyclic dependencies with header phis are identified during |
632 | 1.64k | // legalization for reduction, induction and first order |
633 | 1.64k | // recurrences. |
634 | 1.64k | continue; |
635 | 1.64k | } |
636 | 106k | |
637 | 106k | // We only allow if-converted PHIs with exactly two incoming values. |
638 | 106k | if (Phi->getNumIncomingValues() != 2) { |
639 | 0 | reportVectorizationFailure("Found an invalid PHI", |
640 | 0 | "loop control flow is not understood by vectorizer", |
641 | 0 | "CFGNotUnderstood", Phi); |
642 | 0 | return false; |
643 | 0 | } |
644 | 106k | |
645 | 106k | RecurrenceDescriptor RedDes; |
646 | 106k | if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, |
647 | 106k | DT)) { |
648 | 8.23k | if (RedDes.hasUnsafeAlgebra()) |
649 | 1.77k | Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst()); |
650 | 8.23k | AllowedExit.insert(RedDes.getLoopExitInstr()); |
651 | 8.23k | Reductions[Phi] = RedDes; |
652 | 8.23k | continue; |
653 | 8.23k | } |
654 | 98.5k | |
655 | 98.5k | // TODO: Instead of recording the AllowedExit, it would be good to record the |
656 | 98.5k | // complementary set: NotAllowedExit. These include (but may not be |
657 | 98.5k | // limited to): |
658 | 98.5k | // 1. Reduction phis as they represent the one-before-last value, which |
659 | 98.5k | // is not available when vectorized |
660 | 98.5k | // 2. Induction phis and increment when SCEV predicates cannot be used |
661 | 98.5k | // outside the loop - see addInductionPhi |
662 | 98.5k | // 3. Non-Phis with outside uses when SCEV predicates cannot be used |
663 | 98.5k | // outside the loop - see call to hasOutsideLoopUser in the non-phi |
664 | 98.5k | // handling below |
665 | 98.5k | // 4. FirstOrderRecurrence phis that can possibly be handled by |
666 | 98.5k | // extraction. |
667 | 98.5k | // By recording these, we can then reason about ways to vectorize each |
668 | 98.5k | // of these NotAllowedExit. |
669 | 98.5k | InductionDescriptor ID; |
670 | 98.5k | if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) { |
671 | 65.6k | addInductionPhi(Phi, ID, AllowedExit); |
672 | 65.6k | if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr44 ) |
673 | 43 | Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst()); |
674 | 65.6k | continue; |
675 | 65.6k | } |
676 | 32.9k | |
677 | 32.9k | if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, |
678 | 32.9k | SinkAfter, DT)) { |
679 | 277 | FirstOrderRecurrences.insert(Phi); |
680 | 277 | continue; |
681 | 277 | } |
682 | 32.6k | |
683 | 32.6k | // As a last resort, coerce the PHI to a AddRec expression |
684 | 32.6k | // and re-try classifying it a an induction PHI. |
685 | 32.6k | if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) { |
686 | 96 | addInductionPhi(Phi, ID, AllowedExit); |
687 | 96 | continue; |
688 | 96 | } |
689 | 32.5k | |
690 | 32.5k | reportVectorizationFailure("Found an unidentified PHI", |
691 | 32.5k | "value that could not be identified as " |
692 | 32.5k | "reduction is used outside the loop", |
693 | 32.5k | "NonReductionValueUsedOutsideLoop", Phi); |
694 | 32.5k | return false; |
695 | 32.5k | } // end of PHI handling |
696 | 474k | |
697 | 474k | // We handle calls that: |
698 | 474k | // * Are debug info intrinsics. |
699 | 474k | // * Have a mapping to an IR intrinsic. |
700 | 474k | // * Have a vector version available. |
701 | 474k | auto *CI = dyn_cast<CallInst>(&I); |
702 | 474k | if (CI && !getVectorIntrinsicIDForCall(CI, TLI)25.6k && |
703 | 474k | !isa<DbgInfoIntrinsic>(CI)24.5k && |
704 | 474k | !(24.5k CI->getCalledFunction()24.5k && TLI24.3k && |
705 | 24.5k | TLI->isFunctionVectorizable(CI->getCalledFunction()->getName())24.3k )) { |
706 | 24.4k | // If the call is a recognized math libary call, it is likely that |
707 | 24.4k | // we can vectorize it given loosened floating-point constraints. |
708 | 24.4k | LibFunc Func; |
709 | 24.4k | bool IsMathLibCall = |
710 | 24.4k | TLI && CI->getCalledFunction() && |
711 | 24.4k | CI->getType()->isFloatingPointTy()24.2k && |
712 | 24.4k | TLI->getLibFunc(CI->getCalledFunction()->getName(), Func)70 && |
713 | 24.4k | TLI->hasOptimizedCodeGen(Func)10 ; |
714 | 24.4k | |
715 | 24.4k | if (IsMathLibCall) { |
716 | 4 | // TODO: Ideally, we should not use clang-specific language here, |
717 | 4 | // but it's hard to provide meaningful yet generic advice. |
718 | 4 | // Also, should this be guarded by allowExtraAnalysis() and/or be part |
719 | 4 | // of the returned info from isFunctionVectorizable()? |
720 | 4 | reportVectorizationFailure("Found a non-intrinsic callsite", |
721 | 4 | "library call cannot be vectorized. " |
722 | 4 | "Try compiling with -fno-math-errno, -ffast-math, " |
723 | 4 | "or similar flags", |
724 | 4 | "CantVectorizeLibcall", CI); |
725 | 24.4k | } else { |
726 | 24.4k | reportVectorizationFailure("Found a non-intrinsic callsite", |
727 | 24.4k | "call instruction cannot be vectorized", |
728 | 24.4k | "CantVectorizeLibcall", CI); |
729 | 24.4k | } |
730 | 24.4k | return false; |
731 | 24.4k | } |
732 | 449k | |
733 | 449k | // Some intrinsics have scalar arguments and should be same in order for |
734 | 449k | // them to be vectorized (i.e. loop invariant). |
735 | 449k | if (CI) { |
736 | 1.17k | auto *SE = PSE.getSE(); |
737 | 1.17k | Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI); |
738 | 2.96k | for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i1.78k ) |
739 | 1.78k | if (hasVectorInstrinsicScalarOpd(IntrinID, i)) { |
740 | 5 | if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) { |
741 | 1 | reportVectorizationFailure("Found unvectorizable intrinsic", |
742 | 1 | "intrinsic instruction cannot be vectorized", |
743 | 1 | "CantVectorizeIntrinsic", CI); |
744 | 1 | return false; |
745 | 1 | } |
746 | 5 | } |
747 | 1.17k | } |
748 | 449k | |
749 | 449k | // Check that the instruction return type is vectorizable. |
750 | 449k | // Also, we can't vectorize extractelement instructions. |
751 | 449k | if (449k (449k !VectorType::isValidElementType(I.getType())449k && |
752 | 449k | !I.getType()->isVoidTy()81.2k ) || |
753 | 449k | isa<ExtractElementInst>(I)448k ) { |
754 | 1.22k | reportVectorizationFailure("Found unvectorizable type", |
755 | 1.22k | "instruction return type cannot be vectorized", |
756 | 1.22k | "CantVectorizeInstructionReturnType", &I); |
757 | 1.22k | return false; |
758 | 1.22k | } |
759 | 448k | |
760 | 448k | // Check that the stored type is vectorizable. |
761 | 448k | if (auto *ST = dyn_cast<StoreInst>(&I)) { |
762 | 37.6k | Type *T = ST->getValueOperand()->getType(); |
763 | 37.6k | if (!VectorType::isValidElementType(T)) { |
764 | 27 | reportVectorizationFailure("Store instruction cannot be vectorized", |
765 | 27 | "store instruction cannot be vectorized", |
766 | 27 | "CantVectorizeStore", ST); |
767 | 27 | return false; |
768 | 27 | } |
769 | 37.6k | |
770 | 37.6k | // For nontemporal stores, check that a nontemporal vector version is |
771 | 37.6k | // supported on the target. |
772 | 37.6k | if (ST->getMetadata(LLVMContext::MD_nontemporal)) { |
773 | 1 | // Arbitrarily try a vector of 2 elements. |
774 | 1 | Type *VecTy = VectorType::get(T, /*NumElements=*/2); |
775 | 1 | assert(VecTy && "did not find vectorized version of stored type"); |
776 | 1 | unsigned Alignment = getLoadStoreAlignment(ST); |
777 | 1 | if (!TTI->isLegalNTStore(VecTy, Alignment)) { |
778 | 1 | reportVectorizationFailure( |
779 | 1 | "nontemporal store instruction cannot be vectorized", |
780 | 1 | "nontemporal store instruction cannot be vectorized", |
781 | 1 | "CantVectorizeNontemporalStore", ST); |
782 | 1 | return false; |
783 | 1 | } |
784 | 410k | } |
785 | 410k | |
786 | 410k | } else if (auto *LD = dyn_cast<LoadInst>(&I)) { |
787 | 72.3k | if (LD->getMetadata(LLVMContext::MD_nontemporal)) { |
788 | 2 | // For nontemporal loads, check that a nontemporal vector version is |
789 | 2 | // supported on the target (arbitrarily try a vector of 2 elements). |
790 | 2 | Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2); |
791 | 2 | assert(VecTy && "did not find vectorized version of load type"); |
792 | 2 | unsigned Alignment = getLoadStoreAlignment(LD); |
793 | 2 | if (!TTI->isLegalNTLoad(VecTy, Alignment)) { |
794 | 2 | reportVectorizationFailure( |
795 | 2 | "nontemporal load instruction cannot be vectorized", |
796 | 2 | "nontemporal load instruction cannot be vectorized", |
797 | 2 | "CantVectorizeNontemporalLoad", LD); |
798 | 2 | return false; |
799 | 2 | } |
800 | 338k | } |
801 | 338k | |
802 | 338k | // FP instructions can allow unsafe algebra, thus vectorizable by |
803 | 338k | // non-IEEE-754 compliant SIMD units. |
804 | 338k | // This applies to floating-point math operations and calls, not memory |
805 | 338k | // operations, shuffles, or casts, as they don't change precision or |
806 | 338k | // semantics. |
807 | 338k | } else if (I.getType()->isFloatingPointTy() && (47.8k CI47.8k || I.isBinaryOp()47.2k ) && |
808 | 338k | !I.isFast()35.8k ) { |
809 | 35.5k | LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n"); |
810 | 35.5k | Hints->setPotentiallyUnsafe(); |
811 | 35.5k | } |
812 | 448k | |
813 | 448k | // Reduction instructions are allowed to have exit users. |
814 | 448k | // All other instructions must not have external users. |
815 | 448k | if (448k hasOutsideLoopUser(TheLoop, &I, AllowedExit)448k ) { |
816 | 666 | // We can safely vectorize loops where instructions within the loop are |
817 | 666 | // used outside the loop only if the SCEV predicates within the loop is |
818 | 666 | // same as outside the loop. Allowing the exit means reusing the SCEV |
819 | 666 | // outside the loop. |
820 | 666 | if (PSE.getUnionPredicate().isAlwaysTrue()) { |
821 | 663 | AllowedExit.insert(&I); |
822 | 663 | continue; |
823 | 663 | } |
824 | 3 | reportVectorizationFailure("Value cannot be used outside the loop", |
825 | 3 | "value cannot be used outside the loop", |
826 | 3 | "ValueUsedOutsideLoop", &I); |
827 | 3 | return false; |
828 | 3 | } |
829 | 448k | } // next instr. |
830 | 98.7k | } |
831 | 92.7k | |
832 | 92.7k | if (34.4k !PrimaryInduction34.4k ) { |
833 | 17.2k | if (Inductions.empty()) { |
834 | 7.00k | reportVectorizationFailure("Did not find one integer induction var", |
835 | 7.00k | "loop induction variable could not be identified", |
836 | 7.00k | "NoInductionVariable"); |
837 | 7.00k | return false; |
838 | 10.2k | } else if (!WidestIndTy) { |
839 | 16 | reportVectorizationFailure("Did not find one integer induction var", |
840 | 16 | "integer loop induction variable could not be identified", |
841 | 16 | "NoIntegerInductionVariable"); |
842 | 16 | return false; |
843 | 10.2k | } else { |
844 | 10.2k | LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); |
845 | 10.2k | } |
846 | 17.2k | } |
847 | 34.4k | |
848 | 34.4k | // Now we know the widest induction type, check if our found induction |
849 | 34.4k | // is the same size. If it's not, unset it here and InnerLoopVectorizer |
850 | 34.4k | // will create another. |
851 | 34.4k | if (27.3k PrimaryInduction27.3k && WidestIndTy != PrimaryInduction->getType()17.1k ) |
852 | 320 | PrimaryInduction = nullptr; |
853 | 27.3k | |
854 | 27.3k | return true; |
855 | 34.4k | } |
856 | | |
857 | 27.4k | bool LoopVectorizationLegality::canVectorizeMemory() { |
858 | 27.4k | LAI = &(*GetLAA)(*TheLoop); |
859 | 27.4k | const OptimizationRemarkAnalysis *LAR = LAI->getReport(); |
860 | 27.4k | if (LAR) { |
861 | 7.43k | ORE->emit([&]() { |
862 | 35 | return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(), |
863 | 35 | "loop not vectorized: ", *LAR); |
864 | 35 | }); |
865 | 7.43k | } |
866 | 27.4k | if (!LAI->canVectorizeMemory()) |
867 | 7.43k | return false; |
868 | 19.9k | |
869 | 19.9k | if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { |
870 | 4 | reportVectorizationFailure("Stores to a uniform address", |
871 | 4 | "write to a loop invariant address could not be vectorized", |
872 | 4 | "CantVectorizeStoreToLoopInvariantAddress"); |
873 | 4 | return false; |
874 | 4 | } |
875 | 19.9k | Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); |
876 | 19.9k | PSE.addPredicate(LAI->getPSE().getUnionPredicate()); |
877 | 19.9k | |
878 | 19.9k | return true; |
879 | 19.9k | } |
880 | | |
881 | 50.1k | bool LoopVectorizationLegality::isInductionPhi(const Value *V) { |
882 | 50.1k | Value *In0 = const_cast<Value *>(V); |
883 | 50.1k | PHINode *PN = dyn_cast_or_null<PHINode>(In0); |
884 | 50.1k | if (!PN) |
885 | 30.5k | return false; |
886 | 19.5k | |
887 | 19.5k | return Inductions.count(PN); |
888 | 19.5k | } |
889 | | |
890 | 6.34k | bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { |
891 | 6.34k | auto *Inst = dyn_cast<Instruction>(V); |
892 | 6.34k | return (Inst && InductionCastsToIgnore.count(Inst)); |
893 | 6.34k | } |
894 | | |
895 | 12.3k | bool LoopVectorizationLegality::isInductionVariable(const Value *V) { |
896 | 12.3k | return isInductionPhi(V) || isCastedInductionVariable(V)6.34k ; |
897 | 12.3k | } |
898 | | |
899 | 86.7k | bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { |
900 | 86.7k | return FirstOrderRecurrences.count(Phi); |
901 | 86.7k | } |
902 | | |
903 | 987k | bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { |
904 | 987k | return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); |
905 | 987k | } |
906 | | |
907 | | bool LoopVectorizationLegality::blockCanBePredicated( |
908 | 25.5k | BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) { |
909 | 25.5k | const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); |
910 | 25.5k | |
911 | 140k | for (Instruction &I : *BB) { |
912 | 140k | // Check that we don't have a constant expression that can trap as operand. |
913 | 272k | for (Value *Operand : I.operands()) { |
914 | 272k | if (auto *C = dyn_cast<Constant>(Operand)) |
915 | 79.7k | if (C->canTrap()) |
916 | 4 | return false; |
917 | 272k | } |
918 | 140k | // We might be able to hoist the load. |
919 | 140k | if (140k I.mayReadFromMemory()140k ) { |
920 | 36.1k | auto *LI = dyn_cast<LoadInst>(&I); |
921 | 36.1k | if (!LI) |
922 | 8.84k | return false; |
923 | 27.3k | if (!SafePtrs.count(LI->getPointerOperand())) { |
924 | 26.5k | // !llvm.mem.parallel_loop_access implies if-conversion safety. |
925 | 26.5k | // Otherwise, record that the load needs (real or emulated) masking |
926 | 26.5k | // and let the cost model decide. |
927 | 26.5k | if (!IsAnnotatedParallel) |
928 | 26.5k | MaskedOp.insert(LI); |
929 | 26.5k | continue; |
930 | 26.5k | } |
931 | 105k | } |
932 | 105k | |
933 | 105k | if (I.mayWriteToMemory()) { |
934 | 12.4k | auto *SI = dyn_cast<StoreInst>(&I); |
935 | 12.4k | if (!SI) |
936 | 4 | return false; |
937 | 12.4k | // Predicated store requires some form of masking: |
938 | 12.4k | // 1) masked store HW instruction, |
939 | 12.4k | // 2) emulation via load-blend-store (only if safe and legal to do so, |
940 | 12.4k | // be aware on the race conditions), or |
941 | 12.4k | // 3) element-by-element predicate check and scalar store. |
942 | 12.4k | MaskedOp.insert(SI); |
943 | 12.4k | continue; |
944 | 12.4k | } |
945 | 92.8k | if (I.mayThrow()) |
946 | 0 | return false; |
947 | 92.8k | } |
948 | 25.5k | |
949 | 25.5k | return true16.6k ; |
950 | 25.5k | } |
951 | | |
952 | 12.0k | bool LoopVectorizationLegality::canVectorizeWithIfConvert() { |
953 | 12.0k | if (!EnableIfConversion) { |
954 | 0 | reportVectorizationFailure("If-conversion is disabled", |
955 | 0 | "if-conversion is disabled", |
956 | 0 | "IfConversionDisabled"); |
957 | 0 | return false; |
958 | 0 | } |
959 | 12.0k | |
960 | 12.0k | assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); |
961 | 12.0k | |
962 | 12.0k | // A list of pointers that we can safely read and write to. |
963 | 12.0k | SmallPtrSet<Value *, 8> SafePointes; |
964 | 12.0k | |
965 | 12.0k | // Collect safe addresses. |
966 | 102k | for (BasicBlock *BB : TheLoop->blocks()) { |
967 | 102k | if (blockNeedsPredication(BB)) |
968 | 74.0k | continue; |
969 | 28.5k | |
970 | 28.5k | for (Instruction &I : *BB) |
971 | 191k | if (auto *Ptr = getLoadStorePointerOperand(&I)) |
972 | 37.4k | SafePointes.insert(Ptr); |
973 | 28.5k | } |
974 | 12.0k | |
975 | 12.0k | // Collect the blocks that need predication. |
976 | 12.0k | BasicBlock *Header = TheLoop->getHeader(); |
977 | 42.0k | for (BasicBlock *BB : TheLoop->blocks()) { |
978 | 42.0k | // We don't support switch statements inside loops. |
979 | 42.0k | if (!isa<BranchInst>(BB->getTerminator())) { |
980 | 273 | reportVectorizationFailure("Loop contains a switch statement", |
981 | 273 | "loop contains a switch statement", |
982 | 273 | "LoopContainsSwitch", BB->getTerminator()); |
983 | 273 | return false; |
984 | 273 | } |
985 | 41.7k | |
986 | 41.7k | // We must be able to predicate all blocks that need to be predicated. |
987 | 41.7k | if (blockNeedsPredication(BB)) { |
988 | 25.4k | if (!blockCanBePredicated(BB, SafePointes)) { |
989 | 8.85k | reportVectorizationFailure( |
990 | 8.85k | "Control flow cannot be substituted for a select", |
991 | 8.85k | "control flow cannot be substituted for a select", |
992 | 8.85k | "NoCFGForSelect", BB->getTerminator()); |
993 | 8.85k | return false; |
994 | 8.85k | } |
995 | 16.2k | } else if (BB != Header && !canIfConvertPHINodes(BB)4.37k ) { |
996 | 0 | reportVectorizationFailure( |
997 | 0 | "Control flow cannot be substituted for a select", |
998 | 0 | "control flow cannot be substituted for a select", |
999 | 0 | "NoCFGForSelect", BB->getTerminator()); |
1000 | 0 | return false; |
1001 | 0 | } |
1002 | 41.7k | } |
1003 | 12.0k | |
1004 | 12.0k | // We can if-convert this loop. |
1005 | 12.0k | return true2.94k ; |
1006 | 12.0k | } |
1007 | | |
1008 | | // Helper function to canVectorizeLoopNestCFG. |
1009 | | bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, |
1010 | 141k | bool UseVPlanNativePath) { |
1011 | 141k | assert((UseVPlanNativePath || Lp->empty()) && |
1012 | 141k | "VPlan-native path is not enabled."); |
1013 | 141k | |
1014 | 141k | // TODO: ORE should be improved to show more accurate information when an |
1015 | 141k | // outer loop can't be vectorized because a nested loop is not understood or |
1016 | 141k | // legal. Something like: "outer_loop_location: loop not vectorized: |
1017 | 141k | // (inner_loop_location) loop control flow is not understood by vectorizer". |
1018 | 141k | |
1019 | 141k | // Store the result and return it at the end instead of exiting early, in case |
1020 | 141k | // allowExtraAnalysis is used to report multiple reasons for not vectorizing. |
1021 | 141k | bool Result = true; |
1022 | 141k | bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); |
1023 | 141k | |
1024 | 141k | // We must have a loop in canonical form. Loops with indirectbr in them cannot |
1025 | 141k | // be canonicalized. |
1026 | 141k | if (!Lp->getLoopPreheader()) { |
1027 | 1 | reportVectorizationFailure("Loop doesn't have a legal pre-header", |
1028 | 1 | "loop control flow is not understood by vectorizer", |
1029 | 1 | "CFGNotUnderstood"); |
1030 | 1 | if (DoExtraAnalysis) |
1031 | 0 | Result = false; |
1032 | 1 | else |
1033 | 1 | return false; |
1034 | 141k | } |
1035 | 141k | |
1036 | 141k | // We must have a single backedge. |
1037 | 141k | if (Lp->getNumBackEdges() != 1) { |
1038 | 0 | reportVectorizationFailure("The loop must have a single backedge", |
1039 | 0 | "loop control flow is not understood by vectorizer", |
1040 | 0 | "CFGNotUnderstood"); |
1041 | 0 | if (DoExtraAnalysis) |
1042 | 0 | Result = false; |
1043 | 0 | else |
1044 | 0 | return false; |
1045 | 141k | } |
1046 | 141k | |
1047 | 141k | // We must have a single exiting block. |
1048 | 141k | if (!Lp->getExitingBlock()) { |
1049 | 35.0k | reportVectorizationFailure("The loop must have an exiting block", |
1050 | 35.0k | "loop control flow is not understood by vectorizer", |
1051 | 35.0k | "CFGNotUnderstood"); |
1052 | 35.0k | if (DoExtraAnalysis) |
1053 | 1 | Result = false; |
1054 | 35.0k | else |
1055 | 35.0k | return false; |
1056 | 106k | } |
1057 | 106k | |
1058 | 106k | // We only handle bottom-tested loops, i.e. loop in which the condition is |
1059 | 106k | // checked at the end of each iteration. With that we can assume that all |
1060 | 106k | // instructions in the loop are executed the same number of times. |
1061 | 106k | if (Lp->getExitingBlock() != Lp->getLoopLatch()) { |
1062 | 4.38k | reportVectorizationFailure("The exiting block is not the loop latch", |
1063 | 4.38k | "loop control flow is not understood by vectorizer", |
1064 | 4.38k | "CFGNotUnderstood"); |
1065 | 4.38k | if (DoExtraAnalysis) |
1066 | 1 | Result = false; |
1067 | 4.38k | else |
1068 | 4.38k | return false; |
1069 | 101k | } |
1070 | 101k | |
1071 | 101k | return Result; |
1072 | 101k | } |
1073 | | |
1074 | | bool LoopVectorizationLegality::canVectorizeLoopNestCFG( |
1075 | 141k | Loop *Lp, bool UseVPlanNativePath) { |
1076 | 141k | // Store the result and return it at the end instead of exiting early, in case |
1077 | 141k | // allowExtraAnalysis is used to report multiple reasons for not vectorizing. |
1078 | 141k | bool Result = true; |
1079 | 141k | bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); |
1080 | 141k | if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) { |
1081 | 39.3k | if (DoExtraAnalysis) |
1082 | 1 | Result = false; |
1083 | 39.3k | else |
1084 | 39.3k | return false; |
1085 | 101k | } |
1086 | 101k | |
1087 | 101k | // Recursively check whether the loop control flow of nested loops is |
1088 | 101k | // understood. |
1089 | 101k | for (Loop *SubLp : *Lp) |
1090 | 7 | if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) { |
1091 | 0 | if (DoExtraAnalysis) |
1092 | 0 | Result = false; |
1093 | 0 | else |
1094 | 0 | return false; |
1095 | 0 | } |
1096 | 101k | |
1097 | 101k | return Result; |
1098 | 101k | } |
1099 | | |
1100 | 141k | bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { |
1101 | 141k | // Store the result and return it at the end instead of exiting early, in case |
1102 | 141k | // allowExtraAnalysis is used to report multiple reasons for not vectorizing. |
1103 | 141k | bool Result = true; |
1104 | 141k | |
1105 | 141k | bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); |
1106 | 141k | // Check whether the loop-related control flow in the loop nest is expected by |
1107 | 141k | // vectorizer. |
1108 | 141k | if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) { |
1109 | 39.3k | if (DoExtraAnalysis) |
1110 | 1 | Result = false; |
1111 | 39.3k | else |
1112 | 39.3k | return false; |
1113 | 101k | } |
1114 | 101k | |
1115 | 101k | // We need to have a loop header. |
1116 | 101k | LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() |
1117 | 101k | << '\n'); |
1118 | 101k | |
1119 | 101k | // Specific checks for outer loops. We skip the remaining legal checks at this |
1120 | 101k | // point because they don't support outer loops. |
1121 | 101k | if (!TheLoop->empty()) { |
1122 | 7 | assert(UseVPlanNativePath && "VPlan-native path is not enabled."); |
1123 | 7 | |
1124 | 7 | if (!canVectorizeOuterLoop()) { |
1125 | 0 | reportVectorizationFailure("Unsupported outer loop", |
1126 | 0 | "unsupported outer loop", |
1127 | 0 | "UnsupportedOuterLoop"); |
1128 | 0 | // TODO: Implement DoExtraAnalysis when subsequent legal checks support |
1129 | 0 | // outer loops. |
1130 | 0 | return false; |
1131 | 0 | } |
1132 | 7 | |
1133 | 7 | LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n"); |
1134 | 7 | return Result; |
1135 | 7 | } |
1136 | 101k | |
1137 | 101k | assert(TheLoop->empty() && "Inner loop expected."); |
1138 | 101k | // Check if we can if-convert non-single-bb loops. |
1139 | 101k | unsigned NumBlocks = TheLoop->getNumBlocks(); |
1140 | 101k | if (NumBlocks != 1 && !canVectorizeWithIfConvert()12.0k ) { |
1141 | 9.13k | LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); |
1142 | 9.13k | if (DoExtraAnalysis) |
1143 | 5 | Result = false; |
1144 | 9.12k | else |
1145 | 9.12k | return false; |
1146 | 92.7k | } |
1147 | 92.7k | |
1148 | 92.7k | // Check if we can vectorize the instructions and CFG in this loop. |
1149 | 92.7k | if (!canVectorizeInstrs()) { |
1150 | 65.3k | LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); |
1151 | 65.3k | if (DoExtraAnalysis) |
1152 | 9 | Result = false; |
1153 | 65.3k | else |
1154 | 65.3k | return false; |
1155 | 27.4k | } |
1156 | 27.4k | |
1157 | 27.4k | // Go over each instruction and look at memory deps. |
1158 | 27.4k | if (!canVectorizeMemory()) { |
1159 | 7.43k | LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); |
1160 | 7.43k | if (DoExtraAnalysis) |
1161 | 35 | Result = false; |
1162 | 7.40k | else |
1163 | 7.40k | return false; |
1164 | 20.0k | } |
1165 | 20.0k | |
1166 | 20.0k | LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop" |
1167 | 20.0k | << (LAI->getRuntimePointerChecking()->Need |
1168 | 20.0k | ? " (with a runtime bound check)" |
1169 | 20.0k | : "") |
1170 | 20.0k | << "!\n"); |
1171 | 20.0k | |
1172 | 20.0k | unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; |
1173 | 20.0k | if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) |
1174 | 31 | SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; |
1175 | 20.0k | |
1176 | 20.0k | if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { |
1177 | 5 | reportVectorizationFailure("Too many SCEV checks needed", |
1178 | 5 | "Too many SCEV assumptions need to be made and checked at runtime", |
1179 | 5 | "TooManySCEVRunTimeChecks"); |
1180 | 5 | if (DoExtraAnalysis) |
1181 | 0 | Result = false; |
1182 | 5 | else |
1183 | 5 | return false; |
1184 | 19.9k | } |
1185 | 19.9k | |
1186 | 19.9k | // Okay! We've done all the tests. If any have failed, return false. Otherwise |
1187 | 19.9k | // we can vectorize, and at this point we don't have any other mem analysis |
1188 | 19.9k | // which may limit our maximum vectorization factor, so just return true with |
1189 | 19.9k | // no restrictions. |
1190 | 19.9k | return Result; |
1191 | 19.9k | } |
1192 | | |
1193 | 75 | bool LoopVectorizationLegality::canFoldTailByMasking() { |
1194 | 75 | |
1195 | 75 | LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n"); |
1196 | 75 | |
1197 | 75 | if (!PrimaryInduction) { |
1198 | 33 | reportVectorizationFailure( |
1199 | 33 | "No primary induction, cannot fold tail by masking", |
1200 | 33 | "Missing a primary induction variable in the loop, which is " |
1201 | 33 | "needed in order to fold tail by masking as required.", |
1202 | 33 | "NoPrimaryInduction"); |
1203 | 33 | return false; |
1204 | 33 | } |
1205 | 42 | |
1206 | 42 | // TODO: handle reductions when tail is folded by masking. |
1207 | 42 | if (!Reductions.empty()) { |
1208 | 15 | reportVectorizationFailure( |
1209 | 15 | "Loop has reductions, cannot fold tail by masking", |
1210 | 15 | "Cannot fold tail by masking in the presence of reductions.", |
1211 | 15 | "ReductionFoldingTailByMasking"); |
1212 | 15 | return false; |
1213 | 15 | } |
1214 | 27 | |
1215 | 27 | // TODO: handle outside users when tail is folded by masking. |
1216 | 60 | for (auto *AE : AllowedExit)27 { |
1217 | 60 | // Check that all users of allowed exit values are inside the loop. |
1218 | 182 | for (User *U : AE->users()) { |
1219 | 182 | Instruction *UI = cast<Instruction>(U); |
1220 | 182 | if (TheLoop->contains(UI)) |
1221 | 181 | continue; |
1222 | 1 | reportVectorizationFailure( |
1223 | 1 | "Cannot fold tail by masking, loop has an outside user for", |
1224 | 1 | "Cannot fold tail by masking in the presence of live outs.", |
1225 | 1 | "LiveOutFoldingTailByMasking", UI); |
1226 | 1 | return false; |
1227 | 1 | } |
1228 | 60 | } |
1229 | 27 | |
1230 | 27 | // The list of pointers that we can safely read and write to remains empty. |
1231 | 27 | SmallPtrSet<Value *, 8> SafePointers; |
1232 | 26 | |
1233 | 26 | // Check and mark all blocks for predication, including those that ordinarily |
1234 | 26 | // do not need predication such as the header block. |
1235 | 44 | for (BasicBlock *BB : TheLoop->blocks()) { |
1236 | 44 | if (!blockCanBePredicated(BB, SafePointers)) { |
1237 | 0 | reportVectorizationFailure( |
1238 | 0 | "Cannot fold tail by masking as required", |
1239 | 0 | "control flow cannot be substituted for a select", |
1240 | 0 | "NoCFGForSelect", BB->getTerminator()); |
1241 | 0 | return false; |
1242 | 0 | } |
1243 | 44 | } |
1244 | 26 | |
1245 | 26 | LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); |
1246 | 26 | return true; |
1247 | 26 | } |
1248 | | |
1249 | | } // namespace llvm |