Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Line | Count | Source
1
//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10
// and generates target-independent LLVM-IR.
11
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12
// of instructions in order to estimate the profitability of vectorization.
13
//
14
// The loop vectorizer combines consecutive loop iterations into a single
15
// 'wide' iteration. After this transformation the index is incremented
16
// by the SIMD vector width, and not by one.
17
//
18
// This pass has four parts:
19
// 1. The main loop pass that drives the different parts.
20
// 2. LoopVectorizationLegality - A unit that checks for the legality
21
//    of the vectorization.
22
// 3. InnerLoopVectorizer - A unit that performs the actual
23
//    widening of instructions.
24
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25
//    of vectorization. It decides on the optimal vector width, which
26
//    can be one, if vectorization is not profitable.
27
//
28
// There is a development effort going on to migrate the loop vectorizer to the
29
// VPlan infrastructure and to introduce outer loop vectorization support (see
30
// docs/Proposal/VectorizationPlan.rst and
31
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32
// purpose, we temporarily introduced the VPlan-native vectorization path: an
33
// alternative vectorization path that is natively implemented on top of the
34
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35
//
36
//===----------------------------------------------------------------------===//
37
//
38
// The reduction-variable vectorization is based on the paper:
39
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40
//
41
// Variable uniformity checks are inspired by:
42
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43
//
44
// The interleaved access vectorization is based on the paper:
45
//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46
//  Data for SIMD
47
//
48
// Other ideas/concepts are from:
49
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50
//
51
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52
//  Vectorizing Compilers.
53
//
54
//===----------------------------------------------------------------------===//
55
56
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57
#include "LoopVectorizationPlanner.h"
58
#include "VPRecipeBuilder.h"
59
#include "VPlan.h"
60
#include "VPlanHCFGBuilder.h"
61
#include "VPlanHCFGTransforms.h"
62
#include "VPlanPredicator.h"
63
#include "llvm/ADT/APInt.h"
64
#include "llvm/ADT/ArrayRef.h"
65
#include "llvm/ADT/DenseMap.h"
66
#include "llvm/ADT/DenseMapInfo.h"
67
#include "llvm/ADT/Hashing.h"
68
#include "llvm/ADT/MapVector.h"
69
#include "llvm/ADT/None.h"
70
#include "llvm/ADT/Optional.h"
71
#include "llvm/ADT/STLExtras.h"
72
#include "llvm/ADT/SetVector.h"
73
#include "llvm/ADT/SmallPtrSet.h"
74
#include "llvm/ADT/SmallVector.h"
75
#include "llvm/ADT/Statistic.h"
76
#include "llvm/ADT/StringRef.h"
77
#include "llvm/ADT/Twine.h"
78
#include "llvm/ADT/iterator_range.h"
79
#include "llvm/Analysis/AssumptionCache.h"
80
#include "llvm/Analysis/BasicAliasAnalysis.h"
81
#include "llvm/Analysis/BlockFrequencyInfo.h"
82
#include "llvm/Analysis/CFG.h"
83
#include "llvm/Analysis/CodeMetrics.h"
84
#include "llvm/Analysis/DemandedBits.h"
85
#include "llvm/Analysis/GlobalsModRef.h"
86
#include "llvm/Analysis/LoopAccessAnalysis.h"
87
#include "llvm/Analysis/LoopAnalysisManager.h"
88
#include "llvm/Analysis/LoopInfo.h"
89
#include "llvm/Analysis/LoopIterator.h"
90
#include "llvm/Analysis/MemorySSA.h"
91
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
92
#include "llvm/Analysis/ProfileSummaryInfo.h"
93
#include "llvm/Analysis/ScalarEvolution.h"
94
#include "llvm/Analysis/ScalarEvolutionExpander.h"
95
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96
#include "llvm/Analysis/TargetLibraryInfo.h"
97
#include "llvm/Analysis/TargetTransformInfo.h"
98
#include "llvm/Analysis/VectorUtils.h"
99
#include "llvm/IR/Attributes.h"
100
#include "llvm/IR/BasicBlock.h"
101
#include "llvm/IR/CFG.h"
102
#include "llvm/IR/Constant.h"
103
#include "llvm/IR/Constants.h"
104
#include "llvm/IR/DataLayout.h"
105
#include "llvm/IR/DebugInfoMetadata.h"
106
#include "llvm/IR/DebugLoc.h"
107
#include "llvm/IR/DerivedTypes.h"
108
#include "llvm/IR/DiagnosticInfo.h"
109
#include "llvm/IR/Dominators.h"
110
#include "llvm/IR/Function.h"
111
#include "llvm/IR/IRBuilder.h"
112
#include "llvm/IR/InstrTypes.h"
113
#include "llvm/IR/Instruction.h"
114
#include "llvm/IR/Instructions.h"
115
#include "llvm/IR/IntrinsicInst.h"
116
#include "llvm/IR/Intrinsics.h"
117
#include "llvm/IR/LLVMContext.h"
118
#include "llvm/IR/Metadata.h"
119
#include "llvm/IR/Module.h"
120
#include "llvm/IR/Operator.h"
121
#include "llvm/IR/Type.h"
122
#include "llvm/IR/Use.h"
123
#include "llvm/IR/User.h"
124
#include "llvm/IR/Value.h"
125
#include "llvm/IR/ValueHandle.h"
126
#include "llvm/IR/Verifier.h"
127
#include "llvm/Pass.h"
128
#include "llvm/Support/Casting.h"
129
#include "llvm/Support/CommandLine.h"
130
#include "llvm/Support/Compiler.h"
131
#include "llvm/Support/Debug.h"
132
#include "llvm/Support/ErrorHandling.h"
133
#include "llvm/Support/MathExtras.h"
134
#include "llvm/Support/raw_ostream.h"
135
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
136
#include "llvm/Transforms/Utils/LoopSimplify.h"
137
#include "llvm/Transforms/Utils/LoopUtils.h"
138
#include "llvm/Transforms/Utils/LoopVersioning.h"
139
#include "llvm/Transforms/Utils/SizeOpts.h"
140
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141
#include <algorithm>
142
#include <cassert>
143
#include <cstdint>
144
#include <cstdlib>
145
#include <functional>
146
#include <iterator>
147
#include <limits>
148
#include <memory>
149
#include <string>
150
#include <tuple>
151
#include <utility>
152
#include <vector>
153
154
using namespace llvm;
155
156
24
#define LV_NAME "loop-vectorize"
157
#define DEBUG_TYPE LV_NAME
158
159
/// @{
160
/// Metadata attribute names
161
static const char *const LLVMLoopVectorizeFollowupAll =
162
    "llvm.loop.vectorize.followup_all";
163
static const char *const LLVMLoopVectorizeFollowupVectorized =
164
    "llvm.loop.vectorize.followup_vectorized";
165
static const char *const LLVMLoopVectorizeFollowupEpilogue =
166
    "llvm.loop.vectorize.followup_epilogue";
167
/// @}
168
169
STATISTIC(LoopsVectorized, "Number of loops vectorized");
170
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171
172
/// Loops with a known constant trip count below this number are vectorized only
173
/// if no scalar iteration overheads are incurred.
174
static cl::opt<unsigned> TinyTripCountVectorThreshold(
175
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176
    cl::desc("Loops with a constant trip count that is smaller than this "
177
             "value are vectorized only if no scalar iteration overheads "
178
             "are incurred."));
179
180
static cl::opt<bool> MaximizeBandwidth(
181
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
182
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
183
             "will be determined by the smallest type in loop."));
184
185
static cl::opt<bool> EnableInterleavedMemAccesses(
186
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
187
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
188
189
/// An interleave-group may need masking if it resides in a block that needs
190
/// predication, or in order to mask away gaps. 
191
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
192
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
193
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
194
195
/// We don't interleave loops with a known constant trip count below this
196
/// number.
197
static const unsigned TinyTripCountInterleaveThreshold = 128;
198
199
static cl::opt<unsigned> ForceTargetNumScalarRegs(
200
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
201
    cl::desc("A flag that overrides the target's number of scalar registers."));
202
203
static cl::opt<unsigned> ForceTargetNumVectorRegs(
204
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
205
    cl::desc("A flag that overrides the target's number of vector registers."));
206
207
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
208
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
209
    cl::desc("A flag that overrides the target's max interleave factor for "
210
             "scalar loops."));
211
212
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
213
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
214
    cl::desc("A flag that overrides the target's max interleave factor for "
215
             "vectorized loops."));
216
217
static cl::opt<unsigned> ForceTargetInstructionCost(
218
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
219
    cl::desc("A flag that overrides the target's expected cost for "
220
             "an instruction to a single constant value. Mostly "
221
             "useful for getting consistent testing."));
222
223
static cl::opt<unsigned> SmallLoopCost(
224
    "small-loop-cost", cl::init(20), cl::Hidden,
225
    cl::desc(
226
        "The cost of a loop that is considered 'small' by the interleaver."));
227
228
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
229
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
230
    cl::desc("Enable the use of the block frequency analysis to access PGO "
231
             "heuristics minimizing code growth in cold regions and being more "
232
             "aggressive in hot regions."));
233
234
// Runtime interleave loops for load/store throughput.
235
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
236
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
237
    cl::desc(
238
        "Enable runtime interleaving until load/store ports are saturated"));
239
240
/// The number of stores in a loop that are allowed to need predication.
241
static cl::opt<unsigned> NumberOfStoresToPredicate(
242
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
243
    cl::desc("Max number of stores to be predicated behind an if."));
244
245
static cl::opt<bool> EnableIndVarRegisterHeur(
246
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
247
    cl::desc("Count the induction variable only once when interleaving"));
248
249
static cl::opt<bool> EnableCondStoresVectorization(
250
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
251
    cl::desc("Enable if predication of stores during vectorization."));
252
253
static cl::opt<unsigned> MaxNestedScalarReductionIC(
254
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
255
    cl::desc("The maximum interleave count to use when interleaving a scalar "
256
             "reduction in a nested loop."));
257
258
cl::opt<bool> EnableVPlanNativePath(
259
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
260
    cl::desc("Enable VPlan-native vectorization path with "
261
             "support for outer loop vectorization."));
262
263
// FIXME: Remove this switch once we have divergence analysis. Currently we
264
// assume divergent non-backedge branches when this switch is true.
265
cl::opt<bool> EnableVPlanPredication(
266
    "enable-vplan-predication", cl::init(false), cl::Hidden,
267
    cl::desc("Enable VPlan-native vectorization path predicator with "
268
             "support for outer loop vectorization."));
269
270
// This flag enables the stress testing of the VPlan H-CFG construction in the
271
// VPlan-native vectorization path. It must be used in conjunction with
272
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
273
// verification of the H-CFGs built.
274
static cl::opt<bool> VPlanBuildStressTest(
275
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
276
    cl::desc(
277
        "Build VPlan for every supported loop nest in the function and bail "
278
        "out right after the build (stress test the VPlan H-CFG construction "
279
        "in the VPlan-native vectorization path)."));
280
281
cl::opt<bool> llvm::EnableLoopInterleaving(
282
    "interleave-loops", cl::init(true), cl::Hidden,
283
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
284
cl::opt<bool> llvm::EnableLoopVectorization(
285
    "vectorize-loops", cl::init(true), cl::Hidden,
286
    cl::desc("Run the Loop vectorization passes"));
287
288
/// A helper function for converting Scalar types to vector types.
289
/// If the incoming type is void, we return void. If the VF is 1, we return
290
/// the scalar type.
291
633k
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
292
633k
  if (Scalar->isVoidTy() || VF == 1)
293
226k
    return Scalar;
294
406k
  return VectorType::get(Scalar, VF);
295
406k
}
296
297
/// A helper function that returns the type of loaded or stored value.
298
260k
static Type *getMemInstValueType(Value *I) {
299
260k
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
300
260k
         "Expected Load or Store instruction");
301
260k
  if (auto *LI = dyn_cast<LoadInst>(I))
302
125k
    return LI->getType();
303
134k
  return cast<StoreInst>(I)->getValueOperand()->getType();
304
134k
}
305
306
/// A helper function that returns true if the given type is irregular. The
307
/// type is irregular if its allocated size doesn't equal the store size of an
308
/// element of the corresponding vector type at the given vectorization factor.
309
45.6k
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
310
45.6k
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
311
45.6k
  // with a <VF x Ty> vector.
312
45.6k
  if (VF > 1) {
313
45.6k
    auto *VectorTy = VectorType::get(Ty, VF);
314
45.6k
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
315
45.6k
  }
316
0
317
0
  // If the vectorization factor is one, we just check if an array of type Ty
318
0
  // requires padding between elements.
319
0
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
320
0
}
321
322
/// A helper function that returns the reciprocal of the block probability of
323
/// predicated blocks. If we return X, we are assuming the predicated block
324
/// will execute once for every X iterations of the loop header.
325
///
326
/// TODO: We should use actual block probability here, if available. Currently,
327
///       we always assume predicated blocks have a 50% chance of executing.
328
4.29k
static unsigned getReciprocalPredBlockProb() { return 2; }
329
330
/// A helper function that adds a 'fast' flag to floating-point operations.
331
151k
static Value *addFastMathFlag(Value *V) {
332
151k
  if (isa<FPMathOperator>(V))
333
68
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
334
151k
  return V;
335
151k
}
336
337
1.19k
static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
338
1.19k
  if (isa<FPMathOperator>(V))
339
47
    cast<Instruction>(V)->setFastMathFlags(FMF);
340
1.19k
  return V;
341
1.19k
}
342
343
/// A helper function that returns an integer or floating-point constant with
344
/// value C.
345
70.4k
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
346
70.4k
  return Ty->isIntegerTy() ? 
ConstantInt::getSigned(Ty, C)70.4k
347
70.4k
                           : 
ConstantFP::get(Ty, C)34
;
348
70.4k
}
349
350
namespace llvm {
351
352
/// InnerLoopVectorizer vectorizes loops which contain only one basic
353
/// block to a specified vectorization factor (VF).
354
/// This class performs the widening of scalars into vectors, or multiple
355
/// scalars. This class also implements the following features:
356
/// * It inserts an epilogue loop for handling loops that don't have iteration
357
///   counts that are known to be a multiple of the vectorization factor.
358
/// * It handles the code generation for reduction variables.
359
/// * Scalarization (implementation using scalars) of un-vectorizable
360
///   instructions.
361
/// InnerLoopVectorizer does not perform any vectorization-legality
362
/// checks, and relies on the caller to check for the different legality
363
/// aspects. The InnerLoopVectorizer relies on the
364
/// LoopVectorizationLegality class to provide information about the induction
365
/// and reduction variables that were found to a given vectorization factor.
366
class InnerLoopVectorizer {
367
public:
368
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
369
                      LoopInfo *LI, DominatorTree *DT,
370
                      const TargetLibraryInfo *TLI,
371
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
372
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
373
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
374
                      LoopVectorizationCostModel *CM)
375
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
376
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
377
        Builder(PSE.getSE()->getContext()),
378
17.0k
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
379
17.0k
  virtual ~InnerLoopVectorizer() = default;
380
381
  /// Create a new empty loop. Unlink the old loop and connect the new one.
382
  /// Return the pre-header block of the new loop.
383
  BasicBlock *createVectorizedLoopSkeleton();
384
385
  /// Widen a single instruction within the innermost loop.
386
  void widenInstruction(Instruction &I);
387
388
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
389
  void fixVectorizedLoop();
390
391
  // Return true if any runtime check is added.
392
15.3k
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }
393
394
  /// A type for vectorized values in the new loop. Each value from the
395
  /// original loop, when vectorized, is represented by UF vector values in the
396
  /// new unrolled loop, where UF is the unroll factor.
397
  using VectorParts = SmallVector<Value *, 2>;
398
399
  /// Vectorize a single PHINode in a block. This method handles the induction
400
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
401
  /// arbitrary length vectors.
402
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
403
404
  /// A helper function to scalarize a single Instruction in the innermost loop.
405
  /// Generates a sequence of scalar instances for each lane between \p MinLane
406
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
407
  /// inclusive.
408
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
409
                            bool IfPredicateInstr);
410
411
  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
412
  /// is provided, the integer induction variable will first be truncated to
413
  /// the corresponding type.
414
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
415
416
  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
417
  /// vector or scalar value on-demand if one is not yet available. When
418
  /// vectorizing a loop, we visit the definition of an instruction before its
419
  /// uses. When visiting the definition, we either vectorize or scalarize the
420
  /// instruction, creating an entry for it in the corresponding map. (In some
421
  /// cases, such as induction variables, we will create both vector and scalar
422
  /// entries.) Then, as we encounter uses of the definition, we derive values
423
  /// for each scalar or vector use unless such a value is already available.
424
  /// For example, if we scalarize a definition and one of its uses is vector,
425
  /// we build the required vector on-demand with an insertelement sequence
426
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
427
  /// existing scalar definition.
428
  ///
429
  /// Return a value in the new loop corresponding to \p V from the original
430
  /// loop at unroll index \p Part. If the value has already been vectorized,
431
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
432
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
433
  /// a new vector value on-demand by inserting the scalar values into a vector
434
  /// with an insertelement sequence. If the value has been neither vectorized
435
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
436
  /// value into a vector.
437
  Value *getOrCreateVectorValue(Value *V, unsigned Part);
438
439
  /// Return a value in the new loop corresponding to \p V from the original
440
  /// loop at unroll and vector indices \p Instance. If the value has been
441
  /// vectorized but not scalarized, the necessary extractelement instruction
442
  /// will be generated.
443
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
444
445
  /// Construct the vector value of a scalarized value \p V one lane at a time.
446
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
447
448
  /// Try to vectorize the interleaved access group that \p Instr belongs to,
449
  /// optionally masking the vector operations if \p BlockInMask is non-null.
450
  void vectorizeInterleaveGroup(Instruction *Instr,
451
                                VectorParts *BlockInMask = nullptr);
452
453
  /// Vectorize Load and Store instructions, optionally masking the vector
454
  /// operations if \p BlockInMask is non-null.
455
  void vectorizeMemoryInstruction(Instruction *Instr,
456
                                  VectorParts *BlockInMask = nullptr);
457
458
  /// Set the debug location in the builder using the debug location in
459
  /// the instruction.
460
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
461
462
  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
463
  void fixNonInductionPHIs(void);
464
465
protected:
466
  friend class LoopVectorizationPlanner;
467
468
  /// A small list of PHINodes.
469
  using PhiVector = SmallVector<PHINode *, 4>;
470
471
  /// A type for scalarized values in the new loop. Each value from the
472
  /// original loop, when scalarized, is represented by UF x VF scalar values
473
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
474
  /// vectorization factor.
475
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
476
477
  /// Set up the values of the IVs correctly when exiting the vector loop.
478
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
479
                    Value *CountRoundDown, Value *EndValue,
480
                    BasicBlock *MiddleBlock);
481
482
  /// Create a new induction variable inside L.
483
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
484
                                   Value *Step, Instruction *DL);
485
486
  /// Handle all cross-iteration phis in the header.
487
  void fixCrossIterationPHIs();
488
489
  /// Fix a first-order recurrence. This is the second phase of vectorizing
490
  /// this phi node.
491
  void fixFirstOrderRecurrence(PHINode *Phi);
492
493
  /// Fix a reduction cross-iteration phi. This is the second phase of
494
  /// vectorizing this phi node.
495
  void fixReduction(PHINode *Phi);
496
497
  /// The Loop exit block may have single value PHI nodes with some
498
  /// incoming value. While vectorizing we only handled real values
499
  /// that were defined inside the loop and we should have one value for
500
  /// each predecessor of its parent basic block. See PR14725.
501
  void fixLCSSAPHIs();
502
503
  /// Iteratively sink the scalarized operands of a predicated instruction into
504
  /// the block that was created for it.
505
  void sinkScalarOperands(Instruction *PredInst);
506
507
  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
508
  /// represented as.
509
  void truncateToMinimalBitwidths();
510
511
  /// Insert the new loop to the loop hierarchy and pass manager
512
  /// and update the analysis passes.
513
  void updateAnalysis();
514
515
  /// Create a broadcast instruction. This method generates a broadcast
516
  /// instruction (shuffle) for loop invariant values and for the induction
517
  /// value. If this is the induction variable then we extend it to N, N+1, ...
518
  /// this is needed because each iteration in the loop corresponds to a SIMD
519
  /// element.
520
  virtual Value *getBroadcastInstrs(Value *V);
521
522
  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
523
  /// to each vector element of Val. The sequence starts at StartIndex.
524
  /// \p Opcode is relevant for FP induction variable.
525
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
526
                               Instruction::BinaryOps Opcode =
527
                               Instruction::BinaryOpsEnd);
528
529
  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
530
  /// variable on which to base the steps, \p Step is the size of the step, and
531
  /// \p EntryVal is the value from the original loop that maps to the steps.
532
  /// Note that \p EntryVal doesn't have to be an induction variable - it
533
  /// can also be a truncate instruction.
534
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
535
                        const InductionDescriptor &ID);
536
537
  /// Create a vector induction phi node based on an existing scalar one. \p
538
  /// EntryVal is the value from the original loop that maps to the vector phi
539
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
540
  /// truncate instruction, instead of widening the original IV, we widen a
541
  /// version of the IV truncated to \p EntryVal's type.
542
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
543
                                       Value *Step, Instruction *EntryVal);
544
545
  /// Returns true if an instruction \p I should be scalarized instead of
546
  /// vectorized for the chosen vectorization factor.
547
  bool shouldScalarizeInstruction(Instruction *I) const;
548
549
  /// Returns true if we should generate a scalar version of \p IV.
550
  bool needsScalarInduction(Instruction *IV) const;
551
552
  /// If there is a cast involved in the induction variable \p ID, which should
553
  /// be ignored in the vectorized loop body, this function records the
554
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
555
  /// cast. We had already proved that the casted Phi is equal to the uncasted
556
  /// Phi in the vectorized loop (under a runtime guard), and therefore
557
  /// there is no need to vectorize the cast - the same value can be used in the
558
  /// vector loop for both the Phi and the cast.
559
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
560
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
561
  ///
562
  /// \p EntryVal is the value from the original loop that maps to the vector
563
  /// phi node and is used to distinguish what is the IV currently being
564
  /// processed - original one (if \p EntryVal is a phi corresponding to the
565
  /// original IV) or the "newly-created" one based on the proof mentioned above
566
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
567
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
568
  /// that IV, but it's error-prone to expect callers of this routine to care
569
  /// about that, hence this explicit parameter.
570
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
571
                                             const Instruction *EntryVal,
572
                                             Value *VectorLoopValue,
573
                                             unsigned Part,
574
                                             unsigned Lane = UINT_MAX);
575
576
  /// Generate a shuffle sequence that will reverse the vector Vec.
577
  virtual Value *reverseVector(Value *Vec);
578
579
  /// Returns (and creates if needed) the original loop trip count.
580
  Value *getOrCreateTripCount(Loop *NewLoop);
581
582
  /// Returns (and creates if needed) the trip count of the widened loop.
583
  Value *getOrCreateVectorTripCount(Loop *NewLoop);
584
585
  /// Returns a bitcasted value to the requested vector type.
586
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
587
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
588
                                const DataLayout &DL);
589
590
  /// Emit a bypass check to see if the vector trip count is zero, including if
591
  /// it overflows.
592
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
593
594
  /// Emit a bypass check to see if all of the SCEV assumptions we've
595
  /// had to make are correct.
596
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
597
598
  /// Emit bypass checks to check any memory assumptions we may have made.
599
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
600
601
  /// Compute the transformed value of Index at offset StartValue using step
602
  /// StepValue.
603
  /// For integer induction, returns StartValue + Index * StepValue.
604
  /// For pointer induction, returns StartValue[Index * StepValue].
605
  /// FIXME: The newly created binary instructions should contain nsw/nuw
606
  /// flags, which can be found from the original scalar operations.
607
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
608
                              const DataLayout &DL,
609
                              const InductionDescriptor &ID) const;
610
611
  /// Add additional metadata to \p To that was not present on \p Orig.
612
  ///
613
  /// Currently this is used to add the noalias annotations based on the
614
  /// inserted memchecks.  Use this for instructions that are *cloned* into the
615
  /// vector loop.
616
  void addNewMetadata(Instruction *To, const Instruction *Orig);
617
618
  /// Add metadata from one instruction to another.
619
  ///
620
  /// This includes both the original MDs from \p From and additional ones (\see
621
  /// addNewMetadata).  Use this for *newly created* instructions in the vector
622
  /// loop.
623
  void addMetadata(Instruction *To, Instruction *From);
624
625
  /// Similar to the previous function but it adds the metadata to a
626
  /// vector of instructions.
627
  void addMetadata(ArrayRef<Value *> To, Instruction *From);
628
629
  /// The original loop.
630
  Loop *OrigLoop;
631
632
  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
633
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
634
  /// more usable form.
635
  PredicatedScalarEvolution &PSE;
636
637
  /// Loop Info.
638
  LoopInfo *LI;
639
640
  /// Dominator Tree.
641
  DominatorTree *DT;
642
643
  /// Alias Analysis.
644
  AliasAnalysis *AA;
645
646
  /// Target Library Info.
647
  const TargetLibraryInfo *TLI;
648
649
  /// Target Transform Info.
650
  const TargetTransformInfo *TTI;
651
652
  /// Assumption Cache.
653
  AssumptionCache *AC;
654
655
  /// Interface to emit optimization remarks.
656
  OptimizationRemarkEmitter *ORE;
657
658
  /// LoopVersioning.  It's only set up (non-null) if memchecks were
659
  /// used.
660
  ///
661
  /// This is currently only used to add no-alias metadata based on the
662
  /// memchecks.  The actual versioning is performed manually.
663
  std::unique_ptr<LoopVersioning> LVer;
664
665
  /// The vectorization SIMD factor to use. Each vector will have this many
666
  /// vector elements.
667
  unsigned VF;
668
669
  /// The vectorization unroll factor to use. Each scalar is vectorized to this
670
  /// many different vector instructions.
671
  unsigned UF;
672
673
  /// The builder that we use
674
  IRBuilder<> Builder;
675
676
  // --- Vectorization state ---
677
678
  /// The vector-loop preheader.
679
  BasicBlock *LoopVectorPreHeader;
680
681
  /// The scalar-loop preheader.
682
  BasicBlock *LoopScalarPreHeader;
683
684
  /// Middle Block between the vector and the scalar.
685
  BasicBlock *LoopMiddleBlock;
686
687
  /// The ExitBlock of the scalar loop.
688
  BasicBlock *LoopExitBlock;
689
690
  /// The vector loop body.
691
  BasicBlock *LoopVectorBody;
692
693
  /// The scalar loop body.
694
  BasicBlock *LoopScalarBody;
695
696
  /// A list of all bypass blocks. The first block is the entry of the loop.
697
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;
698
699
  /// The new Induction variable which was added to the new block.
700
  PHINode *Induction = nullptr;
701
702
  /// The induction variable of the old basic block.
703
  PHINode *OldInduction = nullptr;
704
705
  /// Maps values from the original loop to their corresponding values in the
706
  /// vectorized loop. A key value can map to either vector values, scalar
707
  /// values or both kinds of values, depending on whether the key was
708
  /// vectorized and scalarized.
709
  VectorizerValueMap VectorLoopValueMap;
710
711
  /// Store instructions that were predicated.
712
  SmallVector<Instruction *, 4> PredicatedInstructions;
713
714
  /// Trip count of the original loop.
715
  Value *TripCount = nullptr;
716
717
  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
718
  Value *VectorTripCount = nullptr;
719
720
  /// The legality analysis.
721
  LoopVectorizationLegality *Legal;
722
723
  /// The profitability analysis.
724
  LoopVectorizationCostModel *Cost;
725
726
  // Record whether runtime checks are added.
727
  bool AddedSafetyChecks = false;
728
729
  // Holds the end values for each induction variable. We save the end values
730
  // so we can later fix-up the external users of the induction variables.
731
  DenseMap<PHINode *, Value *> IVEndValues;
732
733
  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
734
  // fixed up at the end of vector code generation.
735
  SmallVector<PHINode *, 8> OrigPHIsToFix;
736
};
737
738
class InnerLoopUnroller : public InnerLoopVectorizer {
739
public:
740
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
741
                    LoopInfo *LI, DominatorTree *DT,
742
                    const TargetLibraryInfo *TLI,
743
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
744
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
745
                    LoopVectorizationLegality *LVL,
746
                    LoopVectorizationCostModel *CM)
747
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
748
1.74k
                            UnrollFactor, LVL, CM) {}
749
750
private:
751
  Value *getBroadcastInstrs(Value *V) override;
752
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
753
                       Instruction::BinaryOps Opcode =
754
                       Instruction::BinaryOpsEnd) override;
755
  Value *reverseVector(Value *Vec) override;
756
};
757
758
} // end namespace llvm
759
760
/// Look for a meaningful debug location on the instruction or its
761
/// operands.
762
34.1k
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
763
34.1k
  if (!I)
764
7.56k
    return I;
765
26.5k
766
26.5k
  DebugLoc Empty;
767
26.5k
  if (I->getDebugLoc() != Empty)
768
4
    return I;
769
26.5k
770
78.0k
  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
771
52.3k
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
772
26.5k
      if (OpInst->getDebugLoc() != Empty)
773
928
        return OpInst;
774
52.3k
  }
775
26.5k
776
26.5k
  return I;
777
26.5k
}
778
779
186k
void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
780
186k
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
781
178k
    const DILocation *DIL = Inst->getDebugLoc();
782
178k
    if (DIL && 
Inst->getFunction()->isDebugInfoForProfiling()8.93k
&&
783
178k
        
!isa<DbgInfoIntrinsic>(Inst)34
) {
784
34
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
785
34
      if (NewDIL)
786
34
        B.SetCurrentDebugLocation(NewDIL.getValue());
787
34
      else
788
34
        LLVM_DEBUG(dbgs()
789
34
                   << "Failed to create new discriminator: "
790
34
                   << DIL->getFilename() << " Line: " << DIL->getLine());
791
34
    }
792
178k
    else
793
178k
      B.SetCurrentDebugLocation(DIL);
794
178k
  } else
795
8.50k
    B.SetCurrentDebugLocation(DebugLoc());
796
186k
}
797
798
#ifndef NDEBUG
799
/// \return string containing a file name and a line # for the given loop.
800
static std::string getDebugLocString(const Loop *L) {
801
  std::string Result;
802
  if (L) {
803
    raw_string_ostream OS(Result);
804
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
805
      LoopDbgLoc.print(OS);
806
    else
807
      // Just print the module name.
808
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
809
    OS.flush();
810
  }
811
  return Result;
812
}
813
#endif
814
815
void InnerLoopVectorizer::addNewMetadata(Instruction *To,
816
210k
                                         const Instruction *Orig) {
817
210k
  // If the loop was versioned with memchecks, add the corresponding no-alias
818
210k
  // metadata.
819
210k
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
820
11.9k
    LVer->annotateInstWithNoAlias(To, Orig);
821
210k
}
822
823
void InnerLoopVectorizer::addMetadata(Instruction *To,
824
138k
                                      Instruction *From) {
825
138k
  propagateMetadata(To, From);
826
138k
  addNewMetadata(To, From);
827
138k
}
828
829
void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
830
105k
                                      Instruction *From) {
831
105k
  for (Value *V : To) {
832
105k
    if (Instruction *I = dyn_cast<Instruction>(V))
833
105k
      addMetadata(I, From);
834
105k
  }
835
105k
}
836
837
namespace llvm {
838
839
/// LoopVectorizationCostModel - estimates the expected speedups due to
840
/// vectorization.
841
/// In many cases vectorization is not profitable. This can happen because of
842
/// a number of reasons. In this class we mainly attempt to predict the
843
/// expected speedup/slowdowns due to the supported instruction set. We use the
844
/// TargetTransformInfo to query the different backends for the cost of
845
/// different operations.
846
class LoopVectorizationCostModel {
847
public:
848
  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
849
                             LoopInfo *LI, LoopVectorizationLegality *Legal,
850
                             const TargetTransformInfo &TTI,
851
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
852
                             AssumptionCache *AC,
853
                             OptimizationRemarkEmitter *ORE, const Function *F,
854
                             const LoopVectorizeHints *Hints,
855
                             InterleavedAccessInfo &IAI)
856
      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
857
19.9k
    AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
858
859
  /// \return An upper bound for the vectorization factor, or None if
860
  /// vectorization and interleaving should be avoided up front.
861
  Optional<unsigned> computeMaxVF(bool OptForSize);
862
863
  /// \return The most profitable vectorization factor and the cost of that VF.
864
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
865
  /// then this vectorization factor will be selected if vectorization is
866
  /// possible.
867
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
868
869
  /// Setup cost-based decisions for user vectorization factor.
870
649
  void selectUserVectorizationFactor(unsigned UserVF) {
871
649
    collectUniformsAndScalars(UserVF);
872
649
    collectInstsToScalarize(UserVF);
873
649
  }
874
875
  /// \return The size (in bits) of the smallest and widest types in the code
876
  /// that needs to be vectorized. We ignore values that remain scalar such as
877
  /// 64 bit loop indices.
878
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
879
880
  /// \return The desired interleave count.
881
  /// If interleave count has been specified by metadata it will be returned.
882
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
883
  /// are the selected vectorization factor and the cost of the selected VF.
884
  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
885
                                 unsigned LoopCost);
886
887
  /// Memory access instruction may be vectorized in more than one way.
888
  /// Form of instruction after vectorization depends on cost.
889
  /// This function takes cost-based decisions for Load/Store instructions
890
  /// and collects them in a map. This decisions map is used for building
891
  /// the lists of loop-uniform and loop-scalar instructions.
892
  /// The calculated cost is saved with widening decision in order to
893
  /// avoid redundant calculations.
894
  void setCostBasedWideningDecision(unsigned VF);
895
896
  /// A struct that represents some properties of the register usage
897
  /// of a loop.
898
  struct RegisterUsage {
899
    /// Holds the number of loop invariant values that are used in the loop.
900
    unsigned LoopInvariantRegs;
901
902
    /// Holds the maximum number of concurrent live intervals in the loop.
903
    unsigned MaxLocalUsers;
904
  };
905
906
  /// \return Returns information about the register usages of the loop for the
907
  /// given vectorization factors.
908
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
909
910
  /// Collect values we want to ignore in the cost model.
911
  void collectValuesToIgnore();
912
913
  /// \returns The smallest bitwidth each instruction can be represented with.
914
  /// The vector equivalents of these instructions should be truncated to this
915
  /// type.
916
30.6k
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
917
30.6k
    return MinBWs;
918
30.6k
  }
919
920
  /// \returns True if it is more profitable to scalarize instruction \p I for
921
  /// vectorization factor \p VF.
922
492k
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
923
492k
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
924
492k
925
492k
    // Cost model is not run in the VPlan-native path - return conservative
926
492k
    // result until this changes.
927
492k
    if (EnableVPlanNativePath)
928
47
      return false;
929
492k
930
492k
    auto Scalars = InstsToScalarize.find(VF);
931
492k
    assert(Scalars != InstsToScalarize.end() &&
932
492k
           "VF not yet analyzed for scalarization profitability");
933
492k
    return Scalars->second.find(I) != Scalars->second.end();
934
492k
  }
935
936
  /// Returns true if \p I is known to be uniform after vectorization.
937
917k
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
938
917k
    if (VF == 1)
939
404k
      return true;
940
512k
941
512k
    // Cost model is not run in the VPlan-native path - return conservative
942
512k
    // result until this changes.
943
512k
    if (EnableVPlanNativePath)
944
0
      return false;
945
512k
946
512k
    auto UniformsPerVF = Uniforms.find(VF);
947
512k
    assert(UniformsPerVF != Uniforms.end() &&
948
512k
           "VF not yet analyzed for uniformity");
949
512k
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
950
512k
  }
951
952
  /// Returns true if \p I is known to be scalar after vectorization.
953
1.72M
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
954
1.72M
    if (VF == 1)
955
650k
      return true;
956
1.07M
957
1.07M
    // Cost model is not run in the VPlan-native path - return conservative
958
1.07M
    // result until this changes.
959
1.07M
    if (EnableVPlanNativePath)
960
47
      return false;
961
1.07M
962
1.07M
    auto ScalarsPerVF = Scalars.find(VF);
963
1.07M
    assert(ScalarsPerVF != Scalars.end() &&
964
1.07M
           "Scalar values are not calculated for VF");
965
1.07M
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
966
1.07M
  }
967
968
  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
969
  /// for vectorization factor \p VF.
970
757k
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
971
757k
    return VF > 1 && 
MinBWs.find(I) != MinBWs.end()296k
&&
972
757k
           
!isProfitableToScalarize(I, VF)893
&&
973
757k
           
!isScalarAfterVectorization(I, VF)893
;
974
757k
  }
975
976
  /// Decision that was taken during cost calculation for memory instruction.
977
  enum InstWidening {
978
    CM_Unknown,
979
    CM_Widen,         // For consecutive accesses with stride +1.
980
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
981
    CM_Interleave,
982
    CM_GatherScatter,
983
    CM_Scalarize
984
  };
985
986
  /// Save vectorization decision \p W and \p Cost taken by the cost model for
987
  /// instruction \p I and vector width \p VF.
988
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
989
58.3k
                           unsigned Cost) {
990
58.3k
    assert(VF >= 2 && "Expected VF >=2");
991
58.3k
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
992
58.3k
  }
993
994
  /// Save vectorization decision \p W and \p Cost taken by the cost model for
995
  /// interleaving group \p Grp and vector width \p VF.
996
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
997
2.26k
                           InstWidening W, unsigned Cost) {
998
2.26k
    assert(VF >= 2 && "Expected VF >=2");
999
2.26k
    /// Broadcast this decision to all instructions inside the group.
1000
2.26k
    /// But the cost will be assigned to one instruction only.
1001
9.05k
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1002
6.78k
      if (auto *I = Grp->getMember(i)) {
1003
5.90k
        if (Grp->getInsertPos() == I)
1004
2.26k
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1005
3.64k
        else
1006
3.64k
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1007
5.90k
      }
1008
6.78k
    }
1009
2.26k
  }
1010
1011
  /// Return the cost model decision for the given instruction \p I and vector
1012
  /// width \p VF. Return CM_Unknown if this instruction did not pass
1013
  /// through the cost modeling.
1014
248k
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1015
248k
    assert(VF >= 2 && "Expected VF >=2");
1016
248k
1017
248k
    // Cost model is not run in the VPlan-native path - return conservative
1018
248k
    // result until this changes.
1019
248k
    if (EnableVPlanNativePath)
1020
16
      return CM_GatherScatter;
1021
248k
1022
248k
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1023
248k
    auto Itr = WideningDecisions.find(InstOnVF);
1024
248k
    if (Itr == WideningDecisions.end())
1025
2.26k
      return CM_Unknown;
1026
245k
    return Itr->second.first;
1027
245k
  }
1028
1029
  /// Return the vectorization cost for the given instruction \p I and vector
1030
  /// width \p VF.
1031
64.1k
  unsigned getWideningCost(Instruction *I, unsigned VF) {
1032
64.1k
    assert(VF >= 2 && "Expected VF >=2");
1033
64.1k
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1034
64.1k
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1035
64.1k
           "The cost is not calculated");
1036
64.1k
    return WideningDecisions[InstOnVF].second;
1037
64.1k
  }
1038
1039
  /// Return True if instruction \p I is an optimizable truncate whose operand
1040
  /// is an induction variable. Such a truncate will be removed by adding a new
1041
  /// induction variable with the destination type.
1042
99.7k
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1043
99.7k
    // If the instruction is not a truncate, return false.
1044
99.7k
    auto *Trunc = dyn_cast<TruncInst>(I);
1045
99.7k
    if (!Trunc)
1046
45.2k
      return false;
1047
54.5k
1048
54.5k
    // Get the source and destination types of the truncate.
1049
54.5k
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1050
54.5k
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1051
54.5k
1052
54.5k
    // If the truncate is free for the given types, return false. Replacing a
1053
54.5k
    // free truncate with an induction variable would add an induction variable
1054
54.5k
    // update instruction to each iteration of the loop. We exclude from this
1055
54.5k
    // check the primary induction variable since it will need an update
1056
54.5k
    // instruction regardless.
1057
54.5k
    Value *Op = Trunc->getOperand(0);
1058
54.5k
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1059
16.7k
      return false;
1060
37.8k
1061
37.8k
    // If the truncated value is not an induction variable, return false.
1062
37.8k
    return Legal->isInductionPhi(Op);
1063
37.8k
  }
1064
1065
  /// Collects the instructions to scalarize for each predicated instruction in
1066
  /// the loop.
1067
  void collectInstsToScalarize(unsigned VF);
1068
1069
  /// Collect Uniform and Scalar values for the given \p VF.
1070
  /// The sets depend on CM decision for Load/Store instructions
1071
  /// that may be vectorized as interleave, gather-scatter or scalarized.
1072
197k
  void collectUniformsAndScalars(unsigned VF) {
1073
197k
    // Do the analysis once.
1074
197k
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1075
164k
      return;
1076
32.3k
    setCostBasedWideningDecision(VF);
1077
32.3k
    collectLoopUniforms(VF);
1078
32.3k
    collectLoopScalars(VF);
1079
32.3k
  }
1080
1081
  /// Returns true if the target machine supports masked store operation
1082
  /// for the given \p DataType and kind of access to \p Ptr.
1083
4.52k
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1084
4.52k
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1085
4.52k
  }
1086
1087
  /// Returns true if the target machine supports masked load operation
1088
  /// for the given \p DataType and kind of access to \p Ptr.
1089
3.64k
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1090
3.64k
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1091
3.64k
  }
1092
1093
  /// Returns true if the target machine supports masked scatter operation
1094
  /// for the given \p DataType.
1095
10.6k
  bool isLegalMaskedScatter(Type *DataType) {
1096
10.6k
    return TTI.isLegalMaskedScatter(DataType);
1097
10.6k
  }
1098
1099
  /// Returns true if the target machine supports masked gather operation
1100
  /// for the given \p DataType.
1101
13.7k
  bool isLegalMaskedGather(Type *DataType) {
1102
13.7k
    return TTI.isLegalMaskedGather(DataType);
1103
13.7k
  }
1104
1105
  /// Returns true if the target machine can represent \p V as a masked gather
1106
  /// or scatter operation.
1107
16.8k
  bool isLegalGatherOrScatter(Value *V) {
1108
16.8k
    bool LI = isa<LoadInst>(V);
1109
16.8k
    bool SI = isa<StoreInst>(V);
1110
16.8k
    if (!LI && !SI)
1111
0
      return false;
1112
16.8k
    auto *Ty = getMemInstValueType(V);
1113
16.8k
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1114
16.8k
  }
1115
1116
  /// Returns true if \p I is an instruction that will be scalarized with
1117
  /// predication. Such instructions include conditional stores and
1118
  /// instructions that may divide by zero.
1119
  /// If a non-zero VF has been calculated, we check if I will be scalarized
1120
  /// predication for that VF.
1121
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1122
1123
  // Returns true if \p I is an instruction that will be predicated either
1124
  // through scalar predication or masked load/store or masked gather/scatter.
1125
  // Superset of instructions that return true for isScalarWithPredication.
1126
16.5k
  bool isPredicatedInst(Instruction *I) {
1127
16.5k
    if (!blockNeedsPredication(I->getParent()))
1128
14.2k
      return false;
1129
2.34k
    // Loads and stores that need some form of masked operation are predicated
1130
2.34k
    // instructions.
1131
2.34k
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
1132
2.34k
      return Legal->isMaskRequired(I);
1133
0
    return isScalarWithPredication(I);
1134
0
  }
1135
1136
  /// Returns true if \p I is a memory instruction with consecutive memory
1137
  /// access that can be widened.
1138
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1139
1140
  /// Returns true if \p I is a memory instruction in an interleaved-group
1141
  /// of memory accesses that can be vectorized with wide vector loads/stores
1142
  /// and shuffles.
1143
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1144
1145
  /// Check if \p Instr belongs to any interleaved access group.
1146
20.4k
  bool isAccessInterleaved(Instruction *Instr) {
1147
20.4k
    return InterleaveInfo.isInterleaved(Instr);
1148
20.4k
  }
1149
1150
  /// Get the interleaved access group that \p Instr belongs to.
1151
  const InterleaveGroup<Instruction> *
1152
790k
  getInterleavedAccessGroup(Instruction *Instr) {
1153
790k
    return InterleaveInfo.getInterleaveGroup(Instr);
1154
790k
  }
1155
1156
  /// Returns true if an interleaved group requires a scalar iteration
1157
  /// to handle accesses with gaps, and there is nothing preventing us from
1158
  /// creating a scalar epilogue.
1159
32.3k
  bool requiresScalarEpilogue() const {
1160
32.3k
    return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
1161
32.3k
  }
1162
1163
  /// Returns true if a scalar epilogue is not allowed due to optsize.
1164
68
  bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
1165
1166
  /// Returns true if all loop blocks should be masked to fold tail loop.
1167
915k
  bool foldTailByMasking() const { return FoldTailByMasking; }
1168
1169
844k
  bool blockNeedsPredication(BasicBlock *BB) {
1170
844k
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1171
844k
  }
1172
1173
  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1174
  /// with factor VF.  Return the cost of the instruction, including
1175
  /// scalarization overhead if it's needed.
1176
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1177
1178
  /// Estimate cost of a call instruction CI if it were vectorized with factor
1179
  /// VF. Return the cost of the instruction, including scalarization overhead
1180
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1181
  /// scalarized -
1182
  /// i.e. either vector version isn't available, or is too expensive.
1183
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1184
1185
private:
1186
  unsigned NumPredStores = 0;
1187
1188
  /// \return An upper bound for the vectorization factor, larger than zero.
1189
  /// One is returned if vectorization should best be avoided due to cost.
1190
  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
1191
1192
  /// The vectorization cost is a combination of the cost itself and a boolean
1193
  /// indicating whether any of the contributing operations will actually
1194
  /// operate on
1195
  /// vector values after type legalization in the backend. If this latter value
1196
  /// is
1197
  /// false, then all operations will be scalarized (i.e. no vectorization has
1198
  /// actually taken place).
1199
  using VectorizationCostTy = std::pair<unsigned, bool>;
1200
1201
  /// Returns the expected execution cost. The unit of the cost does
1202
  /// not matter because we use the 'cost' units to compare different
1203
  /// vector widths. The cost that is returned is *not* normalized by
1204
  /// the factor width.
1205
  VectorizationCostTy expectedCost(unsigned VF);
1206
1207
  /// Returns the execution time cost of an instruction for a given vector
1208
  /// width. Vector width of one means scalar.
1209
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1210
1211
  /// The cost-computation logic from getInstructionCost which provides
1212
  /// the vector type as an output parameter.
1213
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1214
1215
  /// Calculate vectorization cost of memory instruction \p I.
1216
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1217
1218
  /// The cost computation for scalarized memory instruction.
1219
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1220
1221
  /// The cost computation for interleaving group of memory instructions.
1222
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1223
1224
  /// The cost computation for Gather/Scatter instruction.
1225
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1226
1227
  /// The cost computation for widening instruction \p I with consecutive
1228
  /// memory access.
1229
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1230
1231
  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1232
  /// Load: scalar load + broadcast.
1233
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1234
  /// element)
1235
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1236
1237
  /// Estimate the overhead of scalarizing an instruction. This is a
1238
  /// convenience wrapper for the type-based getScalarizationOverhead API.
1239
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1240
1241
  /// Returns whether the instruction is a load or store and will be emitted
1242
  /// as a vector operation.
1243
  bool isConsecutiveLoadOrStore(Instruction *I);
1244
1245
  /// Returns true if an artificially high cost for emulated masked memrefs
1246
  /// should be used.
1247
  bool useEmulatedMaskMemRefHack(Instruction *I);
1248
1249
  /// Create an analysis remark that explains why vectorization failed
1250
  ///
1251
  /// \p RemarkName is the identifier for the remark.  \return the remark object
1252
  /// that can be streamed to.
1253
103
  OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
1254
103
    return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1255
103
                                  RemarkName, TheLoop);
1256
103
  }
1257
1258
  /// Map of scalar integer values to the smallest bitwidth they can be legally
1259
  /// represented as. The vector equivalents of these values should be truncated
1260
  /// to this type.
1261
  MapVector<Instruction *, uint64_t> MinBWs;
1262
1263
  /// A type representing the costs for instructions if they were to be
1264
  /// scalarized rather than vectorized. The entries are Instruction-Cost
1265
  /// pairs.
1266
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1267
1268
  /// A set containing all BasicBlocks that are known to be present after
1269
  /// vectorization as a predicated block.
1270
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1271
1272
  /// Records whether it is allowed to have the original scalar loop execute at
1273
  /// least once. This may be needed as a fallback loop in case runtime 
1274
  /// aliasing/dependence checks fail, or to handle the tail/remainder
1275
  /// iterations when the trip count is unknown or doesn't divide by the VF,
1276
  /// or as a peel-loop to handle gaps in interleave-groups.
1277
  /// Under optsize and when the trip count is very small we don't allow any
1278
  /// iterations to execute in the scalar loop.
1279
  bool IsScalarEpilogueAllowed = true;
1280
1281
  /// All blocks of loop are to be masked to fold tail of scalar iterations.
1282
  bool FoldTailByMasking = false;
1283
1284
  /// A map holding scalar costs for different vectorization factors. The
1285
  /// presence of a cost for an instruction in the mapping indicates that the
1286
  /// instruction will be scalarized when vectorizing with the associated
1287
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1288
  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1289
1290
  /// Holds the instructions known to be uniform after vectorization.
1291
  /// The data is collected per VF.
1292
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1293
1294
  /// Holds the instructions known to be scalar after vectorization.
1295
  /// The data is collected per VF.
1296
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1297
1298
  /// Holds the instructions (address computations) that are forced to be
1299
  /// scalarized.
1300
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1301
1302
  /// Returns the expected difference in cost from scalarizing the expression
1303
  /// feeding a predicated instruction \p PredInst. The instructions to
1304
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1305
  /// non-negative return value implies the expression will be scalarized.
1306
  /// Currently, only single-use chains are considered for scalarization.
1307
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1308
                              unsigned VF);
1309
1310
  /// Collect the instructions that are uniform after vectorization. An
1311
  /// instruction is uniform if we represent it with a single scalar value in
1312
  /// the vectorized loop corresponding to each vector iteration. Examples of
1313
  /// uniform instructions include pointer operands of consecutive or
1314
  /// interleaved memory accesses. Note that although uniformity implies an
1315
  /// instruction will be scalar, the reverse is not true. In general, a
1316
  /// scalarized instruction will be represented by VF scalar values in the
1317
  /// vectorized loop, each corresponding to an iteration of the original
1318
  /// scalar loop.
1319
  void collectLoopUniforms(unsigned VF);
1320
1321
  /// Collect the instructions that are scalar after vectorization. An
1322
  /// instruction is scalar if it is known to be uniform or will be scalarized
1323
  /// during vectorization. Non-uniform scalarized instructions will be
1324
  /// represented by VF values in the vectorized loop, each corresponding to an
1325
  /// iteration of the original scalar loop.
1326
  void collectLoopScalars(unsigned VF);
1327
1328
  /// Keeps cost model vectorization decision and cost for instructions.
1329
  /// Right now it is used for memory instructions only.
1330
  using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1331
                                std::pair<InstWidening, unsigned>>;
1332
1333
  DecisionList WideningDecisions;
1334
1335
  /// Returns true if \p V is expected to be vectorized and it needs to be
1336
  /// extracted.
1337
30.3k
  bool needsExtract(Value *V, unsigned VF) const {
1338
30.3k
    Instruction *I = dyn_cast<Instruction>(V);
1339
30.3k
    if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1340
2.51k
      return false;
1341
27.8k
1342
27.8k
    // Assume we can vectorize V (and hence we need extraction) if the
1343
27.8k
    // scalars are not computed yet. This can happen, because it is called
1344
27.8k
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
1345
27.8k
    // the scalars are collected. That should be a safe assumption in most
1346
27.8k
    // cases, because we check if the operands have vectorizable types
1347
27.8k
    // beforehand in LoopVectorizationLegality.
1348
27.8k
    return Scalars.find(VF) == Scalars.end() ||
1349
27.8k
           !isScalarAfterVectorization(I, VF);
1350
27.8k
  };
1351
1352
  /// Returns a range containing only operands needing to be extracted.
1353
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1354
18.1k
                                                   unsigned VF) {
1355
18.1k
    return SmallVector<Value *, 4>(make_filter_range(
1356
29.6k
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1357
18.1k
  }
1358
1359
public:
1360
  /// The loop that we evaluate.
1361
  Loop *TheLoop;
1362
1363
  /// Predicated scalar evolution analysis.
1364
  PredicatedScalarEvolution &PSE;
1365
1366
  /// Loop Info analysis.
1367
  LoopInfo *LI;
1368
1369
  /// Vectorization legality.
1370
  LoopVectorizationLegality *Legal;
1371
1372
  /// Vector target information.
1373
  const TargetTransformInfo &TTI;
1374
1375
  /// Target Library Info.
1376
  const TargetLibraryInfo *TLI;
1377
1378
  /// Demanded bits analysis.
1379
  DemandedBits *DB;
1380
1381
  /// Assumption cache.
1382
  AssumptionCache *AC;
1383
1384
  /// Interface to emit optimization remarks.
1385
  OptimizationRemarkEmitter *ORE;
1386
1387
  const Function *TheFunction;
1388
1389
  /// Loop Vectorize Hint.
1390
  const LoopVectorizeHints *Hints;
1391
1392
  /// The interleave access information contains groups of interleaved accesses
1393
  /// with the same stride and close to each other.
1394
  InterleavedAccessInfo &InterleaveInfo;
1395
1396
  /// Values to ignore in the cost model.
1397
  SmallPtrSet<const Value *, 16> ValuesToIgnore;
1398
1399
  /// Values to ignore in the cost model when VF > 1.
1400
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1401
};
1402
1403
} // end namespace llvm
1404
1405
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
1406
// vectorization. The loop needs to be annotated with #pragma omp simd
1407
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1408
// vector length information is not provided, vectorization is not considered
1409
// explicit. Interleave hints are not allowed either. These limitations will be
1410
// relaxed in the future.
1411
// Please, note that we are currently forced to abuse the pragma 'clang
1412
// vectorize' semantics. This pragma provides *auto-vectorization hints*
1413
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1414
// provides *explicit vectorization hints* (LV can bypass legal checks and
1415
// assume that vectorization is legal). However, both hints are implemented
1416
// using the same metadata (llvm.loop.vectorize, processed by
1417
// LoopVectorizeHints). This will be fixed in the future when the native IR
1418
// representation for pragma 'omp simd' is introduced.
1419
static bool isExplicitVecOuterLoop(Loop *OuterLp,
1420
7
                                   OptimizationRemarkEmitter *ORE) {
1421
7
  assert(!OuterLp->empty() && "This is not an outer loop");
1422
7
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1423
7
1424
7
  // Only outer loops with an explicit vectorization hint are supported.
1425
7
  // Unannotated outer loops are ignored.
1426
7
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1427
0
    return false;
1428
7
1429
7
  Function *Fn = OuterLp->getHeader()->getParent();
1430
7
  if (!Hints.allowVectorization(Fn, OuterLp,
1431
7
                                true /*VectorizeOnlyWhenForced*/)) {
1432
0
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1433
0
    return false;
1434
0
  }
1435
7
1436
7
  if (Hints.getInterleave() > 1) {
1437
0
    // TODO: Interleave support is future work.
1438
0
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1439
0
                         "outer loops.\n");
1440
0
    Hints.emitRemarkWithHints();
1441
0
    return false;
1442
0
  }
1443
7
1444
7
  return true;
1445
7
}
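For reference, a hypothetical outer loop annotated the way isExplicitVecOuterLoop expects, using the clang loop pragma with an explicit width (spelled `#pragma clang loop vectorize(enable) vectorize_width(4)`); a `#pragma omp simd simdlen(4)` annotation is the other accepted form.

// Hypothetical example of an explicitly annotated outer loop.
void scaleRows(float **A, int N, int M) {
#pragma clang loop vectorize(enable) vectorize_width(4)
  for (int i = 0; i < N; ++i)      // outer loop: candidate for the VPlan-native path
    for (int j = 0; j < M; ++j)
      A[i][j] *= 2.0f;
}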
1446
1447
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1448
                                  OptimizationRemarkEmitter *ORE,
1449
179k
                                  SmallVectorImpl<Loop *> &V) {
1450
179k
  // Collect inner loops and outer loops without irreducible control flow. For
1451
179k
  // now, only collect outer loops that have explicit vectorization hints. If we
1452
179k
  // are stress testing the VPlan H-CFG construction, we collect the outermost
1453
179k
  // loop of every loop nest.
1454
179k
  if (L.empty() || VPlanBuildStressTest ||
1455
179k
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1456
146k
    LoopBlocksRPO RPOT(&L);
1457
146k
    RPOT.perform(LI);
1458
146k
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1459
146k
      V.push_back(&L);
1460
146k
      // TODO: Collect inner loops inside marked outer loops in case
1461
146k
      // vectorization fails for the outer loop. Do not invoke
1462
146k
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1463
146k
      // already known to be reducible. We can use an inherited attribute for
1464
146k
      // that.
1465
146k
      return;
1466
146k
    }
1467
32.7k
  }
1468
32.7k
  for (Loop *InnerL : L)
1469
49.7k
    collectSupportedLoops(*InnerL, LI, ORE, V);
1470
32.7k
}
1471
1472
namespace {
1473
1474
/// The LoopVectorize Pass.
1475
struct LoopVectorize : public FunctionPass {
1476
  /// Pass identification, replacement for typeid
1477
  static char ID;
1478
1479
  LoopVectorizePass Impl;
1480
1481
  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1482
                         bool VectorizeOnlyWhenForced = false)
1483
13.7k
      : FunctionPass(ID) {
1484
13.7k
    Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1485
13.7k
    Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1486
13.7k
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1487
13.7k
  }
1488
1489
279k
  bool runOnFunction(Function &F) override {
1490
279k
    if (skipFunction(F))
1491
44
      return false;
1492
279k
1493
279k
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1494
279k
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1495
279k
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1496
279k
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1497
279k
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1498
279k
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1499
279k
    auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1500
279k
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1501
279k
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1502
279k
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1503
279k
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1504
279k
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1505
279k
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1506
279k
1507
279k
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1508
279k
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1509
279k
1510
279k
    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1511
279k
                        GetLAA, *ORE, PSI);
1512
279k
  }
1513
1514
13.7k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
1515
13.7k
    AU.addRequired<AssumptionCacheTracker>();
1516
13.7k
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
1517
13.7k
    AU.addRequired<DominatorTreeWrapperPass>();
1518
13.7k
    AU.addRequired<LoopInfoWrapperPass>();
1519
13.7k
    AU.addRequired<ScalarEvolutionWrapperPass>();
1520
13.7k
    AU.addRequired<TargetTransformInfoWrapperPass>();
1521
13.7k
    AU.addRequired<AAResultsWrapperPass>();
1522
13.7k
    AU.addRequired<LoopAccessLegacyAnalysis>();
1523
13.7k
    AU.addRequired<DemandedBitsWrapperPass>();
1524
13.7k
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1525
13.7k
1526
13.7k
    // We currently do not preserve loopinfo/dominator analyses with outer loop
1527
13.7k
    // vectorization. Until this is addressed, mark these analyses as preserved
1528
13.7k
    // only for non-VPlan-native path.
1529
13.7k
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1530
13.7k
    if (!EnableVPlanNativePath) {
1531
13.7k
      AU.addPreserved<LoopInfoWrapperPass>();
1532
13.7k
      AU.addPreserved<DominatorTreeWrapperPass>();
1533
13.7k
    }
1534
13.7k
1535
13.7k
    AU.addPreserved<BasicAAWrapperPass>();
1536
13.7k
    AU.addPreserved<GlobalsAAWrapperPass>();
1537
13.7k
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
1538
13.7k
  }
1539
};
1540
1541
} // end anonymous namespace
1542
1543
//===----------------------------------------------------------------------===//
1544
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1545
// LoopVectorizationCostModel and LoopVectorizationPlanner.
1546
//===----------------------------------------------------------------------===//
1547
1548
52.8k
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1549
52.8k
  // We need to place the broadcast of invariant variables outside the loop,
1550
52.8k
  // but only if it's proven safe to do so. Else, broadcast will be inside
1551
52.8k
  // vector loop body.
1552
52.8k
  Instruction *Instr = dyn_cast<Instruction>(V);
1553
52.8k
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1554
52.8k
                     (!Instr ||
1555
52.8k
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1556
52.8k
  // Place the code for broadcasting invariant variables in the new preheader.
1557
52.8k
  IRBuilder<>::InsertPointGuard Guard(Builder);
1558
52.8k
  if (SafeToHoist)
1559
46.6k
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1560
52.8k
1561
52.8k
  // Broadcast the scalar into all locations in the vector.
1562
52.8k
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1563
52.8k
1564
52.8k
  return Shuf;
1565
52.8k
}
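A minimal scalar model of the broadcast, assuming VF = 4 (hypothetical helper): every lane of the resulting vector holds the same scalar, which is why the splat can be hoisted to the preheader when the scalar is loop-invariant.

#include <array>

// Hypothetical model of a vector splat for VF = 4.
std::array<int, 4> broadcast(int V) {
  std::array<int, 4> Splat;
  Splat.fill(V);
  return Splat;
}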
1566
1567
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1568
11.2k
    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1569
11.2k
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1570
11.2k
         "Expected either an induction phi-node or a truncate of it!");
1571
11.2k
  Value *Start = II.getStartValue();
1572
11.2k
1573
11.2k
  // Construct the initial value of the vector IV in the vector loop preheader
1574
11.2k
  auto CurrIP = Builder.saveIP();
1575
11.2k
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1576
11.2k
  if (isa<TruncInst>(EntryVal)) {
1577
2.39k
    assert(Start->getType()->isIntegerTy() &&
1578
2.39k
           "Truncation requires an integer type");
1579
2.39k
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
1580
2.39k
    Step = Builder.CreateTrunc(Step, TruncType);
1581
2.39k
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1582
2.39k
  }
1583
11.2k
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1584
11.2k
  Value *SteppedStart =
1585
11.2k
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1586
11.2k
1587
11.2k
  // We create vector phi nodes for both integer and floating-point induction
1588
11.2k
  // variables. Here, we determine the kind of arithmetic we will perform.
1589
11.2k
  Instruction::BinaryOps AddOp;
1590
11.2k
  Instruction::BinaryOps MulOp;
1591
11.2k
  if (Step->getType()->isIntegerTy()) {
1592
11.1k
    AddOp = Instruction::Add;
1593
11.1k
    MulOp = Instruction::Mul;
1594
11.1k
  } else {
1595
20
    AddOp = II.getInductionOpcode();
1596
20
    MulOp = Instruction::FMul;
1597
20
  }
1598
11.2k
1599
11.2k
  // Multiply the vectorization factor by the step using integer or
1600
11.2k
  // floating-point arithmetic as appropriate.
1601
11.2k
  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1602
11.2k
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1603
11.2k
1604
11.2k
  // Create a vector splat to use in the induction update.
1605
11.2k
  //
1606
11.2k
  // FIXME: If the step is non-constant, we create the vector splat with
1607
11.2k
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1608
11.2k
  //        handle a constant vector splat.
1609
11.2k
  Value *SplatVF = isa<Constant>(Mul)
1610
11.2k
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1611
11.2k
                       : Builder.CreateVectorSplat(VF, Mul);
1612
11.2k
  Builder.restoreIP(CurrIP);
1613
11.2k
1614
11.2k
  // We may need to add the step a number of times, depending on the unroll
1615
11.2k
  // factor. The last of those goes into the PHI.
1616
11.2k
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1617
11.2k
                                    &*LoopVectorBody->getFirstInsertionPt());
1618
11.2k
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
1619
11.2k
  Instruction *LastInduction = VecInd;
1620
32.9k
  for (unsigned Part = 0; Part < UF; ++Part) {
1621
21.7k
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1622
21.7k
1623
21.7k
    if (isa<TruncInst>(EntryVal))
1624
4.46k
      addMetadata(LastInduction, EntryVal);
1625
21.7k
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1626
21.7k
1627
21.7k
    LastInduction = cast<Instruction>(addFastMathFlag(
1628
21.7k
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1629
21.7k
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1630
21.7k
  }
1631
11.2k
1632
11.2k
  // Move the last step to the end of the latch block. This ensures consistent
1633
11.2k
  // placement of all induction updates.
1634
11.2k
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1635
11.2k
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1636
11.2k
  auto *ICmp = cast<Instruction>(Br->getCondition());
1637
11.2k
  LastInduction->moveBefore(ICmp);
1638
11.2k
  LastInduction->setName("vec.ind.next");
1639
11.2k
1640
11.2k
  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1641
11.2k
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
1642
11.2k
}
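A scalar model of the values this PHI takes, assuming VF = 4 and UF = 1 (hypothetical helpers): the initial value is the "stepped start" Start + Lane * Step, and each vector iteration adds VF * Step to every lane (the "step.add").

#include <array>

// Hypothetical model of the vector induction PHI for VF = 4.
std::array<int, 4> steppedStart(int Start, int Step) {
  std::array<int, 4> V;
  for (int Lane = 0; Lane < 4; ++Lane)
    V[Lane] = Start + Lane * Step;   // SplatStart combined with the step vector
  return V;
}

std::array<int, 4> stepAdd(std::array<int, 4> V, int Step) {
  for (int &Lane : V)
    Lane += 4 * Step;                // SplatVF = VF * Step added to each lane
  return V;
}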
1643
1644
48.8k
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1645
48.8k
  return Cost->isScalarAfterVectorization(I, VF) ||
1646
48.8k
         
Cost->isProfitableToScalarize(I, VF)27.8k
;
1647
48.8k
}
1648
1649
17.4k
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1650
17.4k
  if (shouldScalarizeInstruction(IV))
1651
6.20k
    return true;
1652
13.9k
  auto isScalarInst = [&](User *U) -> bool {
1653
13.9k
    auto *I = cast<Instruction>(U);
1654
13.9k
    return (OrigLoop->contains(I) && 
shouldScalarizeInstruction(I)13.9k
);
1655
13.9k
  };
1656
11.2k
  return llvm::any_of(IV->users(), isScalarInst);
1657
11.2k
}
1658
1659
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1660
    const InductionDescriptor &ID, const Instruction *EntryVal,
1661
96.1k
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1662
96.1k
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1663
96.1k
         "Expected either an induction phi-node or a truncate of it!");
1664
96.1k
1665
96.1k
  // This induction variable is not the phi from the original loop but the
1666
96.1k
  // newly-created IV based on the proof that casted Phi is equal to the
1667
96.1k
  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1668
96.1k
  // re-uses the same InductionDescriptor that original IV uses but we don't
1669
96.1k
  // have to do any recording in this case - that is done when original IV is
1670
96.1k
  // processed.
1671
96.1k
  if (isa<TruncInst>(EntryVal))
1672
4.81k
    return;
1673
91.3k
1674
91.3k
  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1675
91.3k
  if (Casts.empty())
1676
91.3k
    return;
1677
24
  // Only the first Cast instruction in the Casts vector is of interest.
1678
24
  // The rest of the Casts (if exist) have no uses outside the
1679
24
  // induction update chain itself.
1680
24
  Instruction *CastInst = *Casts.begin();
1681
24
  if (Lane < UINT_MAX)
1682
24
    VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1683
20
  else
1684
20
    VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1685
24
}
1686
1687
19.2k
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1688
19.2k
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1689
19.2k
         "Primary induction variable must have an integer type");
1690
19.2k
1691
19.2k
  auto II = Legal->getInductionVars()->find(IV);
1692
19.2k
  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1693
19.2k
1694
19.2k
  auto ID = II->second;
1695
19.2k
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1696
19.2k
1697
19.2k
  // The scalar value to broadcast. This will be derived from the canonical
1698
19.2k
  // induction variable.
1699
19.2k
  Value *ScalarIV = nullptr;
1700
19.2k
1701
19.2k
  // The value from the original loop to which we are mapping the new induction
1702
19.2k
  // variable.
1703
19.2k
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1704
19.2k
1705
19.2k
  // True if we have vectorized the induction variable.
1706
19.2k
  auto VectorizedIV = false;
1707
19.2k
1708
19.2k
  // Determine if we want a scalar version of the induction variable. This is
1709
19.2k
  // true if the induction variable itself is not widened, or if it has at
1710
19.2k
  // least one user in the loop that is not widened.
1711
19.2k
  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1712
19.2k
1713
19.2k
  // Generate code for the induction step. Note that induction steps are
1714
19.2k
  // required to be loop-invariant
1715
19.2k
  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1716
19.2k
         "Induction step should be loop invariant");
1717
19.2k
  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1718
19.2k
  Value *Step = nullptr;
1719
19.2k
  if (PSE.getSE()->isSCEVable(IV->getType())) {
1720
19.2k
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1721
19.2k
    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1722
19.2k
                             LoopVectorPreHeader->getTerminator());
1723
19.2k
  } else {
1724
26
    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1725
26
  }
1726
19.2k
1727
19.2k
  // Try to create a new independent vector induction variable. If we can't
1728
19.2k
  // create the phi node, we will splat the scalar induction variable in each
1729
19.2k
  // loop iteration.
1730
19.2k
  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1731
11.2k
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1732
11.2k
    VectorizedIV = true;
1733
11.2k
  }
1734
19.2k
1735
19.2k
  // If we haven't yet vectorized the induction variable, or if we will create
1736
19.2k
  // a scalar one, we need to define the scalar induction variable and step
1737
19.2k
  // values. If we were given a truncation type, truncate the canonical
1738
19.2k
  // induction variable and step. Otherwise, derive these values from the
1739
19.2k
  // induction descriptor.
1740
19.2k
  if (!VectorizedIV || NeedsScalarIV) {
1741
16.6k
    ScalarIV = Induction;
1742
16.6k
    if (IV != OldInduction) {
1743
3.31k
      ScalarIV = IV->getType()->isIntegerTy()
1744
3.31k
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1745
3.31k
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
1746
9
                                          IV->getType());
1747
3.31k
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1748
3.31k
      ScalarIV->setName("offset.idx");
1749
3.31k
    }
1750
16.6k
    if (Trunc) {
1751
118
      auto *TruncType = cast<IntegerType>(Trunc->getType());
1752
118
      assert(Step->getType()->isIntegerTy() &&
1753
118
             "Truncation requires an integer step");
1754
118
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1755
118
      Step = Builder.CreateTrunc(Step, TruncType);
1756
118
    }
1757
16.6k
  }
1758
19.2k
1759
19.2k
  // If we haven't yet vectorized the induction variable, splat the scalar
1760
19.2k
  // induction variable, and build the necessary step vectors.
1761
19.2k
  // TODO: Don't do it unless the vectorized IV is really required.
1762
19.2k
  if (!VectorizedIV) {
1763
8.01k
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1764
23.1k
    for (unsigned Part = 0; Part < UF; ++Part) {
1765
15.1k
      Value *EntryPart =
1766
15.1k
          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1767
15.1k
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1768
15.1k
      if (Trunc)
1769
199
        addMetadata(EntryPart, Trunc);
1770
15.1k
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1771
15.1k
    }
1772
8.01k
  }
1773
19.2k
1774
19.2k
  // If an induction variable is only used for counting loop iterations or
1775
19.2k
  // calculating addresses, it doesn't need to be widened. Create scalar steps
1776
19.2k
  // that can be used by instructions we will later scalarize. Note that the
1777
19.2k
  // addition of the scalar steps will not increase the number of instructions
1778
19.2k
  // in the loop in the common case prior to InstCombine. We will be trading
1779
19.2k
  // one vector extract for each scalar step.
1780
19.2k
  if (NeedsScalarIV)
1781
14.8k
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1782
19.2k
}
1783
1784
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1785
22.7k
                                          Instruction::BinaryOps BinOp) {
1786
22.7k
  // Create and check the types.
1787
22.7k
  assert(Val->getType()->isVectorTy() && "Must be a vector");
1788
22.7k
  int VLen = Val->getType()->getVectorNumElements();
1789
22.7k
1790
22.7k
  Type *STy = Val->getType()->getScalarType();
1791
22.7k
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1792
22.7k
         "Induction Step must be an integer or FP");
1793
22.7k
  assert(Step->getType() == STy && "Step has wrong type");
1794
22.7k
1795
22.7k
  SmallVector<Constant *, 8> Indices;
1796
22.7k
1797
22.7k
  if (STy->isIntegerTy()) {
1798
22.7k
    // Create a vector of consecutive numbers from zero to VF.
1799
111k
    for (int i = 0; i < VLen; ++i)
1800
89.0k
      Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1801
22.7k
1802
22.7k
    // Add the consecutive indices to the vector value.
1803
22.7k
    Constant *Cv = ConstantVector::get(Indices);
1804
22.7k
    assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1805
22.7k
    Step = Builder.CreateVectorSplat(VLen, Step);
1806
22.7k
    assert(Step->getType() == Val->getType() && "Invalid step vec");
1807
22.7k
    // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1808
22.7k
    // which can be found from the original scalar operations.
1809
22.7k
    Step = Builder.CreateMul(Cv, Step);
1810
22.7k
    return Builder.CreateAdd(Val, Step, "induction");
1811
22.7k
  }
1812
20
1813
20
  // Floating point induction.
1814
20
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1815
20
         "Binary Opcode should be specified for FP induction");
1816
20
  // Create a vector of consecutive numbers from zero to VF.
1817
92
  for (int i = 0; i < VLen; ++i)
1818
72
    Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1819
20
1820
20
  // Add the consecutive indices to the vector value.
1821
20
  Constant *Cv = ConstantVector::get(Indices);
1822
20
1823
20
  Step = Builder.CreateVectorSplat(VLen, Step);
1824
20
1825
20
  // Floating point operations had to be 'fast' to enable the induction.
1826
20
  FastMathFlags Flags;
1827
20
  Flags.setFast();
1828
20
1829
20
  Value *MulOp = Builder.CreateFMul(Cv, Step);
1830
20
  if (isa<Instruction>(MulOp))
1831
6
    // Have to check, MulOp may be a constant
1832
6
    cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1833
20
1834
20
  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1835
20
  if (isa<Instruction>(BOp))
1836
9
    cast<Instruction>(BOp)->setFastMathFlags(Flags);
1837
20
  return BOp;
1838
20
}
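In scalar terms (hypothetical helper, integer case only), lane i of the result is Val[i] + (StartIdx + i) * Step, matching the Cv * Step + Val sequence built above.

#include <vector>

// Hypothetical scalar model of getStepVector for integer inductions.
std::vector<int> stepVector(const std::vector<int> &Val, int StartIdx, int Step) {
  std::vector<int> R(Val.size());
  for (size_t i = 0; i < Val.size(); ++i)
    R[i] = Val[i] + (StartIdx + int(i)) * Step;
  return R;
}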
1839
1840
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1841
                                           Instruction *EntryVal,
1842
14.8k
                                           const InductionDescriptor &ID) {
1843
14.8k
  // We shouldn't have to build scalar steps if we aren't vectorizing.
1844
14.8k
  assert(VF > 1 && "VF should be greater than one");
1845
14.8k
1846
14.8k
  // Get the value type and ensure it and the step have the same integer type.
1847
14.8k
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1848
14.8k
  assert(ScalarIVTy == Step->getType() &&
1849
14.8k
         "Val and Step should have the same type");
1850
14.8k
1851
14.8k
  // We build scalar steps for both integer and floating-point induction
1852
14.8k
  // variables. Here, we determine the kind of arithmetic we will perform.
1853
14.8k
  Instruction::BinaryOps AddOp;
1854
14.8k
  Instruction::BinaryOps MulOp;
1855
14.8k
  if (ScalarIVTy->isIntegerTy()) {
1856
14.8k
    AddOp = Instruction::Add;
1857
14.8k
    MulOp = Instruction::Mul;
1858
14.8k
  } else {
1859
3
    AddOp = ID.getInductionOpcode();
1860
3
    MulOp = Instruction::FMul;
1861
3
  }
1862
14.8k
1863
14.8k
  // Determine the number of scalars we need to generate for each unroll
1864
14.8k
  // iteration. If EntryVal is uniform, we only need to generate the first
1865
14.8k
  // lane. Otherwise, we generate all VF values.
1866
14.8k
  unsigned Lanes =
1867
14.8k
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1868
14.8k
                                                                         : VF;
1869
14.8k
  // Compute the scalar steps and save the results in VectorLoopValueMap.
1870
43.2k
  for (unsigned Part = 0; Part < UF; ++Part) {
1871
87.7k
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1872
59.2k
      auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1873
59.2k
      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1874
59.2k
      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1875
59.2k
      VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1876
59.2k
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1877
59.2k
    }
1878
28.4k
  }
1879
14.8k
}
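The scalar value produced above for a given unroll part and lane reduces to ScalarIV + (VF * Part + Lane) * Step; a hypothetical one-liner makes that explicit. For VF = 4 and UF = 2 this yields offsets 0..3 for part 0 and 4..7 for part 1.

// Hypothetical model of one scalar step (integer case).
int scalarStep(int ScalarIV, int Step, unsigned VF, unsigned Part, unsigned Lane) {
  return ScalarIV + int(VF * Part + Lane) * Step;
}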
1880
1881
197k
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1882
197k
  assert(V != Induction && "The new induction variable should not be used.");
1883
197k
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1884
197k
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1885
197k
1886
197k
  // If we have a stride that is replaced by one, do it here. Defer this for
1887
197k
  // the VPlan-native path until we start running Legal checks in that path.
1888
197k
  if (!EnableVPlanNativePath && Legal->hasStride(V))
1889
0
    V = ConstantInt::get(V->getType(), 1);
1890
197k
1891
197k
  // If we have a vector mapped to this value, return it.
1892
197k
  if (VectorLoopValueMap.hasVectorValue(V, Part))
1893
149k
    return VectorLoopValueMap.getVectorValue(V, Part);
1894
48.1k
1895
48.1k
  // If the value has not been vectorized, check if it has been scalarized
1896
48.1k
  // instead. If it has been scalarized, and we actually need the value in
1897
48.1k
  // vector form, we will construct the vector values on demand.
1898
48.1k
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1899
1.49k
    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1900
1.49k
1901
1.49k
    // If we've scalarized a value, that value should be an instruction.
1902
1.49k
    auto *I = cast<Instruction>(V);
1903
1.49k
1904
1.49k
    // If we aren't vectorizing, we can just copy the scalar map values over to
1905
1.49k
    // the vector map.
1906
1.49k
    if (VF == 1) {
1907
980
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1908
980
      return ScalarValue;
1909
980
    }
1910
519
1911
519
    // Get the last scalar instruction we generated for V and Part. If the value
1912
519
    // is known to be uniform after vectorization, this corresponds to lane zero
1913
519
    // of the Part unroll iteration. Otherwise, the last instruction is the one
1914
519
    // we created for the last vector lane of the Part unroll iteration.
1915
519
    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1916
519
    auto *LastInst = cast<Instruction>(
1917
519
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1918
519
1919
519
    // Set the insert point after the last scalarized instruction. This ensures
1920
519
    // the insertelement sequence will directly follow the scalar definitions.
1921
519
    auto OldIP = Builder.saveIP();
1922
519
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
1923
519
    Builder.SetInsertPoint(&*NewIP);
1924
519
1925
519
    // However, if we are vectorizing, we need to construct the vector values.
1926
519
    // If the value is known to be uniform after vectorization, we can just
1927
519
    // broadcast the scalar value corresponding to lane zero for each unroll
1928
519
    // iteration. Otherwise, we construct the vector values using insertelement
1929
519
    // instructions. Since the resulting vectors are stored in
1930
519
    // VectorLoopValueMap, we will only generate the insertelements once.
1931
519
    Value *VectorValue = nullptr;
1932
519
    if (Cost->isUniformAfterVectorization(I, VF)) {
1933
0
      VectorValue = getBroadcastInstrs(ScalarValue);
1934
0
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
1935
519
    } else {
1936
519
      // Initialize packing with insertelements to start from undef.
1937
519
      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
1938
519
      VectorLoopValueMap.setVectorValue(V, Part, Undef);
1939
2.27k
      for (unsigned Lane = 0; Lane < VF; ++Lane)
1940
1.75k
        packScalarIntoVectorValue(V, {Part, Lane});
1941
519
      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
1942
519
    }
1943
519
    Builder.restoreIP(OldIP);
1944
519
    return VectorValue;
1945
519
  }
1946
46.6k
1947
46.6k
  // If this scalar is unknown, assume that it is a constant or that it is
1948
46.6k
  // loop invariant. Broadcast V and save the value for future uses.
1949
46.6k
  Value *B = getBroadcastInstrs(V);
1950
46.6k
  VectorLoopValueMap.setVectorValue(V, Part, B);
1951
46.6k
  return B;
1952
46.6k
}
1953
1954
Value *
1955
InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
1956
189k
                                            const VPIteration &Instance) {
1957
189k
  // If the value is not an instruction contained in the loop, it should
1958
189k
  // already be scalar.
1959
189k
  if (OrigLoop->isLoopInvariant(V))
1960
91.7k
    return V;
1961
98.1k
1962
98.1k
  assert(Instance.Lane > 0
1963
98.1k
             ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
1964
98.1k
             : true && "Uniform values only have lane zero");
1965
98.1k
1966
98.1k
  // If the value from the original loop has not been vectorized, it is
1967
98.1k
  // represented by UF x VF scalar values in the new loop. Return the requested
1968
98.1k
  // scalar value.
1969
98.1k
  if (VectorLoopValueMap.hasScalarValue(V, Instance))
1970
92.2k
    return VectorLoopValueMap.getScalarValue(V, Instance);
1971
5.90k
1972
5.90k
  // If the value has not been scalarized, get its entry in VectorLoopValueMap
1973
5.90k
  // for the given unroll part. If this entry is not a vector type (i.e., the
1974
5.90k
  // vectorization factor is one), there is no need to generate an
1975
5.90k
  // extractelement instruction.
1976
5.90k
  auto *U = getOrCreateVectorValue(V, Instance.Part);
1977
5.90k
  if (!U->getType()->isVectorTy()) {
1978
4.27k
    assert(VF == 1 && "Value not scalarized has non-vector type");
1979
4.27k
    return U;
1980
4.27k
  }
1981
1.63k
1982
1.63k
  // Otherwise, the value from the original loop has been vectorized and is
1983
1.63k
  // represented by UF vector values. Extract and return the requested scalar
1984
1.63k
  // value from the appropriate vector lane.
1985
1.63k
  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
1986
1.63k
}
1987
1988
void InnerLoopVectorizer::packScalarIntoVectorValue(
1989
1.93k
    Value *V, const VPIteration &Instance) {
1990
1.93k
  assert(V != Induction && "The new induction variable should not be used.");
1991
1.93k
  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
1992
1.93k
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1993
1.93k
1994
1.93k
  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
1995
1.93k
  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
1996
1.93k
  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
1997
1.93k
                                            Builder.getInt32(Instance.Lane));
1998
1.93k
  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
1999
1.93k
}
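A scalar model of the packing done above, assuming VF = 4 (hypothetical helper): the vector value starts from an undef placeholder and each lane's scalar is inserted at its lane index.

#include <array>

// Hypothetical model of packing VF scalars into one vector value.
std::array<int, 4> packScalars(const std::array<int, 4> &ScalarPerLane) {
  std::array<int, 4> Packed{};          // stands in for the initial undef vector
  for (int Lane = 0; Lane < 4; ++Lane)
    Packed[Lane] = ScalarPerLane[Lane]; // one insertelement per lane
  return Packed;
}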
2000
2001
1.24k
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2002
1.24k
  assert(Vec->getType()->isVectorTy() && "Invalid type");
2003
1.24k
  SmallVector<Constant *, 8> ShuffleMask;
2004
14.3k
  for (unsigned i = 0; i < VF; ++i)
2005
13.1k
    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2006
1.24k
2007
1.24k
  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2008
1.24k
                                     ConstantVector::get(ShuffleMask),
2009
1.24k
                                     "reverse");
2010
1.24k
}
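The shuffle mask built here is simply VF-1, VF-2, ..., 0; a hypothetical helper computes it for illustration (for VF = 4 the mask is <3, 2, 1, 0>).

#include <vector>

// Hypothetical model of the reverse shuffle mask.
std::vector<int> reverseMask(unsigned VF) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < VF; ++i)
    Mask.push_back(int(VF - i - 1));
  return Mask;
}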
2011
2012
// Return whether we allow using masked interleave-groups (for dealing with
2013
// strided loads/stores that reside in predicated blocks, or for dealing
2014
// with gaps).
2015
19.6k
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2016
19.6k
  // If an override option has been passed in for interleaved accesses, use it.
2017
19.6k
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2018
24
    return EnableMaskedInterleavedMemAccesses;
2019
19.5k
2020
19.5k
  return TTI.enableMaskedInterleavedAccessVectorization();
2021
19.5k
}
2022
2023
// Try to vectorize the interleave group that \p Instr belongs to.
2024
//
2025
// E.g. Translate following interleaved load group (factor = 3):
2026
//   for (i = 0; i < N; i+=3) {
2027
//     R = Pic[i];             // Member of index 0
2028
//     G = Pic[i+1];           // Member of index 1
2029
//     B = Pic[i+2];           // Member of index 2
2030
//     ... // do something to R, G, B
2031
//   }
2032
// To:
2033
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2034
//   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2035
//   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2036
//   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2037
//
2038
// Or translate following interleaved store group (factor = 3):
2039
//   for (i = 0; i < N; i+=3) {
2040
//     ... do something to R, G, B
2041
//     Pic[i]   = R;           // Member of index 0
2042
//     Pic[i+1] = G;           // Member of index 1
2043
//     Pic[i+2] = B;           // Member of index 2
2044
//   }
2045
// To:
2046
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2047
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2048
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2049
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2050
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2051
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2052
669
                                                   VectorParts *BlockInMask) {
2053
669
  const InterleaveGroup<Instruction> *Group =
2054
669
      Cost->getInterleavedAccessGroup(Instr);
2055
669
  assert(Group && "Fail to get an interleaved access group.");
2056
669
2057
669
  // Skip if current instruction is not the insert position.
2058
669
  if (Instr != Group->getInsertPos())
2059
0
    return;
2060
669
2061
669
  const DataLayout &DL = Instr->getModule()->getDataLayout();
2062
669
  Value *Ptr = getLoadStorePointerOperand(Instr);
2063
669
2064
669
  // Prepare for the vector type of the interleaved load/store.
2065
669
  Type *ScalarTy = getMemInstValueType(Instr);
2066
669
  unsigned InterleaveFactor = Group->getFactor();
2067
669
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2068
669
  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2069
669
2070
669
  // Prepare for the new pointers.
2071
669
  setDebugLocFromInst(Builder, Ptr);
2072
669
  SmallVector<Value *, 2> NewPtrs;
2073
669
  unsigned Index = Group->getIndex(Instr);
2074
669
2075
669
  VectorParts Mask;
2076
669
  bool IsMaskForCondRequired = BlockInMask;
2077
669
  if (IsMaskForCondRequired) {
2078
11
    Mask = *BlockInMask;
2079
11
    // TODO: extend the masked interleaved-group support to reversed access.
2080
11
    assert(!Group->isReverse() && "Reversed masked interleave-group "
2081
11
                                  "not supported.");
2082
11
  }
2083
669
2084
669
  // If the group is reverse, adjust the index to refer to the last vector lane
2085
669
  // instead of the first. We adjust the index from the first vector lane,
2086
669
  // rather than directly getting the pointer for lane VF - 1, because the
2087
669
  // pointer operand of the interleaved access is supposed to be uniform. For
2088
669
  // uniform instructions, we're only required to generate a value for the
2089
669
  // first vector lane in each unroll iteration.
2090
669
  if (Group->isReverse())
2091
4
    Index += (VF - 1) * Group->getFactor();
2092
669
2093
669
  bool InBounds = false;
2094
669
  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2095
567
    InBounds = gep->isInBounds();
2096
669
2097
1.76k
  for (unsigned Part = 0; Part < UF; Part++) {
2098
1.09k
    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2099
1.09k
2100
1.09k
    // Notice current instruction could be any index. Need to adjust the address
2101
1.09k
    // to the member of index 0.
2102
1.09k
    //
2103
1.09k
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2104
1.09k
    //       b = A[i];       // Member of index 0
2105
1.09k
    // Current pointer is pointed to A[i+1], adjust it to A[i].
2106
1.09k
    //
2107
1.09k
    // E.g.  A[i+1] = a;     // Member of index 1
2108
1.09k
    //       A[i]   = b;     // Member of index 0
2109
1.09k
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
2110
1.09k
    // Current pointer is pointed to A[i+2], adjust it to A[i].
2111
1.09k
    NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2112
1.09k
    if (InBounds)
2113
860
      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2114
1.09k
2115
1.09k
    // Cast to the vector pointer type.
2116
1.09k
    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2117
1.09k
  }
2118
669
2119
669
  setDebugLocFromInst(Builder, Instr);
2120
669
  Value *UndefVec = UndefValue::get(VecTy);
2121
669
2122
669
  Value *MaskForGaps = nullptr;
2123
669
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2124
5
    MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2125
5
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
2126
5
  }
2127
669
2128
669
  // Vectorize the interleaved load group.
2129
669
  if (isa<LoadInst>(Instr)) {
2130
324
    // For each unroll part, create a wide load for the group.
2131
324
    SmallVector<Value *, 2> NewLoads;
2132
841
    for (unsigned Part = 0; Part < UF; Part++) {
2133
517
      Instruction *NewLoad;
2134
517
      if (IsMaskForCondRequired || MaskForGaps) {
2135
9
        assert(useMaskedInterleavedAccesses(*TTI) &&
2136
9
               "masked interleaved groups are not allowed.");
2137
9
        Value *GroupMask = MaskForGaps;
2138
9
        if (IsMaskForCondRequired) {
2139
8
          auto *Undefs = UndefValue::get(Mask[Part]->getType());
2140
8
          auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2141
8
          Value *ShuffledMask = Builder.CreateShuffleVector(
2142
8
              Mask[Part], Undefs, RepMask, "interleaved.mask");
2143
8
          GroupMask = MaskForGaps
2144
8
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2145
4
                                                MaskForGaps)
2146
8
                          : ShuffledMask;
2147
8
        }
2148
9
        NewLoad =
2149
9
            Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2150
9
                                     GroupMask, UndefVec, "wide.masked.vec");
2151
9
      }
2152
508
      else
2153
508
        NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2154
508
                                            Group->getAlignment(), "wide.vec");
2155
517
      Group->addMetadata(NewLoad);
2156
517
      NewLoads.push_back(NewLoad);
2157
517
    }
2158
324
2159
324
    // For each member in the group, shuffle out the appropriate data from the
2160
324
    // wide loads.
2161
1.17k
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
2162
849
      Instruction *Member = Group->getMember(I);
2163
849
2164
849
      // Skip the gaps in the group.
2165
849
      if (!Member)
2166
137
        continue;
2167
712
2168
712
      Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2169
1.89k
      for (unsigned Part = 0; Part < UF; Part++) {
2170
1.17k
        Value *StridedVec = Builder.CreateShuffleVector(
2171
1.17k
            NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2172
1.17k
2173
1.17k
        // If this member has different type, cast the result type.
2174
1.17k
        if (Member->getType() != ScalarTy) {
2175
7
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2176
7
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2177
7
        }
2178
1.17k
2179
1.17k
        if (Group->isReverse())
2180
6
          StridedVec = reverseVector(StridedVec);
2181
1.17k
2182
1.17k
        VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2183
1.17k
      }
2184
712
    }
2185
324
    return;
2186
324
  }
2187
345
2188
345
  // The sub vector type for current instruction.
2189
345
  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2190
345
2191
345
  // Vectorize the interleaved store group.
2192
922
  for (unsigned Part = 0; Part < UF; Part++) {
2193
577
    // Collect the stored vector from each member.
2194
577
    SmallVector<Value *, 4> StoredVecs;
2195
1.90k
    for (unsigned i = 0; i < InterleaveFactor; i++) {
2196
1.33k
      // Interleaved store group doesn't allow a gap, so each index has a member
2197
1.33k
      Instruction *Member = Group->getMember(i);
2198
1.33k
      assert(Member && "Fail to get a member from an interleaved store group");
2199
1.33k
2200
1.33k
      Value *StoredVec = getOrCreateVectorValue(
2201
1.33k
          cast<StoreInst>(Member)->getValueOperand(), Part);
2202
1.33k
      if (Group->isReverse())
2203
2
        StoredVec = reverseVector(StoredVec);
2204
1.33k
2205
1.33k
      // If this member has different type, cast it to a unified type.
2206
1.33k
2207
1.33k
      if (StoredVec->getType() != SubVT)
2208
7
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2209
1.33k
2210
1.33k
      StoredVecs.push_back(StoredVec);
2211
1.33k
    }
2212
577
2213
577
    // Concatenate all vectors into a wide vector.
2214
577
    Value *WideVec = concatenateVectors(Builder, StoredVecs);
2215
577
2216
577
    // Interleave the elements in the wide vector.
2217
577
    Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2218
577
    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2219
577
                                              "interleaved.vec");
2220
577
2221
577
    Instruction *NewStoreInstr;
2222
577
    if (IsMaskForCondRequired) {
2223
3
      auto *Undefs = UndefValue::get(Mask[Part]->getType());
2224
3
      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2225
3
      Value *ShuffledMask = Builder.CreateShuffleVector(
2226
3
          Mask[Part], Undefs, RepMask, "interleaved.mask");
2227
3
      NewStoreInstr = Builder.CreateMaskedStore(
2228
3
          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2229
3
    }
2230
574
    else
2231
574
      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], 
2232
574
        Group->getAlignment());
2233
577
2234
577
    Group->addMetadata(NewStoreInstr);
2235
577
  }
2236
345
}
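The per-part pointer adjustment used above can be summarized with a hypothetical helper: the current member's pointer is moved back by its index within the group so the wide access starts at member 0, and for reversed groups the index is first advanced to the last vector lane.

// Hypothetical model of the group base-pointer computation (element units).
const int *groupBasePtr(const int *MemberPtr, unsigned Index, unsigned Factor,
                        unsigned VF, bool Reverse) {
  if (Reverse)
    Index += (VF - 1) * Factor;  // point at the last vector lane's group
  return MemberPtr - Index;      // CreateGEP(..., -Index) in the code above
}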
2237
2238
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2239
21.6k
                                                     VectorParts *BlockInMask) {
2240
21.6k
  // Attempt to issue a wide load.
2241
21.6k
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2242
21.6k
  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2243
21.6k
2244
21.6k
  assert((LI || SI) && "Invalid Load/Store instruction");
2245
21.6k
2246
21.6k
  LoopVectorizationCostModel::InstWidening Decision =
2247
21.6k
      Cost->getWideningDecision(Instr, VF);
2248
21.6k
  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2249
21.6k
         "CM decision should be taken at this point");
2250
21.6k
  if (Decision == LoopVectorizationCostModel::CM_Interleave)
2251
0
    return vectorizeInterleaveGroup(Instr);
2252
21.6k
2253
21.6k
  Type *ScalarDataTy = getMemInstValueType(Instr);
2254
21.6k
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2255
21.6k
  Value *Ptr = getLoadStorePointerOperand(Instr);
2256
21.6k
  unsigned Alignment = getLoadStoreAlignment(Instr);
2257
21.6k
  // An alignment of 0 means target abi alignment. We need to use the scalar's
2258
21.6k
  // target abi alignment in such a case.
2259
21.6k
  const DataLayout &DL = Instr->getModule()->getDataLayout();
2260
21.6k
  if (!Alignment)
2261
79
    Alignment = DL.getABITypeAlignment(ScalarDataTy);
2262
21.6k
  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2263
21.6k
2264
21.6k
  // Determine if the pointer operand of the access is either consecutive or
2265
21.6k
  // reverse consecutive.
2266
21.6k
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2267
21.6k
  bool ConsecutiveStride =
2268
21.6k
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2269
21.6k
  bool CreateGatherScatter =
2270
21.6k
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2271
21.6k
2272
21.6k
  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2273
21.6k
  // gather/scatter. Otherwise Decision should have been to Scalarize.
2274
21.6k
  assert((ConsecutiveStride || CreateGatherScatter) &&
2275
21.6k
         "The instruction should be scalarized");
2276
21.6k
2277
21.6k
  // Handle consecutive loads/stores.
2278
21.6k
  if (ConsecutiveStride)
2279
21.5k
    Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2280
21.6k
2281
21.6k
  VectorParts Mask;
2282
21.6k
  bool isMaskRequired = BlockInMask;
2283
21.6k
  if (isMaskRequired)
2284
95
    Mask = *BlockInMask;
2285
21.6k
2286
21.6k
  bool InBounds = false;
2287
21.6k
  if (auto *gep = dyn_cast<GetElementPtrInst>(
2288
19.2k
          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2289
19.2k
    InBounds = gep->isInBounds();
2290
21.6k
2291
40.2k
  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2292
40.2k
    // Calculate the pointer for the specific unroll-part.
2293
40.2k
    GetElementPtrInst *PartPtr = nullptr;
2294
40.2k
2295
40.2k
    if (Reverse) {
2296
1.21k
      // If the address is consecutive but reversed, then the
2297
1.21k
      // wide store needs to start at the last vector element.
2298
1.21k
      PartPtr = cast<GetElementPtrInst>(
2299
1.21k
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2300
1.21k
      PartPtr->setIsInBounds(InBounds);
2301
1.21k
      PartPtr = cast<GetElementPtrInst>(
2302
1.21k
          Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2303
1.21k
      PartPtr->setIsInBounds(InBounds);
2304
1.21k
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2305
16
        Mask[Part] = reverseVector(Mask[Part]);
2306
39.0k
    } else {
2307
39.0k
      PartPtr = cast<GetElementPtrInst>(
2308
39.0k
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2309
39.0k
      PartPtr->setIsInBounds(InBounds);
2310
39.0k
    }
2311
40.2k
2312
40.2k
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2313
40.2k
  };
2314
21.6k
2315
21.6k
  // Handle Stores:
2316
21.6k
  if (SI) {
2317
14.4k
    setDebugLocFromInst(Builder, SI);
2318
14.4k
2319
42.2k
    for (unsigned Part = 0; Part < UF; ++Part) {
2320
27.7k
      Instruction *NewSI = nullptr;
2321
27.7k
      Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2322
27.7k
      if (CreateGatherScatter) {
2323
29
        Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2324
29
        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2325
29
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2326
29
                                            MaskPart);
2327
27.7k
      } else {
2328
27.7k
        if (Reverse) {
2329
574
          // If we store to reverse consecutive memory locations, then we need
2330
574
          // to reverse the order of elements in the stored value.
2331
574
          StoredVal = reverseVector(StoredVal);
2332
574
          // We don't want to update the value in the map as it might be used in
2333
574
          // another expression. So don't call resetVectorValue(StoredVal).
2334
574
        }
2335
27.7k
        auto *VecPtr = CreateVecPtr(Part, Ptr);
2336
27.7k
        if (isMaskRequired)
2337
100
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2338
100
                                            Mask[Part]);
2339
27.6k
        else
2340
27.6k
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2341
27.7k
      }
2342
27.7k
      addMetadata(NewSI, SI);
2343
27.7k
    }
2344
14.4k
    return;
2345
14.4k
  }
2346
7.20k
2347
7.20k
  // Handle loads.
2348
7.20k
  assert(LI && "Must have a load instruction");
2349
7.20k
  setDebugLocFromInst(Builder, LI);
2350
19.7k
  for (unsigned Part = 0; Part < UF; ++Part) {
2351
12.5k
    Value *NewLI;
2352
12.5k
    if (CreateGatherScatter) {
2353
58
      Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2354
58
      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2355
58
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2356
58
                                         nullptr, "wide.masked.gather");
2357
58
      addMetadata(NewLI, LI);
2358
12.4k
    } else {
2359
12.4k
      auto *VecPtr = CreateVecPtr(Part, Ptr);
2360
12.4k
      if (isMaskRequired)
2361
80
        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2362
80
                                         UndefValue::get(DataTy),
2363
80
                                         "wide.masked.load");
2364
12.4k
      else
2365
12.4k
        NewLI =
2366
12.4k
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2367
12.4k
2368
12.4k
      // Add metadata to the load, but setVectorValue to the reverse shuffle.
2369
12.4k
      addMetadata(NewLI, LI);
2370
12.4k
      if (Reverse)
2371
643
        NewLI = reverseVector(NewLI);
2372
12.4k
    }
2373
12.5k
    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2374
12.5k
  }
2375
7.20k
}
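Aside: a rough plain-C++ sketch of the per-part pointer arithmetic performed by CreateVecPtr above, expressed as scalar element offsets relative to the per-iteration pointer. The helper partStartOffset is hypothetical, for illustration only.

#include <cstdio>

// Hypothetical helper (not an LLVM API): the scalar offset at which the wide
// access of unroll part `Part` begins, for a consecutive and a
// reverse-consecutive access.
static int partStartOffset(unsigned Part, unsigned VF, bool Reverse) {
  if (!Reverse)
    return static_cast<int>(Part * VF);          // GEP(Ptr, Part * VF)
  // Reverse: step back Part * VF elements, then a further VF - 1 so the wide
  // access ends on the element the corresponding scalar iteration touches.
  return -static_cast<int>(Part * VF) + (1 - static_cast<int>(VF));
}

int main() {
  const unsigned VF = 4, UF = 2;
  for (unsigned Part = 0; Part < UF; ++Part)
    std::printf("part %u: forward %d, reverse %d\n", Part,
                partStartOffset(Part, VF, false),
                partStartOffset(Part, VF, true));
  // forward: 0, 4;  reverse: -3, -7 (each wide access spans VF elements
  // upward, and the loaded value is then reversed with reverseVector).
  return 0;
}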
2376
2377
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2378
                                               const VPIteration &Instance,
2379
71.6k
                                               bool IfPredicateInstr) {
2380
71.6k
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2381
71.6k
2382
71.6k
  setDebugLocFromInst(Builder, Instr);
2383
71.6k
2384
71.6k
  // Does this instruction return a value ?
2385
71.6k
  bool IsVoidRetTy = Instr->getType()->isVoidTy();
2386
71.6k
2387
71.6k
  Instruction *Cloned = Instr->clone();
2388
71.6k
  if (!IsVoidRetTy)
2389
68.4k
    Cloned->setName(Instr->getName() + ".cloned");
2390
71.6k
2391
71.6k
  // Replace the operands of the cloned instructions with their scalar
2392
71.6k
  // equivalents in the new loop.
2393
238k
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2394
166k
    auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2395
166k
    Cloned->setOperand(op, NewOp);
2396
166k
  }
2397
71.6k
  addNewMetadata(Cloned, Instr);
2398
71.6k
2399
71.6k
  // Place the cloned scalar in the new loop.
2400
71.6k
  Builder.Insert(Cloned);
2401
71.6k
2402
71.6k
  // Add the cloned scalar to the scalar map entry.
2403
71.6k
  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2404
71.6k
2405
71.6k
  // If we just cloned a new assumption, add it the assumption cache.
2406
71.6k
  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2407
64
    if (II->getIntrinsicID() == Intrinsic::assume)
2408
24
      AC->registerAssumption(II);
2409
71.6k
2410
71.6k
  // End if-block.
2411
71.6k
  if (IfPredicateInstr)
2412
948
    PredicatedInstructions.push_back(Cloned);
2413
71.6k
}
2414
2415
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2416
                                                      Value *End, Value *Step,
2417
17.0k
                                                      Instruction *DL) {
2418
17.0k
  BasicBlock *Header = L->getHeader();
2419
17.0k
  BasicBlock *Latch = L->getLoopLatch();
2420
17.0k
  // As we're just creating this loop, it's possible no latch exists
2421
17.0k
  // yet. If so, use the header as this will be a single block loop.
2422
17.0k
  if (!Latch)
2423
17.0k
    Latch = Header;
2424
17.0k
2425
17.0k
  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2426
17.0k
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2427
17.0k
  setDebugLocFromInst(Builder, OldInst);
2428
17.0k
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2429
17.0k
2430
17.0k
  Builder.SetInsertPoint(Latch->getTerminator());
2431
17.0k
  setDebugLocFromInst(Builder, OldInst);
2432
17.0k
2433
17.0k
  // Create i+1 and fill the PHINode.
2434
17.0k
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2435
17.0k
  Induction->addIncoming(Start, L->getLoopPreheader());
2436
17.0k
  Induction->addIncoming(Next, Latch);
2437
17.0k
  // Create the compare.
2438
17.0k
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
2439
17.0k
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2440
17.0k
2441
17.0k
  // Now we have two terminators. Remove the old one from the block.
2442
17.0k
  Latch->getTerminator()->eraseFromParent();
2443
17.0k
2444
17.0k
  return Induction;
2445
17.0k
}
2446
2447
68.2k
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2448
68.2k
  if (TripCount)
2449
51.1k
    return TripCount;
2450
17.0k
2451
17.0k
  assert(L && "Create Trip Count for null loop.");
2452
17.0k
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2453
17.0k
  // Find the loop boundaries.
2454
17.0k
  ScalarEvolution *SE = PSE.getSE();
2455
17.0k
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2456
17.0k
  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2457
17.0k
         "Invalid loop count");
2458
17.0k
2459
17.0k
  Type *IdxTy = Legal->getWidestInductionType();
2460
17.0k
  assert(IdxTy && "No type for induction");
2461
17.0k
2462
17.0k
  // The exit count might have the type of i64 while the phi is i32. This can
2463
17.0k
  // happen if we have an induction variable that is sign extended before the
2464
17.0k
  // compare. The only way that we get a backedge taken count is that the
2465
17.0k
  // induction variable was signed and as such will not overflow. In such a case
2466
17.0k
  // truncation is legal.
2467
17.0k
  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2468
17.0k
      IdxTy->getPrimitiveSizeInBits())
2469
2
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2470
17.0k
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2471
17.0k
2472
17.0k
  // Get the total trip count from the count by adding 1.
2473
17.0k
  const SCEV *ExitCount = SE->getAddExpr(
2474
17.0k
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2475
17.0k
2476
17.0k
  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2477
17.0k
2478
17.0k
  // Expand the trip count and place the new instructions in the preheader.
2479
17.0k
  // Notice that the pre-header does not change, only the loop body.
2480
17.0k
  SCEVExpander Exp(*SE, DL, "induction");
2481
17.0k
2482
17.0k
  // Count holds the overall loop count (N).
2483
17.0k
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2484
17.0k
                                L->getLoopPreheader()->getTerminator());
2485
17.0k
2486
17.0k
  if (TripCount->getType()->isPointerTy())
2487
159
    TripCount =
2488
159
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2489
159
                                    L->getLoopPreheader()->getTerminator());
2490
17.0k
2491
17.0k
  return TripCount;
2492
17.0k
}
2493
2494
37.3k
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2495
37.3k
  if (VectorTripCount)
2496
20.3k
    return VectorTripCount;
2497
17.0k
2498
17.0k
  Value *TC = getOrCreateTripCount(L);
2499
17.0k
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2500
17.0k
2501
17.0k
  Type *Ty = TC->getType();
2502
17.0k
  Constant *Step = ConstantInt::get(Ty, VF * UF);
2503
17.0k
2504
17.0k
  // If the tail is to be folded by masking, round the number of iterations N
2505
17.0k
  // up to a multiple of Step instead of rounding down. This is done by first
2506
17.0k
  // adding Step-1 and then rounding down. Note that it's ok if this addition
2507
17.0k
  // overflows: the vector induction variable will eventually wrap to zero given
2508
17.0k
  // that it starts at zero and its Step is a power of two; the loop will then
2509
17.0k
  // exit, with the last early-exit vector comparison also producing all-true.
2510
17.0k
  if (Cost->foldTailByMasking()) {
2511
18
    assert(isPowerOf2_32(VF * UF) &&
2512
18
           "VF*UF must be a power of 2 when folding tail by masking");
2513
18
    TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2514
18
  }
2515
17.0k
2516
17.0k
  // Now we need to generate the expression for the part of the loop that the
2517
17.0k
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
2518
17.0k
  // iterations are not required for correctness, or N - Step, otherwise. Step
2519
17.0k
  // is equal to the vectorization factor (number of SIMD elements) times the
2520
17.0k
  // unroll factor (number of SIMD instructions).
2521
17.0k
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2522
17.0k
2523
17.0k
  // If there is a non-reversed interleaved group that may speculatively access
2524
17.0k
  // memory out-of-bounds, we need to ensure that there will be at least one
2525
17.0k
  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2526
17.0k
  // the trip count, we set the remainder to be equal to the step. If the step
2527
17.0k
  // does not evenly divide the trip count, no adjustment is necessary since
2528
17.0k
  // there will already be scalar iterations. Note that the minimum iterations
2529
17.0k
  // check ensures that N >= Step.
2530
17.0k
  if (VF > 1 && Cost->requiresScalarEpilogue()) {
2531
56
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2532
56
    R = Builder.CreateSelect(IsZero, Step, R);
2533
56
  }
2534
17.0k
2535
17.0k
  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2536
17.0k
2537
17.0k
  return VectorTripCount;
2538
17.0k
}
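Aside: the rounding logic above can be illustrated with a small scalar sketch. The helper vectorTripCount and the example numbers are hypothetical, for illustration only.

#include <cassert>
#include <cstdio>

// Hypothetical helper (not an LLVM API): round the trip count TC down to a
// multiple of Step = VF * UF, optionally rounding up first when the tail is
// folded by masking, and forcing a scalar remainder when a scalar epilogue
// iteration must be kept.
static unsigned vectorTripCount(unsigned TC, unsigned Step, bool FoldTail,
                                bool RequiresScalarEpilogue) {
  if (FoldTail) {
    assert((Step & (Step - 1)) == 0 && "Step must be a power of 2");
    TC += Step - 1;               // n.rnd.up (a wrapping add is acceptable)
  }
  unsigned R = TC % Step;         // n.mod.vf
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                     // keep at least one scalar iteration
  return TC - R;                  // n.vec
}

int main() {
  // Example: TC = 10, VF * UF = 8.
  std::printf("%u\n", vectorTripCount(10, 8, false, false)); // 8, 2 scalar iters
  std::printf("%u\n", vectorTripCount(10, 8, true,  false)); // 16, tail masked
  std::printf("%u\n", vectorTripCount(16, 8, false, true));  // 8, epilogue kept
  return 0;
}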
2539
2540
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2541
14
                                                   const DataLayout &DL) {
2542
14
  // Verify that V is a vector type with same number of elements as DstVTy.
2543
14
  unsigned VF = DstVTy->getNumElements();
2544
14
  VectorType *SrcVecTy = cast<VectorType>(V->getType());
2545
14
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2546
14
  Type *SrcElemTy = SrcVecTy->getElementType();
2547
14
  Type *DstElemTy = DstVTy->getElementType();
2548
14
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2549
14
         "Vector elements must have same size");
2550
14
2551
14
  // Do a direct cast if element types are castable.
2552
14
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2553
2
    return Builder.CreateBitOrPointerCast(V, DstVTy);
2554
2
  }
2555
12
  // V cannot be directly casted to desired vector type.
2556
12
  // May happen when V is a floating point vector but DstVTy is a vector of
2557
12
  // pointers or vice-versa. Handle this using a two-step bitcast using an
2558
12
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2559
12
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2560
12
         "Only one type should be a pointer type");
2561
12
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2562
12
         "Only one type should be a floating point type");
2563
12
  Type *IntTy =
2564
12
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2565
12
  VectorType *VecIntTy = VectorType::get(IntTy, VF);
2566
12
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2567
12
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2568
12
}
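Aside: a loose scalar analogue of the two-step cast above, in plain C++: when no direct bit-level conversion exists between two same-width element types, hop through an integer of that width. The double <-> uint64_t round trip below is only an analogy for the vector ptr <-> int <-> FP path, not the vectorizer's code.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Reinterpret the bits of a double through a same-width integer; memcpy is
// the portable way to do a bit-level cast in C++.
static std::uint64_t bitsOf(double D) {
  static_assert(sizeof(double) == sizeof(std::uint64_t), "width must match");
  std::uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  return Bits;
}

static double doubleFromBits(std::uint64_t Bits) {
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}

int main() {
  double X = 1.5;
  std::uint64_t Mid = bitsOf(X);   // first hop: FP -> same-width integer
  double Y = doubleFromBits(Mid);  // second hop: integer -> FP
  std::printf("%f -> 0x%016llx -> %f\n", X, (unsigned long long)Mid, Y);
  return 0;
}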
2569
2570
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2571
17.0k
                                                         BasicBlock *Bypass) {
2572
17.0k
  Value *Count = getOrCreateTripCount(L);
2573
17.0k
  BasicBlock *BB = L->getLoopPreheader();
2574
17.0k
  IRBuilder<> Builder(BB->getTerminator());
2575
17.0k
2576
17.0k
  // Generate code to check if the loop's trip count is less than VF * UF, or
2577
17.0k
  // equal to it in case a scalar epilogue is required; this implies that the
2578
17.0k
  // vector trip count is zero. This check also covers the case where adding one
2579
17.0k
  // to the backedge-taken count overflowed leading to an incorrect trip count
2580
17.0k
  // of zero. In this case we will also jump to the scalar loop.
2581
17.0k
  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2582
17.0k
                                          : ICmpInst::ICMP_ULT;
2583
17.0k
2584
17.0k
  // If tail is to be folded, vector loop takes care of all iterations.
2585
17.0k
  Value *CheckMinIters = Builder.getFalse();
2586
17.0k
  if (!Cost->foldTailByMasking())
2587
17.0k
    CheckMinIters = Builder.CreateICmp(
2588
17.0k
        P, Count, ConstantInt::get(Count->getType(), VF * UF),
2589
17.0k
        "min.iters.check");
2590
17.0k
2591
17.0k
  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2592
17.0k
  // Update dominator tree immediately if the generated block is a
2593
17.0k
  // LoopBypassBlock because SCEV expansions to generate loop bypass
2594
17.0k
  // checks may query it before the current function is finished.
2595
17.0k
  DT->addNewBlock(NewBB, BB);
2596
17.0k
  if (L->getParentLoop())
2597
4.84k
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2598
17.0k
  ReplaceInstWithInst(BB->getTerminator(),
2599
17.0k
                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
2600
17.0k
  LoopBypassBlocks.push_back(BB);
2601
17.0k
}
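Aside: a scalar sketch of the branch condition generated above. The helper skipVectorLoop and the example numbers are hypothetical, for illustration only.

#include <cstdio>

// Hypothetical helper (not an LLVM API): bypass the vector loop when the trip
// count cannot cover even one vector step, using <= when a scalar epilogue
// iteration must be left over and < otherwise.
static bool skipVectorLoop(unsigned Count, unsigned VF, unsigned UF,
                           bool RequiresScalarEpilogue, bool FoldTail) {
  if (FoldTail)
    return false;                 // the masked vector loop handles everything
  unsigned Step = VF * UF;
  return RequiresScalarEpilogue ? Count <= Step : Count < Step;
}

int main() {
  std::printf("%d\n", skipVectorLoop(8, 4, 2, false, false)); // 0: run vector loop
  std::printf("%d\n", skipVectorLoop(8, 4, 2, true,  false)); // 1: go to scalar loop
  return 0;
}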
2602
2603
17.0k
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2604
17.0k
  BasicBlock *BB = L->getLoopPreheader();
2605
17.0k
2606
17.0k
  // Generate the code to check the SCEV assumptions that we made.
2607
17.0k
  // We want the new basic block to start at the first instruction in a
2608
17.0k
  // sequence of instructions that form a check.
2609
17.0k
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2610
17.0k
                   "scev.check");
2611
17.0k
  Value *SCEVCheck =
2612
17.0k
      Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2613
17.0k
2614
17.0k
  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2615
16.7k
    if (C->isZero())
2616
16.7k
      return;
2617
332
2618
332
  assert(!Cost->foldTailByMasking() &&
2619
332
         "Cannot SCEV check stride or overflow when folding tail");
2620
332
  // Create a new block containing the stride check.
2621
332
  BB->setName("vector.scevcheck");
2622
332
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2623
332
  // Update dominator tree immediately if the generated block is a
2624
332
  // LoopBypassBlock because SCEV expansions to generate loop bypass
2625
332
  // checks may query it before the current function is finished.
2626
332
  DT->addNewBlock(NewBB, BB);
2627
332
  if (L->getParentLoop())
2628
159
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2629
332
  ReplaceInstWithInst(BB->getTerminator(),
2630
332
                      BranchInst::Create(Bypass, NewBB, SCEVCheck));
2631
332
  LoopBypassBlocks.push_back(BB);
2632
332
  AddedSafetyChecks = true;
2633
332
}
2634
2635
17.0k
void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2636
17.0k
  // VPlan-native path does not do any analysis for runtime checks currently.
2637
17.0k
  if (EnableVPlanNativePath)
2638
7
    return;
2639
17.0k
2640
17.0k
  BasicBlock *BB = L->getLoopPreheader();
2641
17.0k
2642
17.0k
  // Generate the code that checks in runtime if arrays overlap. We put the
2643
17.0k
  // checks into a separate block to make the more common case of few elements
2644
17.0k
  // faster.
2645
17.0k
  Instruction *FirstCheckInst;
2646
17.0k
  Instruction *MemRuntimeCheck;
2647
17.0k
  std::tie(FirstCheckInst, MemRuntimeCheck) =
2648
17.0k
      Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2649
17.0k
  if (!MemRuntimeCheck)
2650
14.5k
    return;
2651
2.48k
2652
2.48k
  assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
2653
2.48k
  // Create a new block containing the memory check.
2654
2.48k
  BB->setName("vector.memcheck");
2655
2.48k
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2656
2.48k
  // Update dominator tree immediately if the generated block is a
2657
2.48k
  // LoopBypassBlock because SCEV expansions to generate loop bypass
2658
2.48k
  // checks may query it before the current function is finished.
2659
2.48k
  DT->addNewBlock(NewBB, BB);
2660
2.48k
  if (L->getParentLoop())
2661
887
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2662
2.48k
  ReplaceInstWithInst(BB->getTerminator(),
2663
2.48k
                      BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2664
2.48k
  LoopBypassBlocks.push_back(BB);
2665
2.48k
  AddedSafetyChecks = true;
2666
2.48k
2667
2.48k
  // We currently don't use LoopVersioning for the actual loop cloning but we
2668
2.48k
  // still use it to add the noalias metadata.
2669
2.48k
  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2670
2.48k
                                           PSE.getSE());
2671
2.48k
  LVer->prepareNoAliasMetadata();
2672
2.48k
}
2673
2674
Value *InnerLoopVectorizer::emitTransformedIndex(
2675
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2676
17.7k
    const InductionDescriptor &ID) const {
2677
17.7k
2678
17.7k
  SCEVExpander Exp(*SE, DL, "induction");
2679
17.7k
  auto Step = ID.getStep();
2680
17.7k
  auto StartValue = ID.getStartValue();
2681
17.7k
  assert(Index->getType() == Step->getType() &&
2682
17.7k
         "Index type does not match StepValue type");
2683
17.7k
2684
17.7k
  // Note: the IR at this point is broken. We cannot use SE to create any new
2685
17.7k
  // SCEV and then expand it, hoping that SCEV's simplification will give us
2686
17.7k
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2687
17.7k
  // lead to various SCEV crashes. So all we can do is to use builder and rely
2688
17.7k
  // on InstCombine for future simplifications. Here we handle some trivial
2689
17.7k
  // cases only.
2690
17.7k
  auto CreateAdd = [&B](Value *X, Value *Y) {
2691
5.07k
    assert(X->getType() == Y->getType() && "Types don't match!");
2692
5.07k
    if (auto *CX = dyn_cast<ConstantInt>(X))
2693
3.56k
      if (CX->isZero())
2694
2.74k
        return Y;
2695
2.32k
    if (auto *CY = dyn_cast<ConstantInt>(Y))
2696
150
      if (CY->isZero())
2697
9
        return X;
2698
2.31k
    return B.CreateAdd(X, Y);
2699
2.31k
  };
2700
17.7k
2701
17.7k
  auto CreateMul = [&B](Value *X, Value *Y) {
2702
16.0k
    assert(X->getType() == Y->getType() && "Types don't match!");
2703
16.0k
    if (auto *CX = dyn_cast<ConstantInt>(X))
2704
2.70k
      if (CX->isOne())
2705
0
        return Y;
2706
16.0k
    if (auto *CY = dyn_cast<ConstantInt>(Y))
2707
15.6k
      if (CY->isOne())
2708
8.13k
        return X;
2709
7.93k
    return B.CreateMul(X, Y);
2710
7.93k
  };
2711
17.7k
2712
17.7k
  switch (ID.getKind()) {
2713
17.7k
  case InductionDescriptor::IK_IntInduction: {
2714
6.75k
    assert(Index->getType() == StartValue->getType() &&
2715
6.75k
           "Index type does not match StartValue type");
2716
6.75k
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2717
1.68k
      return B.CreateSub(StartValue, Index);
2718
5.07k
    auto *Offset = CreateMul(
2719
5.07k
        Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2720
5.07k
    return CreateAdd(StartValue, Offset);
2721
5.07k
  }
2722
10.9k
  case InductionDescriptor::IK_PtrInduction: {
2723
10.9k
    assert(isa<SCEVConstant>(Step) &&
2724
10.9k
           "Expected constant step for pointer induction");
2725
10.9k
    return B.CreateGEP(
2726
10.9k
        StartValue->getType()->getPointerElementType(), StartValue,
2727
10.9k
        CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2728
10.9k
                                           &*B.GetInsertPoint())));
2729
5.07k
  }
2730
5.07k
  case InductionDescriptor::IK_FpInduction: {
2731
36
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2732
36
    auto InductionBinOp = ID.getInductionBinOp();
2733
36
    assert(InductionBinOp &&
2734
36
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
2735
36
            InductionBinOp->getOpcode() == Instruction::FSub) &&
2736
36
           "Original bin op should be defined for FP induction");
2737
36
2738
36
    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2739
36
2740
36
    // Floating point operations had to be 'fast' to enable the induction.
2741
36
    FastMathFlags Flags;
2742
36
    Flags.setFast();
2743
36
2744
36
    Value *MulExp = B.CreateFMul(StepValue, Index);
2745
36
    if (isa<Instruction>(MulExp))
2746
36
      // We have to check, the MulExp may be a constant.
2747
36
      cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2748
36
2749
36
    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2750
36
                               "induction");
2751
36
    if (isa<Instruction>(BOp))
2752
36
      cast<Instruction>(BOp)->setFastMathFlags(Flags);
2753
36
2754
36
    return BOp;
2755
5.07k
  }
2756
5.07k
  case InductionDescriptor::IK_NoInduction:
2757
0
    return nullptr;
2758
0
  }
2759
0
  llvm_unreachable("invalid enum");
2760
0
}
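Aside: the integer and floating-point cases above reduce to Start + Index * Step (or Start fadd/fsub Step * Index); for pointer inductions the same product feeds a GEP instead of an add. A small scalar sketch with hypothetical helpers, for illustration only:

#include <cstdio>

// Hypothetical helpers (not LLVM APIs): the value of an induction with start
// `Start` and stride `Step` after `Index` iterations.
static long long transformedInt(long long Start, long long Step,
                                long long Index) {
  if (Step == -1)                 // the isMinusOne() fast path above
    return Start - Index;
  return Start + Index * Step;
}

static double transformedFP(double Start, double Step, double Index,
                            bool IsFSub) {
  double Mul = Step * Index;
  return IsFSub ? Start - Mul : Start + Mul;
}

int main() {
  std::printf("%lld\n", transformedInt(100, 3, 8));        // 124
  std::printf("%lld\n", transformedInt(100, -1, 8));       // 92
  std::printf("%f\n", transformedFP(1.0, 0.5, 8, false));  // 5.0
  return 0;
}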
2761
2762
17.0k
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2763
17.0k
  /*
2764
17.0k
   In this function we generate a new loop. The new loop will contain
2765
17.0k
   the vectorized instructions while the old loop will continue to run the
2766
17.0k
   scalar remainder.
2767
17.0k
2768
17.0k
       [ ] <-- loop iteration number check.
2769
17.0k
    /   |
2770
17.0k
   /    v
2771
17.0k
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2772
17.0k
  |  /  |
2773
17.0k
  | /   v
2774
17.0k
  ||   [ ]     <-- vector pre header.
2775
17.0k
  |/    |
2776
17.0k
  |     v
2777
17.0k
  |    [  ] \
2778
17.0k
  |    [  ]_|   <-- vector loop.
2779
17.0k
  |     |
2780
17.0k
  |     v
2781
17.0k
  |   -[ ]   <--- middle-block.
2782
17.0k
  |  /  |
2783
17.0k
  | /   v
2784
17.0k
  -|- >[ ]     <--- new preheader.
2785
17.0k
   |    |
2786
17.0k
   |    v
2787
17.0k
   |   [ ] \
2788
17.0k
   |   [ ]_|   <-- old scalar loop to handle remainder.
2789
17.0k
    \   |
2790
17.0k
     \  v
2791
17.0k
      >[ ]     <-- exit block.
2792
17.0k
   ...
2793
17.0k
   */
2794
17.0k
2795
17.0k
  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2796
17.0k
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2797
17.0k
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2798
17.0k
  MDNode *OrigLoopID = OrigLoop->getLoopID();
2799
17.0k
  assert(VectorPH && "Invalid loop structure");
2800
17.0k
  assert(ExitBlock && "Must have an exit block");
2801
17.0k
2802
17.0k
  // Some loops have a single integer induction variable, while other loops
2803
17.0k
  // don't. One example is C++ iterators that often have multiple pointer
2804
17.0k
  // induction variables. In the code below we also support a case where we
2805
17.0k
  // don't have a single induction variable.
2806
17.0k
  //
2807
17.0k
  // We try to obtain an induction variable from the original loop as hard
2808
17.0k
  // as possible. However if we don't find one that:
2809
17.0k
  //   - is an integer
2810
17.0k
  //   - counts from zero, stepping by one
2811
17.0k
  //   - is the size of the widest induction variable type
2812
17.0k
  // then we create a new one.
2813
17.0k
  OldInduction = Legal->getPrimaryInduction();
2814
17.0k
  Type *IdxTy = Legal->getWidestInductionType();
2815
17.0k
2816
17.0k
  // Split the single block loop into the two loop structure described above.
2817
17.0k
  BasicBlock *VecBody =
2818
17.0k
      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2819
17.0k
  BasicBlock *MiddleBlock =
2820
17.0k
      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2821
17.0k
  BasicBlock *ScalarPH =
2822
17.0k
      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2823
17.0k
2824
17.0k
  // Create and register the new vector loop.
2825
17.0k
  Loop *Lp = LI->AllocateLoop();
2826
17.0k
  Loop *ParentLoop = OrigLoop->getParentLoop();
2827
17.0k
2828
17.0k
  // Insert the new loop into the loop nest and register the new basic blocks
2829
17.0k
  // before calling any utilities such as SCEV that require valid LoopInfo.
2830
17.0k
  if (ParentLoop) {
2831
4.84k
    ParentLoop->addChildLoop(Lp);
2832
4.84k
    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2833
4.84k
    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2834
12.2k
  } else {
2835
12.2k
    LI->addTopLevelLoop(Lp);
2836
12.2k
  }
2837
17.0k
  Lp->addBasicBlockToLoop(VecBody, *LI);
2838
17.0k
2839
17.0k
  // Find the loop boundaries.
2840
17.0k
  Value *Count = getOrCreateTripCount(Lp);
2841
17.0k
2842
17.0k
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
2843
17.0k
2844
17.0k
  // Now, compare the new count to zero. If it is zero skip the vector loop and
2845
17.0k
  // jump to the scalar loop. This check also covers the case where the
2846
17.0k
  // backedge-taken count is uint##_max: adding one to it will overflow leading
2847
17.0k
  // to an incorrect trip count of zero. In this (rare) case we will also jump
2848
17.0k
  // to the scalar loop.
2849
17.0k
  emitMinimumIterationCountCheck(Lp, ScalarPH);
2850
17.0k
2851
17.0k
  // Generate the code to check any assumptions that we've made for SCEV
2852
17.0k
  // expressions.
2853
17.0k
  emitSCEVChecks(Lp, ScalarPH);
2854
17.0k
2855
17.0k
  // Generate the code that checks in runtime if arrays overlap. We put the
2856
17.0k
  // checks into a separate block to make the more common case of few elements
2857
17.0k
  // faster.
2858
17.0k
  emitMemRuntimeChecks(Lp, ScalarPH);
2859
17.0k
2860
17.0k
  // Generate the induction variable.
2861
17.0k
  // The loop step is equal to the vectorization factor (num of SIMD elements)
2862
17.0k
  // times the unroll factor (num of SIMD instructions).
2863
17.0k
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2864
17.0k
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2865
17.0k
  Induction =
2866
17.0k
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2867
17.0k
                              getDebugLocFromInstOrOperands(OldInduction));
2868
17.0k
2869
17.0k
  // We are going to resume the execution of the scalar loop.
2870
17.0k
  // Go over all of the induction variables that we found and fix the
2871
17.0k
  // PHIs that are left in the scalar version of the loop.
2872
17.0k
  // The starting values of PHI nodes depend on the counter of the last
2873
17.0k
  // iteration in the vectorized loop.
2874
17.0k
  // If we come from a bypass edge then we need to start from the original
2875
17.0k
  // start value.
2876
17.0k
2877
17.0k
  // This variable saves the new starting index for the scalar loop. It is used
2878
17.0k
  // to test if there are any tail iterations left once the vector loop has
2879
17.0k
  // completed.
2880
17.0k
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2881
20.3k
  for (auto &InductionEntry : *List) {
2882
20.3k
    PHINode *OrigPhi = InductionEntry.first;
2883
20.3k
    InductionDescriptor II = InductionEntry.second;
2884
20.3k
2885
20.3k
    // Create phi nodes to merge from the  backedge-taken check block.
2886
20.3k
    PHINode *BCResumeVal = PHINode::Create(
2887
20.3k
        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2888
20.3k
    // Copy original phi DL over to the new one.
2889
20.3k
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2890
20.3k
    Value *&EndValue = IVEndValues[OrigPhi];
2891
20.3k
    if (OrigPhi == OldInduction) {
2892
13.2k
      // We know what the end value is.
2893
13.2k
      EndValue = CountRoundDown;
2894
13.2k
    } else {
2895
7.04k
      IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2896
7.04k
      Type *StepType = II.getStep()->getType();
2897
7.04k
      Instruction::CastOps CastOp =
2898
7.04k
        CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2899
7.04k
      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2900
7.04k
      const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2901
7.04k
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2902
7.04k
      EndValue->setName("ind.end");
2903
7.04k
    }
2904
20.3k
2905
20.3k
    // The new PHI merges the original incoming value, in case of a bypass,
2906
20.3k
    // or the value at the end of the vectorized loop.
2907
20.3k
    BCResumeVal->addIncoming(EndValue, MiddleBlock);
2908
20.3k
2909
20.3k
    // Fix the scalar body counter (PHI node).
2910
20.3k
    // The old induction's phi node in the scalar body needs the truncated
2911
20.3k
    // value.
2912
20.3k
    for (BasicBlock *BB : LoopBypassBlocks)
2913
24.3k
      BCResumeVal->addIncoming(II.getStartValue(), BB);
2914
20.3k
    OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
2915
20.3k
  }
2916
17.0k
2917
17.0k
  // We need the OrigLoop (scalar loop part) latch terminator to help
2918
17.0k
  // produce correct debug info for the middle block BB instructions.
2919
17.0k
  // The legality check stage guarantees that the loop will have a single
2920
17.0k
  // latch.
2921
17.0k
  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
2922
17.0k
         "Scalar loop latch terminator isn't a branch");
2923
17.0k
  BranchInst *ScalarLatchBr =
2924
17.0k
      cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
2925
17.0k
2926
17.0k
  // Add a check in the middle block to see if we have completed
2927
17.0k
  // all of the iterations in the first vector loop.
2928
17.0k
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
2929
17.0k
  // If tail is to be folded, we know we don't need to run the remainder.
2930
17.0k
  Value *CmpN = Builder.getTrue();
2931
17.0k
  if (!Cost->foldTailByMasking()) {
2932
17.0k
    CmpN =
2933
17.0k
        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
2934
17.0k
                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
2935
17.0k
2936
17.0k
    // Here we use the same DebugLoc as the scalar loop latch branch instead
2937
17.0k
    // of the corresponding compare because they may have ended up with
2938
17.0k
    // different line numbers and we want to avoid awkward line stepping while
2939
17.0k
    // debugging. Eg. if the compare has got a line number inside the loop.
2940
17.0k
    cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
2941
17.0k
  }
2942
17.0k
2943
17.0k
  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
2944
17.0k
  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
2945
17.0k
  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
2946
17.0k
2947
17.0k
  // Get ready to start creating new instructions into the vectorized body.
2948
17.0k
  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
2949
17.0k
2950
17.0k
  // Save the state.
2951
17.0k
  LoopVectorPreHeader = Lp->getLoopPreheader();
2952
17.0k
  LoopScalarPreHeader = ScalarPH;
2953
17.0k
  LoopMiddleBlock = MiddleBlock;
2954
17.0k
  LoopExitBlock = ExitBlock;
2955
17.0k
  LoopVectorBody = VecBody;
2956
17.0k
  LoopScalarBody = OldBasicBlock;
2957
17.0k
2958
17.0k
  Optional<MDNode *> VectorizedLoopID =
2959
17.0k
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
2960
17.0k
                                      LLVMLoopVectorizeFollowupVectorized});
2961
17.0k
  if (VectorizedLoopID.hasValue()) {
2962
1
    Lp->setLoopID(VectorizedLoopID.getValue());
2963
1
2964
1
    // Do not setAlreadyVectorized if loop attributes have been defined
2965
1
    // explicitly.
2966
1
    return LoopVectorPreHeader;
2967
1
  }
2968
17.0k
2969
17.0k
  // Keep all loop hints from the original loop on the vector loop (we'll
2970
17.0k
  // replace the vectorizer-specific hints below).
2971
17.0k
  if (MDNode *LID = OrigLoop->getLoopID())
2972
1.06k
    Lp->setLoopID(LID);
2973
17.0k
2974
17.0k
  LoopVectorizeHints Hints(Lp, true, *ORE);
2975
17.0k
  Hints.setAlreadyVectorized();
2976
17.0k
2977
17.0k
  return LoopVectorPreHeader;
2978
17.0k
}
2979
2980
// Fix up external users of the induction variable. At this point, we are
2981
// in LCSSA form, with all external PHIs that use the IV having one input value,
2982
// coming from the remainder loop. We need those PHIs to also have a correct
2983
// value for the IV when arriving directly from the middle block.
2984
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2985
                                       const InductionDescriptor &II,
2986
                                       Value *CountRoundDown, Value *EndValue,
2987
20.3k
                                       BasicBlock *MiddleBlock) {
2988
20.3k
  // There are two kinds of external IV usages - those that use the value
2989
20.3k
  // computed in the last iteration (the PHI) and those that use the penultimate
2990
20.3k
  // value (the value that feeds into the phi from the loop latch).
2991
20.3k
  // We allow both, but they, obviously, have different values.
2992
20.3k
2993
20.3k
  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
2994
20.3k
2995
20.3k
  DenseMap<Value *, Value *> MissingVals;
2996
20.3k
2997
20.3k
  // An external user of the last iteration's value should see the value that
2998
20.3k
  // the remainder loop uses to initialize its own IV.
2999
20.3k
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3000
45.4k
  for (User *U : PostInc->users()) {
3001
45.4k
    Instruction *UI = cast<Instruction>(U);
3002
45.4k
    if (!OrigLoop->contains(UI)) {
3003
1.16k
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
3004
1.16k
      MissingVals[UI] = EndValue;
3005
1.16k
    }
3006
45.4k
  }
3007
20.3k
3008
20.3k
  // An external user of the penultimate value needs to see EndValue - Step.
3009
20.3k
  // The simplest way to get this is to recompute it from the constituent SCEVs,
3010
20.3k
  // that is Start + (Step * (CRD - 1)).
3011
46.5k
  for (User *U : OrigPhi->users()) {
3012
46.5k
    auto *UI = cast<Instruction>(U);
3013
46.5k
    if (!OrigLoop->contains(UI)) {
3014
37
      const DataLayout &DL =
3015
37
          OrigLoop->getHeader()->getModule()->getDataLayout();
3016
37
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
3017
37
3018
37
      IRBuilder<> B(MiddleBlock->getTerminator());
3019
37
      Value *CountMinusOne = B.CreateSub(
3020
37
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3021
37
      Value *CMO =
3022
37
          !II.getStep()->getType()->isIntegerTy()
3023
37
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3024
1
                             II.getStep()->getType())
3025
37
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3026
37
      CMO->setName("cast.cmo");
3027
37
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3028
37
      Escape->setName("ind.escape");
3029
37
      MissingVals[UI] = Escape;
3030
37
    }
3031
46.5k
  }
3032
20.3k
3033
20.3k
  for (auto &I : MissingVals) {
3034
1.19k
    PHINode *PHI = cast<PHINode>(I.first);
3035
1.19k
    // One corner case we have to handle is two IVs "chasing" each other,
3036
1.19k
    // that is %IV2 = phi [...], [ %IV1, %latch ]
3037
1.19k
    // In this case, if IV1 has an external use, we need to avoid adding both
3038
1.19k
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
3039
1.19k
    // don't already have an incoming value for the middle block.
3040
1.19k
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3041
1.19k
      PHI->addIncoming(I.second, MiddleBlock);
3042
1.19k
  }
3043
20.3k
}
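Aside: with the integer formula Start + Step * Index, the two fixed-up external values can be checked by hand; the numbers below are purely illustrative.

#include <cstdio>

// Worked sketch of the two external IV values fixed up above: users of the
// post-increment value see the transformed CountRoundDown, users of the phi
// itself see the transformed CountRoundDown - 1 ("ind.escape").
int main() {
  const long long Start = 0, Step = 2, CountRoundDown = 8; // example numbers
  long long EndValue = Start + Step * CountRoundDown;       // 16
  long long Escape   = Start + Step * (CountRoundDown - 1); // 14
  std::printf("EndValue = %lld, penultimate = %lld\n", EndValue, Escape);
  return 0;
}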
3044
3045
namespace {
3046
3047
struct CSEDenseMapInfo {
3048
509k
  static bool canHandle(const Instruction *I) {
3049
509k
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3050
509k
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3051
509k
  }
3052
3053
3.40M
  static inline Instruction *getEmptyKey() {
3054
3.40M
    return DenseMapInfo<Instruction *>::getEmptyKey();
3055
3.40M
  }
3056
3057
907k
  static inline Instruction *getTombstoneKey() {
3058
907k
    return DenseMapInfo<Instruction *>::getTombstoneKey();
3059
907k
  }
3060
3061
275k
  static unsigned getHashValue(const Instruction *I) {
3062
275k
    assert(canHandle(I) && "Unknown instruction!");
3063
275k
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3064
275k
                                                           I->value_op_end()));
3065
275k
  }
3066
3067
2.13M
  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3068
2.13M
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3069
2.13M
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
3070
2.07M
      return LHS == RHS;
3071
57.2k
    return LHS->isIdenticalTo(RHS);
3072
57.2k
  }
3073
};
3074
3075
} // end anonymous namespace
3076
3077
/// Perform CSE of induction variable instructions.
3078
17.0k
static void cse(BasicBlock *BB) {
3079
17.0k
  // Perform simple cse.
3080
17.0k
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3081
526k
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3082
509k
    Instruction *In = &*I++;
3083
509k
3084
509k
    if (!CSEDenseMapInfo::canHandle(In))
3085
395k
      continue;
3086
113k
3087
113k
    // Check if we can replace this instruction with any of the
3088
113k
    // visited instructions.
3089
113k
    if (Instruction *V = CSEMap.lookup(In)) {
3090
1.49k
      In->replaceAllUsesWith(V);
3091
1.49k
      In->eraseFromParent();
3092
1.49k
      continue;
3093
1.49k
    }
3094
112k
3095
112k
    CSEMap[In] = In;
3096
112k
  }
3097
17.0k
}
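Aside: a standalone sketch of the same idea in plain C++, keying a map on opcode plus operands the way CSEDenseMapInfo keys on structurally identical instructions. The Instr struct and keyOf helper are hypothetical, not LLVM types.

#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for an instruction: an opcode name plus the value
// numbers of its operands.
struct Instr {
  std::string Op;
  std::vector<int> Operands;
};

// Structural key: two instructions with the same opcode and operands collide.
static std::string keyOf(const Instr &I) {
  std::string Key = I.Op;
  for (int V : I.Operands)
    Key += "," + std::to_string(V);
  return Key;
}

int main() {
  std::vector<Instr> Block = {
      {"getelementptr", {1, 2}}, {"insertelement", {3, 4}},
      {"getelementptr", {1, 2}},  // structurally identical to the first
  };
  std::unordered_map<std::string, size_t> Seen;
  for (size_t Idx = 0; Idx < Block.size(); ++Idx) {
    auto It = Seen.find(keyOf(Block[Idx]));
    if (It != Seen.end())
      std::printf("instr %zu replaced by instr %zu\n", Idx, It->second);
    else
      Seen.emplace(keyOf(Block[Idx]), Idx);
  }
  return 0;
}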
3098
3099
unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3100
                                                       unsigned VF,
3101
1.93k
                                                       bool &NeedToScalarize) {
3102
1.93k
  Function *F = CI->getCalledFunction();
3103
1.93k
  StringRef FnName = CI->getCalledFunction()->getName();
3104
1.93k
  Type *ScalarRetTy = CI->getType();
3105
1.93k
  SmallVector<Type *, 4> Tys, ScalarTys;
3106
1.93k
  for (auto &ArgOp : CI->arg_operands())
3107
2.17k
    ScalarTys.push_back(ArgOp->getType());
3108
1.93k
3109
1.93k
  // Estimate cost of scalarized vector call. The source operands are assumed
3110
1.93k
  // to be vectors, so we need to extract individual elements from there,
3111
1.93k
  // execute VF scalar calls, and then gather the result into the vector return
3112
1.93k
  // value.
3113
1.93k
  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3114
1.93k
  if (VF == 1)
3115
363
    return ScalarCallCost;
3116
1.57k
3117
1.57k
  // Compute corresponding vector type for return value and arguments.
3118
1.57k
  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3119
1.57k
  for (Type *ScalarTy : ScalarTys)
3120
1.77k
    Tys.push_back(ToVectorTy(ScalarTy, VF));
3121
1.57k
3122
1.57k
  // Compute costs of unpacking argument values for the scalar calls and
3123
1.57k
  // packing the return values to a vector.
3124
1.57k
  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3125
1.57k
3126
1.57k
  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3127
1.57k
3128
1.57k
  // If we can't emit a vector call for this function, then the currently found
3129
1.57k
  // cost is the cost we need to return.
3130
1.57k
  NeedToScalarize = true;
3131
1.57k
  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3132
1.20k
    return Cost;
3133
376
3134
376
  // If the corresponding vector cost is cheaper, return its cost.
3135
376
  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3136
376
  if (VectorCallCost < Cost) {
3137
376
    NeedToScalarize = false;
3138
376
    return VectorCallCost;
3139
376
  }
3140
0
  return Cost;
3141
0
}
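Aside: the cost comparison above in a small scalar sketch. The helper vectorCallCost and the example numbers are hypothetical, for illustration only.

#include <cstdio>

// Hypothetical helper (not an LLVM API): a scalarized call costs VF scalar
// calls plus the extract/insert overhead; if a vector library version exists
// and is cheaper, prefer it.
static unsigned vectorCallCost(unsigned ScalarCallCost, unsigned VF,
                               unsigned ScalarizationCost,
                               bool HasVectorVersion,
                               unsigned VectorVersionCost,
                               bool &NeedToScalarize) {
  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
  NeedToScalarize = true;
  if (HasVectorVersion && VectorVersionCost < Cost) {
    NeedToScalarize = false;
    return VectorVersionCost;
  }
  return Cost;
}

int main() {
  bool NeedToScalarize = false;
  // Example numbers: scalar call = 10, VF = 4, packing overhead = 6,
  // vector version available at cost 18.
  unsigned Cost = vectorCallCost(10, 4, 6, true, 18, NeedToScalarize);
  std::printf("cost = %u, scalarize = %d\n", Cost, NeedToScalarize); // 18, 0
  return 0;
}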
3142
3143
unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3144
1.57k
                                                            unsigned VF) {
3145
1.57k
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3146
1.57k
  assert(ID && "Expected intrinsic call!");
3147
1.57k
3148
1.57k
  FastMathFlags FMF;
3149
1.57k
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3150
1.30k
    FMF = FPMO->getFastMathFlags();
3151
1.57k
3152
1.57k
  SmallVector<Value *, 4> Operands(CI->arg_operands());
3153
1.57k
  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3154
1.57k
}
3155
3156
334
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3157
334
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3158
334
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3159
334
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3160
334
}
3161
275
static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3162
275
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3163
275
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3164
275
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3165
275
}
3166
3167
15.3k
void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3168
15.3k
  // For every instruction `I` in MinBWs, truncate the operands, create a
3169
15.3k
  // truncated version of `I` and reextend its result. InstCombine runs
3170
15.3k
  // later and will remove any ext/trunc pairs.
3171
15.3k
  SmallPtrSet<Value *, 4> Erased;
3172
15.3k
  for (const auto &KV : Cost->getMinimalBitwidths()) {
3173
186
    // If the value wasn't vectorized, we must maintain the original scalar
3174
186
    // type. The absence of the value from VectorLoopValueMap indicates that it
3175
186
    // wasn't vectorized.
3176
186
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3177
2
      continue;
3178
420
    
    for (unsigned Part = 0; Part < UF; ++Part) {
3179
236
      Value *I = getOrCreateVectorValue(KV.first, Part);
3180
236
      if (Erased.find(I) != Erased.end() || I->use_empty() ||
3181
236
          
          !isa<Instruction>(I))
3182
6
        continue;
3183
230
      Type *OriginalTy = I->getType();
3184
230
      Type *ScalarTruncatedTy =
3185
230
          IntegerType::get(OriginalTy->getContext(), KV.second);
3186
230
      Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3187
230
                                          OriginalTy->getVectorNumElements());
3188
230
      if (TruncatedTy == OriginalTy)
3189
17
        continue;
3190
213
3191
213
      IRBuilder<> B(cast<Instruction>(I));
3192
240
      auto ShrinkOperand = [&](Value *V) -> Value * {
3193
240
        if (auto *ZI = dyn_cast<ZExtInst>(V))
3194
63
          if (ZI->getSrcTy() == TruncatedTy)
3195
5
            return ZI->getOperand(0);
3196
235
        return B.CreateZExtOrTrunc(V, TruncatedTy);
3197
235
      };
3198
213
3199
213
      // The actual instruction modification depends on the instruction type,
3200
213
      // unfortunately.
3201
213
      Value *NewI = nullptr;
3202
213
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3203
112
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3204
112
                             ShrinkOperand(BO->getOperand(1)));
3205
112
3206
112
        // Any wrapping introduced by shrinking this operation shouldn't be
3207
112
        // considered undefined behavior. So, we can't unconditionally copy
3208
112
        // arithmetic wrapping flags to NewI.
3209
112
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3210
112
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3211
0
        NewI =
3212
0
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3213
0
                         ShrinkOperand(CI->getOperand(1)));
3214
101
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3215
0
        NewI = B.CreateSelect(SI->getCondition(),
3216
0
                              ShrinkOperand(SI->getTrueValue()),
3217
0
                              ShrinkOperand(SI->getFalseValue()));
3218
101
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
3219
75
        switch (CI->getOpcode()) {
3220
75
        default:
3221
0
          llvm_unreachable("Unhandled cast!");
3222
75
        case Instruction::Trunc:
3223
16
          NewI = ShrinkOperand(CI->getOperand(0));
3224
16
          break;
3225
75
        case Instruction::SExt:
3226
1
          NewI = B.CreateSExtOrTrunc(
3227
1
              CI->getOperand(0),
3228
1
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
3229
1
          break;
3230
75
        case Instruction::ZExt:
3231
58
          NewI = B.CreateZExtOrTrunc(
3232
58
              CI->getOperand(0),
3233
58
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
3234
58
          break;
3235
26
        }
3236
26
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3237
24
        auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3238
24
        auto *O0 = B.CreateZExtOrTrunc(
3239
24
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3240
24
        auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3241
24
        auto *O1 = B.CreateZExtOrTrunc(
3242
24
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3243
24
3244
24
        NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3245
24
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3246
1
        // Don't do anything with the operands, just extend the result.
3247
1
        continue;
3248
1
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3249
1
        auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3250
1
        auto *O0 = B.CreateZExtOrTrunc(
3251
1
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3252
1
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3253
1
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3254
1
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3255
0
        auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3256
0
        auto *O0 = B.CreateZExtOrTrunc(
3257
0
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3258
0
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3259
0
      } else {
3260
0
        // If we don't know what to do, be conservative and don't do anything.
3261
0
        continue;
3262
0
      }
3263
212
3264
212
      // Lastly, extend the result.
3265
212
      NewI->takeName(cast<Instruction>(I));
3266
212
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3267
212
      I->replaceAllUsesWith(Res);
3268
212
      cast<Instruction>(I)->eraseFromParent();
3269
212
      Erased.insert(I);
3270
212
      VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3271
212
    }
3272
184
  }
3273
15.3k
3274
15.3k
  // We'll have created a bunch of ZExts that are now parentless. Clean up.
3275
15.3k
  for (const auto &KV : Cost->getMinimalBitwidths()) {
3276
186
    // If the value wasn't vectorized, we must maintain the original scalar
3277
186
    // type. The absence of the value from VectorLoopValueMap indicates that it
3278
186
    // wasn't vectorized.
3279
186
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3280
2
      continue;
3281
420
    
    for (unsigned Part = 0; Part < UF; ++Part) {
3282
236
      Value *I = getOrCreateVectorValue(KV.first, Part);
3283
236
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3284
236
      if (Inst && Inst->use_empty()) {
3285
3
        Value *NewI = Inst->getOperand(0);
3286
3
        Inst->eraseFromParent();
3287
3
        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3288
3
      }
3289
236
    }
3290
184
  }
3291
15.3k
}
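Aside: a scalar sketch of the shrink-then-reextend idea above; it is only valid because the bitwidth analysis has proven the extra bits are never observed. Plain C++ with illustrative numbers.

#include <cstdint>
#include <cstdio>

// If analysis proves a value only needs 8 bits, the operation can be done on
// the narrow type and the result zero-extended back; it matches the wide
// result modulo 2^8. InstCombine later folds redundant trunc/zext pairs.
int main() {
  std::uint32_t A = 200, B = 100;
  std::uint32_t Wide = (A + B) & 0xFF;                 // wide op, low 8 bits kept
  std::uint8_t NarrowA = static_cast<std::uint8_t>(A); // shrink the operands
  std::uint8_t NarrowB = static_cast<std::uint8_t>(B);
  std::uint32_t Narrow =
      static_cast<std::uint8_t>(NarrowA + NarrowB);    // narrow op, re-extend
  std::printf("wide = %u, narrow = %u\n", Wide, Narrow); // both 44
  return 0;
}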
3292
3293
17.0k
void InnerLoopVectorizer::fixVectorizedLoop() {
3294
17.0k
  // Insert truncates and extends for any truncated instructions as hints to
3295
17.0k
  // InstCombine.
3296
17.0k
  if (VF > 1)
3297
15.3k
    truncateToMinimalBitwidths();
3298
17.0k
3299
17.0k
  // Fix widened non-induction PHIs by setting up the PHI operands.
3300
17.0k
  if (OrigPHIsToFix.size()) {
3301
7
    assert(EnableVPlanNativePath &&
3302
7
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3303
7
    fixNonInductionPHIs();
3304
7
  }
3305
17.0k
3306
17.0k
  // At this point every instruction in the original loop is widened to a
3307
17.0k
  // vector form. Now we need to fix the recurrences in the loop. These PHI
3308
17.0k
  // nodes are currently empty because we did not want to introduce cycles.
3309
17.0k
  // This is the second stage of vectorizing recurrences.
3310
17.0k
  fixCrossIterationPHIs();
3311
17.0k
3312
17.0k
  // Update the dominator tree.
3313
17.0k
  //
3314
17.0k
  // FIXME: After creating the structure of the new loop, the dominator tree is
3315
17.0k
  //        no longer up-to-date, and it remains that way until we update it
3316
17.0k
  //        here. An out-of-date dominator tree is problematic for SCEV,
3317
17.0k
  //        because SCEVExpander uses it to guide code generation. The
3318
17.0k
  //        vectorizer uses SCEVExpanders in several places. Instead, we should
3319
17.0k
  //        keep the dominator tree up-to-date as we go.
3320
17.0k
  updateAnalysis();
3321
17.0k
3322
17.0k
  // Fix-up external users of the induction variables.
3323
17.0k
  for (auto &Entry : *Legal->getInductionVars())
3324
20.3k
    fixupIVUsers(Entry.first, Entry.second,
3325
20.3k
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3326
20.3k
                 IVEndValues[Entry.first], LoopMiddleBlock);
3327
17.0k
3328
17.0k
  fixLCSSAPHIs();
3329
17.0k
  for (Instruction *PI : PredicatedInstructions)
3330
948
    sinkScalarOperands(&*PI);
3331
17.0k
3332
17.0k
  // Remove redundant induction instructions.
3333
17.0k
  cse(LoopVectorBody);
3334
17.0k
}
3335
3336
17.0k
void InnerLoopVectorizer::fixCrossIterationPHIs() {
3337
17.0k
  // In order to support recurrences we need to be able to vectorize Phi nodes.
3338
17.0k
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3339
17.0k
  // stage #2: We now need to fix the recurrences by adding incoming edges to
3340
17.0k
  // the currently empty PHI nodes. At this point every instruction in the
3341
17.0k
  // original loop is widened to a vector form so we can use them to construct
3342
17.0k
  // the incoming edges.
3343
21.7k
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3344
21.7k
    // Handle first-order recurrences and reductions that need to be fixed.
3345
21.7k
    if (Legal->isFirstOrderRecurrence(&Phi))
3346
90
      fixFirstOrderRecurrence(&Phi);
3347
21.6k
    else if (Legal->isReductionVariable(&Phi))
3348
1.36k
      fixReduction(&Phi);
3349
21.7k
  }
3350
17.0k
}
3351
3352
90
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3353
90
  // This is the second phase of vectorizing first-order recurrences. An
3354
90
  // overview of the transformation is described below. Suppose we have the
3355
90
  // following loop.
3356
90
  //
3357
90
  //   for (int i = 0; i < n; ++i)
3358
90
  //     b[i] = a[i] - a[i - 1];
3359
90
  //
3360
90
  // There is a first-order recurrence on "a". For this loop, the shorthand
3361
90
  // scalar IR looks like:
3362
90
  //
3363
90
  //   scalar.ph:
3364
90
  //     s_init = a[-1]
3365
90
  //     br scalar.body
3366
90
  //
3367
90
  //   scalar.body:
3368
90
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
3369
90
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3370
90
  //     s2 = a[i]
3371
90
  //     b[i] = s2 - s1
3372
90
  //     br cond, scalar.body, ...
3373
90
  //
3374
90
  // In this example, s1 is a recurrence because its value depends on the
3375
90
  // previous iteration. In the first phase of vectorization, we created a
3376
90
  // temporary value for s1. We now complete the vectorization and produce the
3377
90
  // shorthand vector IR shown below (for VF = 4, UF = 1).
3378
90
  //
3379
90
  //   vector.ph:
3380
90
  //     v_init = vector(..., ..., ..., a[-1])
3381
90
  //     br vector.body
3382
90
  //
3383
90
  //   vector.body
3384
90
  //     i = phi [0, vector.ph], [i+4, vector.body]
3385
90
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3386
90
  //     v2 = a[i, i+1, i+2, i+3];
3387
90
  //     v3 = vector(v1(3), v2(0, 1, 2))
3388
90
  //     b[i, i+1, i+2, i+3] = v2 - v3
3389
90
  //     br cond, vector.body, middle.block
3390
90
  //
3391
90
  //   middle.block:
3392
90
  //     x = v2(3)
3393
90
  //     br scalar.ph
3394
90
  //
3395
90
  //   scalar.ph:
3396
90
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
3397
90
  //     br scalar.body
3398
90
  //
3399
90
  // After execution completes the vector loop, we extract the next value of
3400
90
  // the recurrence (x) to use as the initial value in the scalar loop.
3401
90
3402
90
  // Get the original loop preheader and single loop latch.
3403
90
  auto *Preheader = OrigLoop->getLoopPreheader();
3404
90
  auto *Latch = OrigLoop->getLoopLatch();
3405
90
3406
90
  // Get the initial and previous values of the scalar recurrence.
3407
90
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3408
90
  auto *Previous = Phi->getIncomingValueForBlock(Latch);
3409
90
3410
90
  // Create a vector from the initial value.
3411
90
  auto *VectorInit = ScalarInit;
3412
90
  if (VF > 1) {
3413
79
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3414
79
    VectorInit = Builder.CreateInsertElement(
3415
79
        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3416
79
        Builder.getInt32(VF - 1), "vector.recur.init");
3417
79
  }
3418
90
3419
90
  // We constructed a temporary phi node in the first phase of vectorization.
3420
90
  // This phi node will eventually be deleted.
3421
90
  Builder.SetInsertPoint(
3422
90
      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3423
90
3424
90
  // Create a phi node for the new recurrence. The current value will either be
3425
90
  // the initial value inserted into a vector or loop-varying vector value.
3426
90
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3427
90
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3428
90
3429
90
  // Get the vectorized previous value of the last part UF - 1. It appears last
3430
90
  // among all unrolled iterations, due to the order of their construction.
3431
90
  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3432
90
3433
90
  // Set the insertion point after the previous value if it is an instruction.
3434
90
  // Note that the previous value may have been constant-folded so it is not
3435
90
  // guaranteed to be an instruction in the vector loop. Also, if the previous
3436
90
  // value is a phi node, we should insert after all the phi nodes to avoid
3437
90
  // breaking basic block verification.
3438
90
  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3439
90
      isa<PHINode>(PreviousLastPart))
3440
7
    Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3441
83
  else
3442
83
    Builder.SetInsertPoint(
3443
83
        &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3444
90
3445
90
  // We will construct a vector for the recurrence by combining the values for
3446
90
  // the current and previous iterations. This is the required shuffle mask.
3447
90
  SmallVector<Constant *, 8> ShuffleMask(VF);
3448
90
  ShuffleMask[0] = Builder.getInt32(VF - 1);
3449
323
  for (unsigned I = 1; I < VF; ++I)
3450
233
    ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3451
90
3452
90
  // The vector from which to take the initial value for the current iteration
3453
90
  // (actual or unrolled). Initially, this is the vector phi node.
3454
90
  Value *Incoming = VecPhi;
3455
90
3456
90
  // Shuffle the current and previous vector and update the vector parts.
3457
221
  for (unsigned Part = 0; Part < UF; ++Part) {
3458
131
    Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3459
131
    Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3460
131
    auto *Shuffle =
3461
131
        VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3462
109
                                             ConstantVector::get(ShuffleMask))
3463
131
               : Incoming;
3464
131
    PhiPart->replaceAllUsesWith(Shuffle);
3465
131
    cast<Instruction>(PhiPart)->eraseFromParent();
3466
131
    VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3467
131
    Incoming = PreviousPart;
3468
131
  }
3469
90
3470
90
  // Fix the latch value of the new recurrence in the vector loop.
3471
90
  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3472
90
3473
90
  // Extract the last vector element in the middle block. This will be the
3474
90
  // initial value for the recurrence when jumping to the scalar loop.
3475
90
  auto *ExtractForScalar = Incoming;
3476
90
  if (VF > 1) {
3477
79
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3478
79
    ExtractForScalar = Builder.CreateExtractElement(
3479
79
        ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3480
79
  }
3481
90
  // Extract the second last element in the middle block if the
3482
90
  // Phi is used outside the loop. We need to extract the phi itself
3483
90
  // and not the last element (the phi update in the current iteration). This
3484
90
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
3485
90
  // when the scalar loop is not run at all.
3486
90
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
3487
90
  if (VF > 1)
3488
79
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3489
79
        Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3490
11
  // When loop is unrolled without vectorizing, initialize
3491
11
  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
3492
11
  // `Incoming`. This is analogous to the vectorized case above: extracting the
3493
11
  // second last element when VF > 1.
3494
11
  else if (UF > 1)
3495
11
    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3496
90
3497
90
  // Fix the initial value of the original recurrence in the scalar loop.
3498
90
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3499
90
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3500
205
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
3501
205
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3502
205
    Start->addIncoming(Incoming, BB);
3503
205
  }
3504
90
3505
90
  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3506
90
  Phi->setName("scalar.recur");
3507
90
3508
90
  // Finally, fix users of the recurrence outside the loop. The users will need
3509
90
  // either the last value of the scalar recurrence or the last value of the
3510
90
  // vector recurrence we extracted in the middle block. Since the loop is in
3511
90
  // LCSSA form, we just need to find all the phi nodes for the original scalar
3512
90
  // recurrence in the exit block, and then add an edge for the middle block.
3513
90
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3514
49
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
3515
9
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3516
9
    }
3517
49
  }
3518
90
}
3519
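// --- Editorial sketch (not part of LoopVectorize.cpp) -----------------------
// The shuffle built in fixFirstOrderRecurrence() above combines the previous
// vector (v1) and the current vector (v2) so that every lane sees the value
// produced by the preceding scalar iteration. A minimal stand-alone model of
// the mask it constructs; the helper name is hypothetical and it assumes only
// VF >= 1 and a caller-provided buffer of VF entries:
static inline void recurrenceShuffleMaskSketch(unsigned VF, int *Mask) {
  Mask[0] = VF - 1;                  // last lane of the previous vector
  for (unsigned I = 1; I < VF; ++I)
    Mask[I] = I + VF - 1;            // first VF-1 lanes of the current vector
}
// For VF = 4 this yields <3, 4, 5, 6>, i.e. v3 = (v1[3], v2[0], v2[1], v2[2]).
// --- End editorial sketch ----------------------------------------------------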
3520
1.36k
void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3521
1.36k
  Constant *Zero = Builder.getInt32(0);
3522
1.36k
3523
1.36k
  // Get its reduction variable descriptor.
3524
1.36k
  assert(Legal->isReductionVariable(Phi) &&
3525
1.36k
         "Unable to find the reduction variable");
3526
1.36k
  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3527
1.36k
3528
1.36k
  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3529
1.36k
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3530
1.36k
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3531
1.36k
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3532
1.36k
    RdxDesc.getMinMaxRecurrenceKind();
3533
1.36k
  setDebugLocFromInst(Builder, ReductionStartValue);
3534
1.36k
3535
1.36k
  // We need to generate a reduction vector from the incoming scalar.
3536
1.36k
  // To do so, we need to generate the 'identity' vector and override
3537
1.36k
  // one of the elements with the incoming scalar reduction. We need
3538
1.36k
  // to do it in the vector-loop preheader.
3539
1.36k
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3540
1.36k
3541
1.36k
  // This is the vector-clone of the value that leaves the loop.
3542
1.36k
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3543
1.36k
3544
1.36k
  // Find the reduction identity variable. Zero for addition, or, xor,
3545
1.36k
  // one for multiplication, -1 for And.
3546
1.36k
  Value *Identity;
3547
1.36k
  Value *VectorStart;
3548
1.36k
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3549
1.36k
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
3550
107
    // MinMax reductions have the start value as their identity.
3551
107
    if (VF == 1) {
3552
44
      VectorStart = Identity = ReductionStartValue;
3553
63
    } else {
3554
63
      VectorStart = Identity =
3555
63
        Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3556
63
    }
3557
1.25k
  } else {
3558
1.25k
    // Handle other reduction kinds:
3559
1.25k
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3560
1.25k
        RK, VecTy->getScalarType());
3561
1.25k
    if (VF == 1) {
3562
261
      Identity = Iden;
3563
261
      // This vector is the Identity vector where the first element is the
3564
261
      // incoming scalar reduction.
3565
261
      VectorStart = ReductionStartValue;
3566
994
    } else {
3567
994
      Identity = ConstantVector::getSplat(VF, Iden);
3568
994
3569
994
      // This vector is the Identity vector where the first element is the
3570
994
      // incoming scalar reduction.
3571
994
      VectorStart =
3572
994
        Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3573
994
    }
3574
1.25k
  }
3575
1.36k
3576
1.36k
  // Fix the vector-loop phi.
3577
1.36k
3578
1.36k
  // Reductions do not have to start at zero. They can start with
3579
1.36k
  // any loop invariant values.
3580
1.36k
  BasicBlock *Latch = OrigLoop->getLoopLatch();
3581
1.36k
  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3582
3.98k
  for (unsigned Part = 0; Part < UF; ++Part) {
3583
2.62k
    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3584
2.62k
    Value *Val = getOrCreateVectorValue(LoopVal, Part);
3585
2.62k
    // Make sure to add the reduction start value only to the
3586
2.62k
    // first unroll part.
3587
2.62k
    Value *StartVal = (Part == 0) ? VectorStart : Identity;
3588
2.62k
    cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3589
2.62k
    cast<PHINode>(VecRdxPhi)
3590
2.62k
      ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3591
2.62k
  }
3592
1.36k
3593
1.36k
  // Before each round, move the insertion point right between
3594
1.36k
  // the PHIs and the values we are going to write.
3595
1.36k
  // This allows us to write both PHINodes and the extractelement
3596
1.36k
  // instructions.
3597
1.36k
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3598
1.36k
3599
1.36k
  setDebugLocFromInst(Builder, LoopExitInst);
3600
1.36k
3601
1.36k
  // If the vector reduction can be performed in a smaller type, we truncate
3602
1.36k
  // then extend the loop exit value to enable InstCombine to evaluate the
3603
1.36k
  // entire expression in the smaller type.
3604
1.36k
  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3605
6
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3606
6
    Builder.SetInsertPoint(
3607
6
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3608
6
    VectorParts RdxParts(UF);
3609
13
    for (unsigned Part = 0; Part < UF; ++Part) {
3610
7
      RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3611
7
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3612
7
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3613
7
                                        : Builder.CreateZExt(Trunc, VecTy);
3614
7
      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3615
21
           UI != RdxParts[Part]->user_end();)
3616
14
        if (*UI != Trunc) {
3617
7
          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3618
7
          RdxParts[Part] = Extnd;
3619
7
        } else {
3620
7
          ++UI;
3621
7
        }
3622
7
    }
3623
6
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3624
13
    for (unsigned Part = 0; Part < UF; ++Part) {
3625
7
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3626
7
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3627
7
    }
3628
6
  }
3629
1.36k
3630
1.36k
  // Reduce all of the unrolled parts into a single vector.
3631
1.36k
  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3632
1.36k
  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3633
1.36k
3634
1.36k
  // The middle block terminator has already been assigned a DebugLoc here (the
3635
1.36k
  // OrigLoop's single latch terminator). We want the whole middle block to
3636
1.36k
  // appear to execute on this line because: (a) it is all compiler generated,
3637
1.36k
  // (b) these instructions are always executed after evaluating the latch
3638
1.36k
  // conditional branch, and (c) other passes may add new predecessors which
3639
1.36k
  // terminate on this line. This is the easiest way to ensure we don't
3640
1.36k
  // accidentally cause an extra step back into the loop while debugging.
3641
1.36k
  setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3642
2.62k
  for (unsigned Part = 1; Part < UF; ++Part) {
3643
1.26k
    Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3644
1.26k
    if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3645
1.19k
      // Floating point operations had to be 'fast' to enable the reduction.
3646
1.19k
      ReducedPartRdx = addFastMathFlag(
3647
1.19k
          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3648
1.19k
                              ReducedPartRdx, "bin.rdx"),
3649
1.19k
          RdxDesc.getFastMathFlags());
3650
73
    else
3651
73
      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3652
73
                                      RdxPart);
3653
1.26k
  }
3654
1.36k
3655
1.36k
  if (VF > 1) {
3656
1.05k
    bool NoNaN = Legal->hasFunNoNaNAttr();
3657
1.05k
    ReducedPartRdx =
3658
1.05k
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3659
1.05k
    // If the reduction can be performed in a smaller type, we need to extend
3660
1.05k
    // the reduction to the wider type before we branch to the original loop.
3661
1.05k
    if (Phi->getType() != RdxDesc.getRecurrenceType())
3662
6
      ReducedPartRdx =
3663
6
        RdxDesc.isSigned()
3664
6
        ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3665
6
        : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3666
1.05k
  }
3667
1.36k
3668
1.36k
  // Create a phi node that merges control-flow from the backedge-taken check
3669
1.36k
  // block and the middle block.
3670
1.36k
  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3671
1.36k
                                        LoopScalarPreHeader->getTerminator());
3672
2.76k
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3673
1.40k
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3674
1.36k
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3675
1.36k
3676
1.36k
  // Now, we need to fix the users of the reduction variable
3677
1.36k
  // inside and outside of the scalar remainder loop.
3678
1.36k
  // We know that the loop is in LCSSA form. We need to update the
3679
1.36k
  // PHI nodes in the exit blocks.
3680
1.70k
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3681
1.70k
    // All PHINodes need to have a single entry edge, or two if
3682
1.70k
    // we already fixed them.
3683
1.70k
    assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3684
1.70k
3685
1.70k
    // We found a reduction value exit-PHI. Update it with the
3686
1.70k
    // incoming bypass edge.
3687
1.70k
    if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3688
1.36k
      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3689
1.70k
  } // end of the LCSSA phi scan.
3690
1.36k
3691
1.36k
    // Fix the scalar loop reduction variable with the incoming reduction sum
3692
1.36k
    // from the vector body and from the backedge value.
3693
1.36k
  int IncomingEdgeBlockIdx =
3694
1.36k
    Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3695
1.36k
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3696
1.36k
  // Pick the other block.
3697
1.36k
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3698
1.36k
  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3699
1.36k
  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3700
1.36k
}
3701
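// --- Editorial sketch (not part of LoopVectorize.cpp) -----------------------
// Scalar model of how fixReduction() above seeds and folds the UF unrolled
// parts: part 0 is started from the real start value, parts 1..UF-1 from the
// identity of the reduction kind (0 for add/or/xor, 1 for mul, all-ones for
// and), so folding the parts with the reduction operation reproduces the
// scalar result. Hypothetical helper, shown for an integer add reduction only:
static inline int foldUnrolledAddPartsSketch(const int *PartSums, unsigned UF) {
  int Rdx = PartSums[0];             // this part carried the start value
  for (unsigned Part = 1; Part < UF; ++Part)
    Rdx += PartSums[Part];           // these parts were seeded with identity 0
  return Rdx;
}
// --- End editorial sketch ----------------------------------------------------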
3702
17.0k
void InnerLoopVectorizer::fixLCSSAPHIs() {
3703
17.0k
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3704
2.58k
    if (LCSSAPhi.getNumIncomingValues() == 1) {
3705
15
      auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3706
15
      // Non-instruction incoming values will have only one value.
3707
15
      unsigned LastLane = 0;
3708
15
      if (isa<Instruction>(IncomingValue)) 
3709
13
          LastLane = Cost->isUniformAfterVectorization(
3710
13
                         cast<Instruction>(IncomingValue), VF)
3711
13
                         ? 0
3712
13
                         : VF - 1;
3713
15
      // Can be a loop invariant incoming value or the last scalar value to be
3714
15
      // extracted from the vectorized loop.
3715
15
      Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3716
15
      Value *lastIncomingValue =
3717
15
          getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3718
15
      LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3719
15
    }
3720
2.58k
  }
3721
17.0k
}
3722
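// --- Editorial note (not part of LoopVectorize.cpp) -------------------------
// fixLCSSAPHIs() above completes exit phis that still have a single incoming
// value. The lane it extracts follows
//   LastLane = isUniformAfterVectorization(IncomingValue, VF) ? 0 : VF - 1;
// a uniform value is identical in every lane, so lane 0 suffices, while a
// varying value must be taken from the last lane of the last unroll part.
// --- End editorial note -------------------------------------------------------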
3723
948
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3724
948
  // The basic block and loop containing the predicated instruction.
3725
948
  auto *PredBB = PredInst->getParent();
3726
948
  auto *VectorLoop = LI->getLoopFor(PredBB);
3727
948
3728
948
  // Initialize a worklist with the operands of the predicated instruction.
3729
948
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3730
948
3731
948
  // Holds instructions that we need to analyze again. An instruction may be
3732
948
  // reanalyzed if we don't yet know if we can sink it or not.
3733
948
  SmallVector<Instruction *, 8> InstsToReanalyze;
3734
948
3735
948
  // Returns true if a given use occurs in the predicated block. Phi nodes use
3736
948
  // their operands in their corresponding predecessor blocks.
3737
3.39k
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3738
3.39k
    auto *I = cast<Instruction>(U.getUser());
3739
3.39k
    BasicBlock *BB = I->getParent();
3740
3.39k
    if (auto *Phi = dyn_cast<PHINode>(I))
3741
0
      BB = Phi->getIncomingBlock(
3742
0
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3743
3.39k
    return BB == PredBB;
3744
3.39k
  };
3745
948
3746
948
  // Iteratively sink the scalarized operands of the predicated instruction
3747
948
  // into the block we created for it. When an instruction is sunk, its
3748
948
  // operands are then added to the worklist. The algorithm ends after one pass
3749
948
  // through the worklist doesn't sink a single instruction.
3750
948
  bool Changed;
3751
1.66k
  do {
3752
1.66k
    // Add the instructions that need to be reanalyzed to the worklist, and
3753
1.66k
    // reset the changed indicator.
3754
1.66k
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3755
1.66k
    InstsToReanalyze.clear();
3756
1.66k
    Changed = false;
3757
1.66k
3758
6.86k
    while (!Worklist.empty()) {
3759
5.19k
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3760
5.19k
3761
5.19k
      // We can't sink an instruction if it is a phi node, is already in the
3762
5.19k
      // predicated block, is not in the loop, or may have side effects.
3763
5.19k
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3764
5.19k
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
3765
2.42k
        continue;
3766
2.76k
3767
2.76k
      // It's legal to sink the instruction if all its uses occur in the
3768
2.76k
      // predicated block. Otherwise, there's nothing to do yet, and we may
3769
2.76k
      // need to reanalyze the instruction.
3770
2.76k
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3771
1.41k
        InstsToReanalyze.push_back(I);
3772
1.41k
        continue;
3773
1.41k
      }
3774
1.34k
3775
1.34k
      // Move the instruction to the beginning of the predicated block, and add
3776
1.34k
      // its operands to the worklist.
3777
1.34k
      I->moveBefore(&*PredBB->getFirstInsertionPt());
3778
1.34k
      Worklist.insert(I->op_begin(), I->op_end());
3779
1.34k
3780
1.34k
      // The sinking may have enabled other instructions to be sunk, so we will
3781
1.34k
      // need to iterate.
3782
1.34k
      Changed = true;
3783
1.34k
    }
3784
1.66k
  } while (Changed);
3785
948
}
3786
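// --- Editorial sketch (not part of LoopVectorize.cpp) -----------------------
// Generic shape of the fixed-point worklist loop used by sinkScalarOperands()
// above: items that cannot be decided yet are re-queued, and the whole pass
// repeats until one full round makes no change. Hypothetical helper; it only
// assumes llvm::SmallVector (already used throughout this file) and a callable
// that returns true when it manages to sink an item.
template <typename T, typename TrySinkFn>
static void sinkToFixedPointSketch(SmallVector<T, 8> Worklist,
                                   TrySinkFn TrySink) {
  SmallVector<T, 8> Retry;
  bool Changed;
  do {
    Changed = false;
    Retry.clear();
    while (!Worklist.empty()) {
      T Item = Worklist.pop_back_val();
      if (TrySink(Item))
        Changed = true;              // sinking may enable sinking its operands
      else
        Retry.push_back(Item);       // undecided: reanalyze on the next round
    }
    Worklist.append(Retry.begin(), Retry.end());
  } while (Changed);
}
// --- End editorial sketch ----------------------------------------------------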
3787
7
void InnerLoopVectorizer::fixNonInductionPHIs() {
3788
9
  for (PHINode *OrigPhi : OrigPHIsToFix) {
3789
9
    PHINode *NewPhi =
3790
9
        cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3791
9
    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3792
9
3793
9
    SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3794
9
        predecessors(OrigPhi->getParent()));
3795
9
    SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3796
9
        predecessors(NewPhi->getParent()));
3797
9
    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3798
9
           "Scalar and Vector BB should have the same number of predecessors");
3799
9
3800
9
    // The insertion point in Builder may be invalidated by the time we get
3801
9
    // here. Force the Builder insertion point to something valid so that we do
3802
9
    // not run into issues during insertion point restore in
3803
9
    // getOrCreateVectorValue calls below.
3804
9
    Builder.SetInsertPoint(NewPhi);
3805
9
3806
9
    // The predecessor order is preserved and we can rely on mapping between
3807
9
    // scalar and vector block predecessors.
3808
26
    for (unsigned i = 0; i < NumIncomingValues; 
++i17
) {
3809
17
      BasicBlock *NewPredBB = VectorBBPredecessors[i];
3810
17
3811
17
      // When looking up the new scalar/vector values to fix up, use incoming
3812
17
      // values from original phi.
3813
17
      Value *ScIncV =
3814
17
          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3815
17
3816
17
      // Scalar incoming value may need a broadcast
3817
17
      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3818
17
      NewPhi->addIncoming(NewIncV, NewPredBB);
3819
17
    }
3820
9
  }
3821
7
}
3822
3823
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3824
5.05k
                                              unsigned VF) {
3825
5.05k
  PHINode *P = cast<PHINode>(PN);
3826
5.05k
  if (EnableVPlanNativePath) {
3827
9
    // Currently we enter here in the VPlan-native path for non-induction
3828
9
    // PHIs where all control flow is uniform. We simply widen these PHIs.
3829
9
    // Create a vector phi with no operands - the vector phi operands will be
3830
9
    // set at the end of vector code generation.
3831
9
    Type *VecTy =
3832
9
        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3833
9
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3834
9
    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3835
9
    OrigPHIsToFix.push_back(P);
3836
9
3837
9
    return;
3838
9
  }
3839
5.04k
3840
5.04k
  assert(PN->getParent() == OrigLoop->getHeader() &&
3841
5.04k
         "Non-header phis should have been handled elsewhere");
3842
5.04k
3843
5.04k
  // In order to support recurrences we need to be able to vectorize Phi nodes.
3844
5.04k
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3845
5.04k
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3846
5.04k
  // this value when we vectorize all of the instructions that use the PHI.
3847
5.04k
  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3848
4.20k
    for (unsigned Part = 0; Part < UF; ++Part) {
3849
2.75k
      // This is phase one of vectorizing PHIs.
3850
2.75k
      Type *VecTy =
3851
2.75k
          (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3852
2.75k
      Value *EntryPart = PHINode::Create(
3853
2.75k
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3854
2.75k
      VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3855
2.75k
    }
3856
1.45k
    return;
3857
1.45k
  }
3858
3.59k
3859
3.59k
  setDebugLocFromInst(Builder, P);
3860
3.59k
3861
3.59k
  // This PHINode must be an induction variable.
3862
3.59k
  // Make sure that we know about it.
3863
3.59k
  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3864
3.59k
3865
3.59k
  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3866
3.59k
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3867
3.59k
3868
3.59k
  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3869
3.59k
  // which can be found from the original scalar operations.
3870
3.59k
  switch (II.getKind()) {
3871
3.59k
  case InductionDescriptor::IK_NoInduction:
3872
0
    llvm_unreachable("Unknown induction");
3873
3.59k
  case InductionDescriptor::IK_IntInduction:
3874
0
  case InductionDescriptor::IK_FpInduction:
3875
0
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
3876
3.59k
  case InductionDescriptor::IK_PtrInduction: {
3877
3.59k
    // Handle the pointer induction variable case.
3878
3.59k
    assert(P->getType()->isPointerTy() && "Unexpected type.");
3879
3.59k
    // This is the normalized GEP that starts counting at zero.
3880
3.59k
    Value *PtrInd = Induction;
3881
3.59k
    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3882
3.59k
    // Determine the number of scalars we need to generate for each unroll
3883
3.59k
    // iteration. If the instruction is uniform, we only need to generate the
3884
3.59k
    // first lane. Otherwise, we generate all VF values.
3885
3.59k
    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3886
3.59k
    // These are the scalar results. Notice that we don't generate vector GEPs
3887
3.59k
    // because scalar GEPs result in better code.
3888
10.9k
    for (unsigned Part = 0; Part < UF; ++Part) {
3889
14.7k
      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3890
7.38k
        Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3891
7.38k
        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3892
7.38k
        Value *SclrGep =
3893
7.38k
            emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3894
7.38k
        SclrGep->setName("next.gep");
3895
7.38k
        VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3896
7.38k
      }
3897
7.33k
    }
3898
3.59k
    return;
3899
0
  }
3900
3.59k
  }
3901
3.59k
}
3902
3903
/// A helper function for checking whether an integer division-related
3904
/// instruction may divide by zero (in which case it must be predicated if
3905
/// executed conditionally in the scalar code).
3906
/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3907
/// Non-zero divisors that are not compile-time constants will not be
3908
/// converted into multiplication, so we will still end up scalarizing
3909
/// the division, but can do so w/o predication.
3910
575
static bool mayDivideByZero(Instruction &I) {
3911
575
  assert((I.getOpcode() == Instruction::UDiv ||
3912
575
          I.getOpcode() == Instruction::SDiv ||
3913
575
          I.getOpcode() == Instruction::URem ||
3914
575
          I.getOpcode() == Instruction::SRem) &&
3915
575
         "Unexpected instruction");
3916
575
  Value *Divisor = I.getOperand(1);
3917
575
  auto *CInt = dyn_cast<ConstantInt>(Divisor);
3918
575
  return !CInt || CInt->isZero();
3919
575
}
3920
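// --- Editorial note (not part of LoopVectorize.cpp) -------------------------
// mayDivideByZero() above only proves safety for a non-zero compile-time
// constant divisor, e.g.:
//   a[i] / 7     -> divisor is ConstantInt(7): no predication required
//   a[i] / b[i]  -> divisor unknown at compile time: returns true
//   a[i] / 0     -> constant zero: returns true (UB if ever executed)
// --- End editorial note -------------------------------------------------------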
3921
50.4k
void InnerLoopVectorizer::widenInstruction(Instruction &I) {
3922
50.4k
  switch (I.getOpcode()) {
3923
50.4k
  case Instruction::Br:
3924
0
  case Instruction::PHI:
3925
0
    llvm_unreachable("This instruction is handled by a different recipe.");
3926
93
  case Instruction::GetElementPtr: {
3927
93
    // Construct a vector GEP by widening the operands of the scalar GEP as
3928
93
    // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3929
93
    // results in a vector of pointers when at least one operand of the GEP
3930
93
    // is vector-typed. Thus, to keep the representation compact, we only use
3931
93
    // vector-typed operands for loop-varying values.
3932
93
    auto *GEP = cast<GetElementPtrInst>(&I);
3933
93
3934
93
    if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
3935
1
      // If we are vectorizing, but the GEP has only loop-invariant operands,
3936
1
      // the GEP we build (by only using vector-typed operands for
3937
1
      // loop-varying values) would be a scalar pointer. Thus, to ensure we
3938
1
      // produce a vector of pointers, we need to either arbitrarily pick an
3939
1
      // operand to broadcast, or broadcast a clone of the original GEP.
3940
1
      // Here, we broadcast a clone of the original.
3941
1
      //
3942
1
      // TODO: If at some point we decide to scalarize instructions having
3943
1
      //       loop-invariant operands, this special case will no longer be
3944
1
      //       required. We would add the scalarization decision to
3945
1
      //       collectLoopScalars() and teach getVectorValue() to broadcast
3946
1
      //       the lane-zero scalar value.
3947
1
      auto *Clone = Builder.Insert(GEP->clone());
3948
2
      for (unsigned Part = 0; Part < UF; ++Part) {
3949
1
        Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
3950
1
        VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
3951
1
        addMetadata(EntryPart, GEP);
3952
1
      }
3953
92
    } else {
3954
92
      // If the GEP has at least one loop-varying operand, we are sure to
3955
92
      // produce a vector of pointers. But if we are only unrolling, we want
3956
92
      // to produce a scalar GEP for each unroll part. Thus, the GEP we
3957
92
      // produce with the code below will be scalar (if VF == 1) or vector
3958
92
      // (otherwise). Note that for the unroll-only case, we still maintain
3959
92
      // values in the vector mapping with initVector, as we do for other
3960
92
      // instructions.
3961
248
      for (unsigned Part = 0; Part < UF; ++Part) {
3962
156
        // The pointer operand of the new GEP. If it's loop-invariant, we
3963
156
        // won't broadcast it.
3964
156
        auto *Ptr =
3965
156
            OrigLoop->isLoopInvariant(GEP->getPointerOperand())
3966
156
                ? GEP->getPointerOperand()
3967
156
                : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
3968
156
3969
156
        // Collect all the indices for the new GEP. If any index is
3970
156
        // loop-invariant, we won't broadcast it.
3971
156
        SmallVector<Value *, 4> Indices;
3972
215
        for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
3973
215
          if (OrigLoop->isLoopInvariant(U.get()))
3974
63
            Indices.push_back(U.get());
3975
152
          else
3976
152
            Indices.push_back(getOrCreateVectorValue(U.get(), Part));
3977
215
        }
3978
156
3979
156
        // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
3980
156
        // but it should be a vector, otherwise.
3981
156
        auto *NewGEP =
3982
156
            GEP->isInBounds()
3983
156
                ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
3984
154
                                            Indices)
3985
156
                : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
3986
156
        assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
3987
156
               "NewGEP is not a pointer vector");
3988
156
        VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
3989
156
        addMetadata(NewGEP, GEP);
3990
156
      }
3991
92
    }
3992
93
3993
93
    break;
3994
0
  }
3995
30.6k
  case Instruction::UDiv:
3996
30.6k
  case Instruction::SDiv:
3997
30.6k
  case Instruction::SRem:
3998
30.6k
  case Instruction::URem:
3999
30.6k
  case Instruction::Add:
4000
30.6k
  case Instruction::FAdd:
4001
30.6k
  case Instruction::Sub:
4002
30.6k
  case Instruction::FSub:
4003
30.6k
  case Instruction::FNeg:
4004
30.6k
  case Instruction::Mul:
4005
30.6k
  case Instruction::FMul:
4006
30.6k
  case Instruction::FDiv:
4007
30.6k
  case Instruction::FRem:
4008
30.6k
  case Instruction::Shl:
4009
30.6k
  case Instruction::LShr:
4010
30.6k
  case Instruction::AShr:
4011
30.6k
  case Instruction::And:
4012
30.6k
  case Instruction::Or:
4013
30.6k
  case Instruction::Xor: {
4014
30.6k
    // Just widen unops and binops.
4015
30.6k
    setDebugLocFromInst(Builder, &I);
4016
30.6k
4017
87.2k
    for (unsigned Part = 0; Part < UF; ++Part) {
4018
56.5k
      SmallVector<Value *, 2> Ops;
4019
56.5k
      for (Value *Op : I.operands())
4020
113k
        Ops.push_back(getOrCreateVectorValue(Op, Part));
4021
56.5k
4022
56.5k
      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4023
56.5k
4024
56.5k
      if (auto *VecOp = dyn_cast<Instruction>(V))
4025
56.5k
        VecOp->copyIRFlags(&I);
4026
56.5k
4027
56.5k
      // Use this vector value for all users of the original instruction.
4028
56.5k
      VectorLoopValueMap.setVectorValue(&I, Part, V);
4029
56.5k
      addMetadata(V, &I);
4030
56.5k
    }
4031
30.6k
4032
30.6k
    break;
4033
30.6k
  }
4034
30.6k
  case Instruction::Select: {
4035
596
    // Widen selects.
4036
596
    // If the selector is loop invariant we can create a select
4037
596
    // instruction with a scalar condition. Otherwise, use vector-select.
4038
596
    auto *SE = PSE.getSE();
4039
596
    bool InvariantCond =
4040
596
        SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4041
596
    setDebugLocFromInst(Builder, &I);
4042
596
4043
596
    // The condition can be loop invariant  but still defined inside the
4044
596
    // loop. This means that we can't just use the original 'cond' value.
4045
596
    // We have to take the 'vectorized' value and pick the first lane.
4046
596
    // Instcombine will make this a no-op.
4047
596
4048
596
    auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4049
596
4050
1.49k
    for (unsigned Part = 0; Part < UF; ++Part) {
4051
902
      Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4052
902
      Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4053
902
      Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4054
902
      Value *Sel =
4055
902
          Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4056
902
      VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4057
902
      addMetadata(Sel, &I);
4058
902
    }
4059
596
4060
596
    break;
4061
30.6k
  }
4062
30.6k
4063
30.6k
  case Instruction::ICmp:
4064
907
  case Instruction::FCmp: {
4065
907
    // Widen compares. Generate vector compares.
4066
907
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
4067
907
    auto *Cmp = dyn_cast<CmpInst>(&I);
4068
907
    setDebugLocFromInst(Builder, Cmp);
4069
2.31k
    for (unsigned Part = 0; Part < UF; ++Part) {
4070
1.40k
      Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4071
1.40k
      Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4072
1.40k
      Value *C = nullptr;
4073
1.40k
      if (FCmp) {
4074
181
        // Propagate fast math flags.
4075
181
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4076
181
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
4077
181
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4078
1.22k
      } else {
4079
1.22k
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4080
1.22k
      }
4081
1.40k
      VectorLoopValueMap.setVectorValue(&I, Part, C);
4082
1.40k
      addMetadata(C, &I);
4083
1.40k
    }
4084
907
4085
907
    break;
4086
907
  }
4087
907
4088
17.8k
  case Instruction::ZExt:
4089
17.8k
  case Instruction::SExt:
4090
17.8k
  case Instruction::FPToUI:
4091
17.8k
  case Instruction::FPToSI:
4092
17.8k
  case Instruction::FPExt:
4093
17.8k
  case Instruction::PtrToInt:
4094
17.8k
  case Instruction::IntToPtr:
4095
17.8k
  case Instruction::SIToFP:
4096
17.8k
  case Instruction::UIToFP:
4097
17.8k
  case Instruction::Trunc:
4098
17.8k
  case Instruction::FPTrunc:
4099
17.8k
  case Instruction::BitCast: {
4100
17.8k
    auto *CI = dyn_cast<CastInst>(&I);
4101
17.8k
    setDebugLocFromInst(Builder, CI);
4102
17.8k
4103
17.8k
    /// Vectorize casts.
4104
17.8k
    Type *DestTy =
4105
17.8k
        (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4106
17.8k
4107
52.0k
    for (unsigned Part = 0; Part < UF; ++Part) {
4108
34.1k
      Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4109
34.1k
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4110
34.1k
      VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4111
34.1k
      addMetadata(Cast, &I);
4112
34.1k
    }
4113
17.8k
    break;
4114
17.8k
  }
4115
17.8k
4116
17.8k
  case Instruction::Call: {
4117
297
    // Ignore dbg intrinsics.
4118
297
    if (isa<DbgInfoIntrinsic>(I))
4119
0
      break;
4120
297
    setDebugLocFromInst(Builder, &I);
4121
297
4122
297
    Module *M = I.getParent()->getParent()->getParent();
4123
297
    auto *CI = cast<CallInst>(&I);
4124
297
4125
297
    StringRef FnName = CI->getCalledFunction()->getName();
4126
297
    Function *F = CI->getCalledFunction();
4127
297
    Type *RetTy = ToVectorTy(CI->getType(), VF);
4128
297
    SmallVector<Type *, 4> Tys;
4129
297
    for (Value *ArgOperand : CI->arg_operands())
4130
342
      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4131
297
4132
297
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4133
297
4134
297
    // The flag shows whether we use an intrinsic or a regular call for the vectorized
4135
297
    // version of the instruction.
4136
297
    // Is it beneficial to perform intrinsic call compared to lib call?
4137
297
    bool NeedToScalarize;
4138
297
    unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4139
297
    bool UseVectorIntrinsic =
4140
297
        ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4141
297
    assert((UseVectorIntrinsic || !NeedToScalarize) &&
4142
297
           "Instruction should be scalarized elsewhere.");
4143
297
4144
652
    for (unsigned Part = 0; Part < UF; ++Part) {
4145
355
      SmallVector<Value *, 4> Args;
4146
761
      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; 
++i406
) {
4147
406
        Value *Arg = CI->getArgOperand(i);
4148
406
        // Some intrinsics have a scalar argument - don't replace it with a
4149
406
        // vector.
4150
406
        if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4151
403
          Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4152
406
        Args.push_back(Arg);
4153
406
      }
4154
355
4155
355
      Function *VectorF;
4156
355
      if (UseVectorIntrinsic) {
4157
221
        // Use vector version of the intrinsic.
4158
221
        Type *TysForDecl[] = {CI->getType()};
4159
221
        if (VF > 1)
4160
221
          TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4161
221
        VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4162
221
      } else {
4163
134
        // Use vector version of the library call.
4164
134
        StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4165
134
        assert(!VFnName.empty() && "Vector function name is empty.");
4166
134
        VectorF = M->getFunction(VFnName);
4167
134
        if (!VectorF) {
4168
83
          // Generate a declaration
4169
83
          FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4170
83
          VectorF =
4171
83
              Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4172
83
          VectorF->copyAttributesFrom(F);
4173
83
        }
4174
134
      }
4175
355
      assert(VectorF && "Can't create vector function.");
4176
355
4177
355
      SmallVector<OperandBundleDef, 1> OpBundles;
4178
355
      CI->getOperandBundlesAsDefs(OpBundles);
4179
355
      CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4180
355
4181
355
      if (isa<FPMathOperator>(V))
4182
335
        V->copyFastMathFlags(CI);
4183
355
4184
355
      VectorLoopValueMap.setVectorValue(&I, Part, V);
4185
355
      addMetadata(V, &I);
4186
355
    }
4187
297
4188
297
    break;
4189
297
  }
4190
297
4191
297
  default:
4192
0
    // This instruction is not vectorized by simple widening.
4193
0
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4194
0
    llvm_unreachable("Unhandled instruction!");
4195
50.4k
  } // end of switch.
4196
50.4k
}
4197
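// --- Editorial note (not part of LoopVectorize.cpp) -------------------------
// Shape of the output of widenInstruction() above for a scalar binary op with
// VF = 4 and UF = 2 (illustrative IR, value names hypothetical): one wide
// operation is emitted per unroll part and recorded in VectorLoopValueMap,
//   %wide.add.0 = add <4 x i32> %a.vec.0, %b.vec.0
//   %wide.add.1 = add <4 x i32> %a.vec.1, %b.vec.1
// --- End editorial note -------------------------------------------------------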
4198
17.0k
void InnerLoopVectorizer::updateAnalysis() {
4199
17.0k
  // Forget the original basic block.
4200
17.0k
  PSE.getSE()->forgetLoop(OrigLoop);
4201
17.0k
4202
17.0k
  // DT is not kept up-to-date for outer loop vectorization
4203
17.0k
  if (EnableVPlanNativePath)
4204
7
    return;
4205
17.0k
4206
17.0k
  // Update the dominator tree information.
4207
17.0k
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4208
17.0k
         "Entry does not dominate exit.");
4209
17.0k
4210
17.0k
  DT->addNewBlock(LoopMiddleBlock,
4211
17.0k
                  LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4212
17.0k
  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4213
17.0k
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4214
17.0k
  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4215
17.0k
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4216
17.0k
}
4217
4218
32.3k
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4219
32.3k
  // We should not collect Scalars more than once per VF. Right now, this
4220
32.3k
  // function is called from collectUniformsAndScalars(), which already does
4221
32.3k
  // this check. Collecting Scalars for VF=1 does not make any sense.
4222
32.3k
  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4223
32.3k
         "This function should not be visited twice for the same VF");
4224
32.3k
4225
32.3k
  SmallSetVector<Instruction *, 8> Worklist;
4226
32.3k
4227
32.3k
  // These sets are used to seed the analysis with pointers used by memory
4228
32.3k
  // accesses that will remain scalar.
4229
32.3k
  SmallSetVector<Instruction *, 8> ScalarPtrs;
4230
32.3k
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4231
32.3k
4232
32.3k
  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4233
32.3k
  // The pointer operands of loads and stores will be scalar as long as the
4234
32.3k
  // memory access is not a gather or scatter operation. The value operand of a
4235
32.3k
  // store will remain scalar if the store is scalarized.
4236
32.3k
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4237
16.3k
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4238
16.3k
    assert(WideningDecision != CM_Unknown &&
4239
16.3k
           "Widening decision should be ready at this moment");
4240
16.3k
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4241
6.44k
      if (Ptr == Store->getValueOperand())
4242
82
        return WideningDecision == CM_Scalarize;
4243
16.2k
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4244
16.2k
           "Ptr is neither a value or pointer operand");
4245
16.2k
    return WideningDecision != CM_GatherScatter;
4246
16.2k
  };
4247
32.3k
4248
32.3k
  // A helper that returns true if the given value is a bitcast or
4249
32.3k
  // getelementptr instruction contained in the loop.
4250
240k
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4251
240k
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4252
240k
            isa<GetElementPtrInst>(V)) &&
4253
240k
           !TheLoop->isLoopInvariant(V);
4254
240k
  };
4255
32.3k
4256
32.3k
  // A helper that evaluates a memory access's use of a pointer. If the use
4257
32.3k
  // will be a scalar use, and the pointer is only used by memory accesses, we
4258
32.3k
  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4259
32.3k
  // PossibleNonScalarPtrs.
4260
96.9k
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4261
96.9k
    // We only care about bitcast and getelementptr instructions contained in
4262
96.9k
    // the loop.
4263
96.9k
    if (!isLoopVaryingBitCastOrGEP(Ptr))
4264
36.5k
      return;
4265
60.3k
4266
60.3k
    // If the pointer has already been identified as scalar (e.g., if it was
4267
60.3k
    // also identified as uniform), there's nothing to do.
4268
60.3k
    auto *I = cast<Instruction>(Ptr);
4269
60.3k
    if (Worklist.count(I))
4270
46.0k
      return;
4271
14.3k
4272
14.3k
    // If the use of the pointer will be a scalar use, and all users of the
4273
14.3k
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4274
14.3k
    // place the pointer in PossibleNonScalarPtrs.
4275
16.5k
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4276
16.5k
          return isa<LoadInst>(U) || isa<StoreInst>(U);
4277
16.5k
        }))
4278
14.1k
      ScalarPtrs.insert(I);
4279
196
    else
4280
196
      PossibleNonScalarPtrs.insert(I);
4281
14.3k
  };
4282
32.3k
4283
32.3k
  // We seed the scalars analysis with three classes of instructions: (1)
4284
32.3k
  // instructions marked uniform-after-vectorization, (2) bitcast and
4285
32.3k
  // getelementptr instructions used by memory accesses requiring a scalar use,
4286
32.3k
  // and (3) pointer induction variables and their update instructions (we
4287
32.3k
  // currently only scalarize these).
4288
32.3k
  //
4289
32.3k
  // (1) Add to the worklist all instructions that have been identified as
4290
32.3k
  // uniform-after-vectorization.
4291
32.3k
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4292
32.3k
4293
32.3k
  // (2) Add to the worklist all bitcast and getelementptr instructions used by
4294
32.3k
  // memory accesses requiring a scalar use. The pointer operands of loads and
4295
32.3k
  // stores will be scalar as long as the memory access is not a gather or
4296
32.3k
  // scatter operation. The value operand of a store will remain scalar if the
4297
32.3k
  // store is scalarized.
4298
32.3k
  for (auto *BB : TheLoop->blocks())
4299
389k
    for (auto &I : *BB) {
4300
389k
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
4301
31.6k
        evaluatePtrUse(Load, Load->getPointerOperand());
4302
357k
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4303
32.6k
        evaluatePtrUse(Store, Store->getPointerOperand());
4304
32.6k
        evaluatePtrUse(Store, Store->getValueOperand());
4305
32.6k
      }
4306
389k
    }
4307
32.3k
  for (auto *I : ScalarPtrs)
4308
13.0k
    if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4309
13.0k
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4310
13.0k
      Worklist.insert(I);
4311
13.0k
    }
4312
32.3k
4313
32.3k
  // (3) Add to the worklist all pointer induction variables and their update
4314
32.3k
  // instructions.
4315
32.3k
  //
4316
32.3k
  // TODO: Once we are able to vectorize pointer induction variables we should
4317
32.3k
  //       no longer insert them into the worklist here.
4318
32.3k
  auto *Latch = TheLoop->getLoopLatch();
4319
39.0k
  for (auto &Induction : *Legal->getInductionVars()) {
4320
39.0k
    auto *Ind = Induction.first;
4321
39.0k
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4322
39.0k
    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4323
31.8k
      continue;
4324
7.21k
    Worklist.insert(Ind);
4325
7.21k
    Worklist.insert(IndUpdate);
4326
7.21k
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4327
7.21k
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4328
7.21k
                      << "\n");
4329
7.21k
  }
4330
32.3k
4331
32.3k
  // Insert the forced scalars.
4332
32.3k
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
4333
32.3k
  // induction variable when the PHI user is scalarized.
4334
32.3k
  auto ForcedScalar = ForcedScalars.find(VF);
4335
32.3k
  if (ForcedScalar != ForcedScalars.end())
4336
13
    for (auto *I : ForcedScalar->second)
4337
60
      Worklist.insert(I);
4338
32.3k
4339
32.3k
  // Expand the worklist by looking through any bitcasts and getelementptr
4340
32.3k
  // instructions we've already identified as scalar. This is similar to the
4341
32.3k
  // expansion step in collectLoopUniforms(); however, here we're only
4342
32.3k
  // expanding to include additional bitcasts and getelementptr instructions.
4343
32.3k
  unsigned Idx = 0;
4344
175k
  while (Idx != Worklist.size()) {
4345
143k
    Instruction *Dst = Worklist[Idx++];
4346
143k
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4347
132k
      continue;
4348
10.9k
    auto *Src = cast<Instruction>(Dst->getOperand(0));
4349
30.6k
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4350
30.6k
          auto *J = cast<Instruction>(U);
4351
30.6k
          return !TheLoop->contains(J) || Worklist.count(J) ||
4352
30.6k
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4353
1.99k
                  isScalarUse(J, Src));
4354
30.6k
        })) {
4355
10.9k
      Worklist.insert(Src);
4356
10.9k
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4357
10.9k
    }
4358
10.9k
  }
4359
32.3k
4360
32.3k
  // An induction variable will remain scalar if all users of the induction
4361
32.3k
  // variable and induction variable update remain scalar.
4362
39.0k
  for (auto &Induction : *Legal->getInductionVars()) {
4363
39.0k
    auto *Ind = Induction.first;
4364
39.0k
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4365
39.0k
4366
39.0k
    // We already considered pointer induction variables, so there's no reason
4367
39.0k
    // to look at their users again.
4368
39.0k
    //
4369
39.0k
    // TODO: Once we are able to vectorize pointer induction variables we
4370
39.0k
    //       should no longer skip over them here.
4371
39.0k
    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4372
7.21k
      continue;
4373
31.8k
4374
31.8k
    // Determine if all users of the induction variable are scalar after
4375
31.8k
    // vectorization.
4376
73.1k
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4377
73.1k
      auto *I = cast<Instruction>(U);
4378
73.1k
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4379
73.1k
    });
4380
31.8k
    if (!ScalarInd)
4381
4.16k
      continue;
4382
27.6k
4383
27.6k
    // Determine if all users of the induction variable update instruction are
4384
27.6k
    // scalar after vectorization.
4385
27.6k
    auto ScalarIndUpdate =
4386
65.9k
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4387
65.9k
          auto *I = cast<Instruction>(U);
4388
65.9k
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4389
65.9k
        });
4390
27.6k
    if (!ScalarIndUpdate)
4391
9.78k
      continue;
4392
17.8k
4393
17.8k
    // The induction variable and its update instruction will remain scalar.
4394
17.8k
    Worklist.insert(Ind);
4395
17.8k
    Worklist.insert(IndUpdate);
4396
17.8k
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4397
17.8k
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4398
17.8k
                      << "\n");
4399
17.8k
  }
4400
32.3k
4401
32.3k
  Scalars[VF].insert(Worklist.begin(), Worklist.end());
4402
32.3k
}
4403
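// --- Editorial note (not part of LoopVectorize.cpp) -------------------------
// Example of what collectLoopScalars() above keeps scalar for a simple
// reduction loop such as "for (i) sum += a[i];" with a consecutive, widened
// load: the induction phi, its update, and the getelementptr feeding the load
// stay scalar, while the load and the add are widened. Pointer inductions are
// additionally forced scalar for now, as noted in the TODOs above.
// --- End editorial note -------------------------------------------------------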
4404
749k
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4405
749k
  if (!blockNeedsPredication(I->getParent()))
4406
708k
    return false;
4407
41.0k
  switch(I->getOpcode()) {
4408
41.0k
  default:
4409
27.6k
    break;
4410
41.0k
  case Instruction::Load:
4411
12.7k
  case Instruction::Store: {
4412
12.7k
    if (!Legal->isMaskRequired(I))
4413
86
      return false;
4414
12.7k
    auto *Ptr = getLoadStorePointerOperand(I);
4415
12.7k
    auto *Ty = getMemInstValueType(I);
4416
12.7k
    // We have already decided how to vectorize this instruction, get that
4417
12.7k
    // result.
4418
12.7k
    if (VF > 1) {
4419
4.54k
      InstWidening WideningDecision = getWideningDecision(I, VF);
4420
4.54k
      assert(WideningDecision != CM_Unknown &&
4421
4.54k
             "Widening decision should be ready at this moment");
4422
4.54k
      return WideningDecision == CM_Scalarize;
4423
4.54k
    }
4424
8.16k
    return isa<LoadInst>(I) ?
4425
3.64k
        !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4426
8.16k
      : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4427
8.16k
  }
4428
8.16k
  case Instruction::UDiv:
4429
575
  case Instruction::SDiv:
4430
575
  case Instruction::SRem:
4431
575
  case Instruction::URem:
4432
575
    return mayDivideByZero(*I);
4433
27.6k
  }
4434
27.6k
  return false;
4435
27.6k
}
4436
4437
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4438
2.26k
                                                               unsigned VF) {
4439
2.26k
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4440
2.26k
  assert(getWideningDecision(I, VF) == CM_Unknown &&
4441
2.26k
         "Decision should not be set yet.");
4442
2.26k
  auto *Group = getInterleavedAccessGroup(I);
4443
2.26k
  assert(Group && "Must have a group.");
4444
2.26k
4445
2.26k
  // If the instruction's allocated size doesn't equal it's type size, it
4446
2.26k
  // requires padding and will be scalarized.
4447
2.26k
  auto &DL = I->getModule()->getDataLayout();
4448
2.26k
  auto *ScalarTy = getMemInstValueType(I);
4449
2.26k
  if (hasIrregularType(ScalarTy, DL, VF))
4450
1
    return false;
4451
2.26k
4452
2.26k
  // Check if masking is required.
4453
2.26k
  // A Group may need masking for one of two reasons: it resides in a block that
4454
2.26k
  // needs predication, or it was decided to use masking to deal with gaps.
4455
2.26k
  bool PredicatedAccessRequiresMasking = 
4456
2.26k
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4457
2.26k
  bool AccessWithGapsRequiresMasking = 
4458
2.26k
      Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
4459
2.26k
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4460
2.25k
    return true;
4461
10
4462
10
  // If masked interleaving is required, we expect that the user/target had
4463
10
  // enabled it, because otherwise it either wouldn't have been created or
4464
10
  // it should have been invalidated by the CostModel.
4465
10
  assert(useMaskedInterleavedAccesses(TTI) &&
4466
10
         "Masked interleave-groups for predicated accesses are not enabled.");
4467
10
4468
10
  auto *Ty = getMemInstValueType(I);
4469
10
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4470
10
                          : TTI.isLegalMaskedStore(Ty);
4471
10
}
4472
4473
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4474
63.6k
                                                               unsigned VF) {
4475
63.6k
  // Get and ensure we have a valid memory instruction.
4476
63.6k
  LoadInst *LI = dyn_cast<LoadInst>(I);
4477
63.6k
  StoreInst *SI = dyn_cast<StoreInst>(I);
4478
63.6k
  assert((LI || SI) && "Invalid memory instruction");
4479
63.6k
4480
63.6k
  auto *Ptr = getLoadStorePointerOperand(I);
4481
63.6k
4482
63.6k
  // In order to be widened, the pointer should be consecutive, first of all.
4483
63.6k
  if (!Legal->isConsecutivePtr(Ptr))
4484
19.1k
    return false;
4485
44.5k
4486
44.5k
  // If the instruction is a store located in a predicated block, it will be
4487
44.5k
  // scalarized.
4488
44.5k
  if (isScalarWithPredication(I))
4489
1.06k
    return false;
4490
43.4k
4491
43.4k
  // If the instruction's allocated size doesn't equal its type size, it
4492
43.4k
  // requires padding and will be scalarized.
4493
43.4k
  auto &DL = I->getModule()->getDataLayout();
4494
43.4k
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4495
43.4k
  if (hasIrregularType(ScalarTy, DL, VF))
4496
1
    return false;
4497
43.4k
4498
43.4k
  return true;
4499
43.4k
}
4500
4501
32.3k
void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4502
32.3k
  // We should not collect Uniforms more than once per VF. Right now,
4503
32.3k
  // this function is called from collectUniformsAndScalars(), which
4504
32.3k
  // already does this check. Collecting Uniforms for VF=1 does not make any
4505
32.3k
  // sense.
4506
32.3k
4507
32.3k
  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4508
32.3k
         "This function should not be visited twice for the same VF");
4509
32.3k
4510
32.3k
  // Visit the list of Uniforms. If we do not find any uniform value, we will
4511
32.3k
  // not analyze it again; Uniforms.count(VF) will return 1.
4512
32.3k
  Uniforms[VF].clear();
4513
32.3k
4514
32.3k
  // We now know that the loop is vectorizable!
4515
32.3k
  // Collect instructions inside the loop that will remain uniform after
4516
32.3k
  // vectorization.
4517
32.3k
4518
32.3k
  // Global values, params and instructions outside of current loop are out of
4519
32.3k
  // scope.
4520
197k
  auto isOutOfScope = [&](Value *V) -> bool {
4521
197k
    Instruction *I = dyn_cast<Instruction>(V);
4522
197k
    return (!I || !TheLoop->contains(I));
4523
197k
  };
4524
32.3k
4525
32.3k
  SetVector<Instruction *> Worklist;
4526
32.3k
  BasicBlock *Latch = TheLoop->getLoopLatch();
4527
32.3k
4528
32.3k
  // Start with the conditional branch. If the branch condition is an
4529
32.3k
  // instruction contained in the loop that is only used by the branch, it is
4530
32.3k
  // uniform.
4531
32.3k
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4532
32.3k
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4533
32.3k
    Worklist.insert(Cmp);
4534
32.3k
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4535
32.3k
  }
4536
32.3k
4537
32.3k
  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4538
32.3k
  // are pointers that are treated like consecutive pointers during
4539
32.3k
  // vectorization. The pointer operands of interleaved accesses are an
4540
32.3k
  // example.
4541
32.3k
  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4542
32.3k
4543
32.3k
  // Holds pointer operands of instructions that are possibly non-uniform.
4544
32.3k
  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4545
32.3k
4546
64.0k
  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4547
64.0k
    InstWidening WideningDecision = getWideningDecision(I, VF);
4548
64.0k
    assert(WideningDecision != CM_Unknown &&
4549
64.0k
           "Widening decision should be ready at this moment");
4550
64.0k
4551
64.0k
    return (WideningDecision == CM_Widen ||
4552
64.0k
            WideningDecision == CM_Widen_Reverse ||
4553
64.0k
            WideningDecision == CM_Interleave);
4554
64.0k
  };
4555
32.3k
  // Iterate over the instructions in the loop, and collect all
4556
32.3k
  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4557
32.3k
  // that a consecutive-like pointer operand will be scalarized, we collect it
4558
32.3k
  // in PossibleNonUniformPtrs instead. We use two sets here because a single
4559
32.3k
  // getelementptr instruction can be used by both vectorized and scalarized
4560
32.3k
  // memory instructions. For example, if a loop loads and stores from the same
4561
32.3k
  // location, but the store is conditional, the store will be scalarized, and
4562
32.3k
  // the getelementptr won't remain uniform.
4563
32.3k
  for (auto *BB : TheLoop->blocks())
4564
389k
    
    for (auto &I : *BB) {
4565
389k
      // If there's no pointer operand, there's nothing to do.
4566
389k
      auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4567
389k
      if (!Ptr)
4568
324k
        continue;
4569
64.1k
4570
64.1k
      // True if all users of Ptr are memory accesses that have Ptr as their
4571
64.1k
      // pointer operand.
4572
64.1k
      auto UsersAreMemAccesses =
4573
73.7k
          llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4574
73.7k
            return getLoadStorePointerOperand(U) == Ptr;
4575
73.7k
          });
4576
64.1k
4577
64.1k
      // Ensure the memory instruction will not be scalarized or used by
4578
64.1k
      // gather/scatter, making its pointer operand non-uniform. If the pointer
4579
64.1k
      // operand is used by any instruction other than a memory access, we
4580
64.1k
      // conservatively assume the pointer operand may be non-uniform.
4581
64.1k
      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4582
18.8k
        PossibleNonUniformPtrs.insert(Ptr);
4583
45.2k
4584
45.2k
      // If the memory instruction will be vectorized and its pointer operand
4585
45.2k
      // is consecutive-like, or interleaving - the pointer operand should
4586
45.2k
      // remain uniform.
4587
45.2k
      else
4588
45.2k
        ConsecutiveLikePtrs.insert(Ptr);
4589
64.1k
    }
4590
32.3k
4591
32.3k
  // Add to the Worklist all consecutive and consecutive-like pointers that
4592
32.3k
  // aren't also identified as possibly non-uniform.
4593
32.3k
  for (auto *V : ConsecutiveLikePtrs)
4594
43.1k
    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4595
42.8k
      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4596
42.8k
      Worklist.insert(V);
4597
42.8k
    }
4598
32.3k
4599
32.3k
  // Expand Worklist in topological order: whenever a new instruction
4600
32.3k
  // is added, its users should already be inside Worklist. This ensures
4601
32.3k
  // a uniform instruction will only be used by uniform instructions.
4602
32.3k
  unsigned idx = 0;
4603
115k
  while (idx != Worklist.size()) {
4604
82.6k
    Instruction *I = Worklist[idx++];
4605
82.6k
4606
197k
    for (auto OV : I->operand_values()) {
4607
197k
      // isOutOfScope operands cannot be uniform instructions.
4608
197k
      if (isOutOfScope(OV))
4609
114k
        continue;
4610
82.8k
      // First order recurrence Phi's should typically be considered
4611
82.8k
      // non-uniform.
4612
82.8k
      auto *OP = dyn_cast<PHINode>(OV);
4613
82.8k
      if (OP && Legal->isFirstOrderRecurrence(OP))
4614
0
        continue;
4615
82.8k
      // If all the users of the operand are uniform, then add the
4616
82.8k
      // operand into the uniform worklist.
4617
82.8k
      auto *OI = cast<Instruction>(OV);
4618
180k
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4619
180k
            auto *J = cast<Instruction>(U);
4620
180k
            return Worklist.count(J) ||
4621
180k
                   (OI == getLoadStorePointerOperand(J) &&
4622
74.2k
                    isUniformDecision(J, VF));
4623
180k
          })) {
4624
8.88k
        Worklist.insert(OI);
4625
8.88k
        LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4626
8.88k
      }
4627
82.8k
    }
4628
82.6k
  }
4629
32.3k
4630
32.3k
  // Returns true if Ptr is the pointer operand of a memory access instruction
4631
32.3k
  // I, and I is known to not require scalarization.
4632
32.3k
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4633
19.3k
    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4634
19.3k
  };
4635
32.3k
4636
32.3k
  // For an instruction to be added into Worklist above, all its users inside
4637
32.3k
  // the loop should also be in Worklist. However, this condition cannot be
4638
32.3k
  // true for phi nodes that form a cyclic dependence. We must process phi
4639
32.3k
  // nodes separately. An induction variable will remain uniform if all users
4640
32.3k
  // of the induction variable and induction variable update remain uniform.
4641
32.3k
  // The code below handles both pointer and non-pointer induction variables.
4642
39.0k
  for (auto &Induction : *Legal->getInductionVars()) {
4643
39.0k
    auto *Ind = Induction.first;
4644
39.0k
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4645
39.0k
4646
39.0k
    // Determine if all users of the induction variable are uniform after
4647
39.0k
    // vectorization.
4648
83.6k
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4649
83.6k
      auto *I = cast<Instruction>(U);
4650
83.6k
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4651
83.6k
             isVectorizedMemAccessUse(I, Ind);
4652
83.6k
    });
4653
39.0k
    if (!UniformInd)
4654
5.77k
      continue;
4655
33.2k
4656
33.2k
    // Determine if all users of the induction variable update instruction are
4657
33.2k
    // uniform after vectorization.
4658
33.2k
    auto UniformIndUpdate =
4659
75.9k
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4660
75.9k
          auto *I = cast<Instruction>(U);
4661
75.9k
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4662
75.9k
                 isVectorizedMemAccessUse(I, IndUpdate);
4663
75.9k
        });
4664
33.2k
    if (!UniformIndUpdate)
4665
9.82k
      continue;
4666
23.4k
4667
23.4k
    // The induction variable and its update instruction will remain uniform.
4668
23.4k
    Worklist.insert(Ind);
4669
23.4k
    Worklist.insert(IndUpdate);
4670
23.4k
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4671
23.4k
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4672
23.4k
                      << "\n");
4673
23.4k
  }
4674
32.3k
4675
32.3k
  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4676
32.3k
}
4677
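The worklist above only admits an operand once every one of its in-loop users is already known to be uniform, so uniformity propagates backwards from the seed pointers and the latch compare. Below is an illustrative, self-contained sketch of that propagation; plain ints stand in for Instruction pointers and all names are hypothetical, not the LLVM API.

#include <cstddef>
#include <map>
#include <set>
#include <vector>

using Inst = int;

std::set<Inst>
collectUniforms(const std::map<Inst, std::vector<Inst>> &Operands,
                const std::map<Inst, std::vector<Inst>> &Users,
                const std::vector<Inst> &Seeds) {
  std::set<Inst> Uniform(Seeds.begin(), Seeds.end());
  std::vector<Inst> Worklist(Seeds.begin(), Seeds.end());
  for (std::size_t Idx = 0; Idx != Worklist.size(); ++Idx) {
    auto OpsIt = Operands.find(Worklist[Idx]);
    if (OpsIt == Operands.end())
      continue;
    for (Inst Op : OpsIt->second) {
      if (Uniform.count(Op))
        continue;
      auto UsersIt = Users.find(Op);
      if (UsersIt == Users.end())
        continue;
      bool AllUsersUniform = true;
      for (Inst U : UsersIt->second)
        AllUsersUniform = AllUsersUniform && Uniform.count(U) != 0;
      if (AllUsersUniform) {
        Uniform.insert(Op);     // operand remains uniform after vectorization
        Worklist.push_back(Op); // expand the worklist in topological order
      }
    }
  }
  return Uniform;
}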
4678
19.9k
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4679
19.9k
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4680
2
    // TODO: It may be useful to do since it's still likely to be dynamically
4681
2
    // uniform if the target can skip.
4682
2
    LLVM_DEBUG(
4683
2
        dbgs() << "LV: Not inserting runtime ptr check for divergent target");
4684
2
4685
2
    ORE->emit(
4686
2
      createMissedAnalysis("CantVersionLoopWithDivergentTarget")
4687
2
      << "runtime pointer checks needed. Not enabled for divergent target");
4688
2
4689
2
    return None;
4690
2
  }
4691
19.9k
4692
19.9k
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4693
19.9k
  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
4694
19.7k
    return computeFeasibleMaxVF(OptForSize, TC);
4695
212
4696
212
  if (Legal->getRuntimePointerChecking()->Need) {
4697
37
    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4698
37
              << "runtime pointer checks needed. Enable vectorization of this "
4699
37
                 "loop with '#pragma clang loop vectorize(enable)' when "
4700
37
                 "compiling with -Os/-Oz");
4701
37
    LLVM_DEBUG(
4702
37
        dbgs()
4703
37
        << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
4704
37
    return None;
4705
37
  }
4706
175
4707
175
  if (!PSE.getUnionPredicate().getPredicates().empty()) {
4708
9
    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4709
9
              << "runtime SCEV checks needed. Enable vectorization of this "
4710
9
                 "loop with '#pragma clang loop vectorize(enable)' when "
4711
9
                 "compiling with -Os/-Oz");
4712
9
    LLVM_DEBUG(
4713
9
        dbgs()
4714
9
        << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
4715
9
    return None;
4716
9
  }
4717
166
4718
166
  // FIXME: Avoid specializing for stride==1 instead of bailing out.
4719
166
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4720
1
    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4721
1
              << "runtime stride == 1 checks needed. Enable vectorization of "
4722
1
                 "this loop with '#pragma clang loop vectorize(enable)' when "
4723
1
                 "compiling with -Os/-Oz");
4724
1
    LLVM_DEBUG(
4725
1
        dbgs()
4726
1
        << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
4727
1
    return None;
4728
1
  }
4729
165
4730
165
  // If we optimize the program for size, avoid creating the tail loop.
4731
165
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4732
165
4733
165
  if (TC == 1) {
4734
5
    ORE->emit(createMissedAnalysis("SingleIterationLoop")
4735
5
              << "loop trip count is one, irrelevant for vectorization");
4736
5
    LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
4737
5
    return None;
4738
5
  }
4739
160
4740
160
  // Record that scalar epilogue is not allowed.
4741
160
  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4742
160
4743
160
  IsScalarEpilogueAllowed = !OptForSize;
4744
160
4745
160
  // We don't create an epilogue when optimizing for size.
4746
160
  // Invalidate interleave groups that require an epilogue if we can't mask
4747
160
  // the interleave-group.
4748
160
  if (!useMaskedInterleavedAccesses(TTI)) 
4749
153
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4750
160
4751
160
  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
4752
160
4753
160
  if (TC > 0 && TC % MaxVF == 0) {
4754
85
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4755
85
    return MaxVF;
4756
85
  }
4757
75
4758
75
  // If we don't know the precise trip count, or if the trip count that we
4759
75
  // found modulo the vectorization factor is not zero, try to fold the tail
4760
75
  // by masking.
4761
75
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4762
75
  if (Legal->canFoldTailByMasking()) {
4763
26
    FoldTailByMasking = true;
4764
26
    return MaxVF;
4765
26
  }
4766
49
4767
49
  if (TC == 0) {
4768
16
    ORE->emit(
4769
16
        createMissedAnalysis("UnknownLoopCountComplexCFG")
4770
16
        << "unable to calculate the loop count due to complex control flow");
4771
16
    return None;
4772
16
  }
4773
33
4774
33
  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
4775
33
            << "cannot optimize for size and vectorize at the same time. "
4776
33
               "Enable vectorization of this loop with '#pragma clang loop "
4777
33
               "vectorize(enable)' when compiling with -Os/-Oz");
4778
33
  return None;
4779
33
}
4780
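A minimal sketch of the tail-handling order above when optimizing for size; the enum and helper are hypothetical, not part of the vectorizer's interface. A known trip count divisible by MaxVF needs no tail, otherwise the tail is folded by masking if legality allows it, otherwise vectorization is abandoned.

enum class TailDecision { NoTail, FoldByMasking, Abort };

TailDecision handleTail(unsigned TC, unsigned MaxVF, bool CanFoldTailByMasking) {
  if (TC > 0 && TC % MaxVF == 0)
    return TailDecision::NoTail;        // every iteration fits a full vector step
  if (CanFoldTailByMasking)
    return TailDecision::FoldByMasking; // mask off the remainder lanes
  return TailDecision::Abort;           // no scalar epilogue is allowed here
}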
4781
unsigned
4782
LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
4783
19.9k
                                                 unsigned ConstTripCount) {
4784
19.9k
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4785
19.9k
  unsigned SmallestType, WidestType;
4786
19.9k
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4787
19.9k
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4788
19.9k
4789
19.9k
  // Get the maximum safe dependence distance in bits computed by LAA.
4790
19.9k
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4791
19.9k
  // the memory accesses that is most restrictive (involved in the smallest
4792
19.9k
  // dependence distance).
4793
19.9k
  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4794
19.9k
4795
19.9k
  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4796
19.9k
4797
19.9k
  unsigned MaxVectorSize = WidestRegister / WidestType;
4798
19.9k
4799
19.9k
  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4800
19.9k
                    << " / " << WidestType << " bits.\n");
4801
19.9k
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4802
19.9k
                    << WidestRegister << " bits.\n");
4803
19.9k
4804
19.9k
  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4805
19.9k
                                 " into one vector!");
4806
19.9k
  if (MaxVectorSize == 0) {
4807
76
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4808
76
    MaxVectorSize = 1;
4809
76
    return MaxVectorSize;
4810
19.8k
  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4811
19.8k
             isPowerOf2_32(ConstTripCount)) {
4812
11
    // We need to clamp the VF to be the ConstTripCount. There is no point in
4813
11
    // choosing a higher viable VF as done in the loop below.
4814
11
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4815
11
                      << ConstTripCount << "\n");
4816
11
    MaxVectorSize = ConstTripCount;
4817
11
    return MaxVectorSize;
4818
11
  }
4819
19.8k
4820
19.8k
  unsigned MaxVF = MaxVectorSize;
4821
19.8k
  if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
4822
19.8k
      (MaximizeBandwidth && !OptForSize)) {
4823
2
    // Collect all viable vectorization factors larger than the default MaxVF
4824
2
    // (i.e. MaxVectorSize).
4825
2
    SmallVector<unsigned, 8> VFs;
4826
2
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4827
3
    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4828
1
      VFs.push_back(VS);
4829
2
4830
2
    // For each VF calculate its register usage.
4831
2
    auto RUs = calculateRegisterUsage(VFs);
4832
2
4833
2
    // Select the largest VF which doesn't require more registers than existing
4834
2
    // ones.
4835
2
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4836
2
    for (int i = RUs.size() - 1; i >= 0; --i) {
4837
1
      if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4838
1
        MaxVF = VFs[i];
4839
1
        break;
4840
1
      }
4841
1
    }
4842
2
    if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4843
2
      if (MaxVF < MinVF) {
4844
0
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4845
0
                          << ") with target's minimum: " << MinVF << '\n');
4846
0
        MaxVF = MinVF;
4847
0
      }
4848
2
    }
4849
2
  }
4850
19.8k
  return MaxVF;
4851
19.8k
}
4852
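A rough standalone model of the clamping chain above; the helper is hypothetical and its inputs stand in for the TTI and LAA queries. The widest register is capped by the maximum safe dependence width, divided by the widest element type, and clamped to a small power-of-two constant trip count when one is known.

#include <algorithm>

unsigned feasibleMaxVF(unsigned WidestRegisterBits, unsigned MaxSafeBits,
                       unsigned WidestTypeBits, unsigned ConstTripCount) {
  unsigned RegBits = std::min(WidestRegisterBits, MaxSafeBits);
  unsigned MaxVectorSize = RegBits / WidestTypeBits;
  if (MaxVectorSize == 0)
    return 1; // the target has no vector registers
  bool IsPow2 = (ConstTripCount & (ConstTripCount - 1)) == 0;
  if (ConstTripCount && ConstTripCount < MaxVectorSize && IsPow2)
    return ConstTripCount; // no point in a VF wider than the trip count
  return MaxVectorSize;
}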
4853
VectorizationFactor
4854
19.1k
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4855
19.1k
  float Cost = expectedCost(1).first;
4856
19.1k
  const float ScalarCost = Cost;
4857
19.1k
  unsigned Width = 1;
4858
19.1k
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4859
19.1k
4860
19.1k
  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4861
19.1k
  if (ForceVectorization && MaxVF > 1) {
4862
17
    // Ignore scalar width, because the user explicitly wants vectorization.
4863
17
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4864
17
    // evaluation.
4865
17
    Cost = std::numeric_limits<float>::max();
4866
17
  }
4867
19.1k
4868
50.9k
  for (unsigned i = 2; i <= MaxVF; i *= 2) {
4869
31.7k
    // Notice that the vector loop needs to be executed fewer times, so
4870
31.7k
    // we need to divide the cost of the vector loops by the width of
4871
31.7k
    // the vector elements.
4872
31.7k
    VectorizationCostTy C = expectedCost(i);
4873
31.7k
    float VectorCost = C.first / (float)i;
4874
31.7k
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4875
31.7k
                      << " costs: " << (int)VectorCost << ".\n");
4876
31.7k
    if (!C.second && !ForceVectorization) {
4877
2.93k
      LLVM_DEBUG(
4878
2.93k
          dbgs() << "LV: Not considering vector loop of width " << i
4879
2.93k
                 << " because it will not generate any vector instructions.\n");
4880
2.93k
      continue;
4881
2.93k
    }
4882
28.8k
    if (VectorCost < Cost) {
4883
23.9k
      Cost = VectorCost;
4884
23.9k
      Width = i;
4885
23.9k
    }
4886
28.8k
  }
4887
19.1k
4888
19.1k
  if (!EnableCondStoresVectorization && NumPredStores) {
4889
0
    ORE->emit(createMissedAnalysis("ConditionalStore")
4890
0
              << "store that is conditionally executed prevents vectorization");
4891
0
    LLVM_DEBUG(
4892
0
        dbgs() << "LV: No vectorization. There are conditional stores.\n");
4893
0
    Width = 1;
4894
0
    Cost = ScalarCost;
4895
0
  }
4896
19.1k
4897
19.1k
  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4898
19.1k
             << "LV: Vectorization seems to be not beneficial, "
4899
19.1k
             << "but was forced by a user.\n");
4900
19.1k
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
4901
19.1k
  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
4902
19.1k
  return Factor;
4903
19.1k
}
4904
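A sketch of the selection loop above with plain numbers standing in for the cost model: each candidate VF's cost is divided by its width so candidates are compared per scalar iteration, and the cheapest one wins, with VF = 1 as the scalar baseline. The names are illustrative only.

#include <cstddef>
#include <vector>

struct VFChoice { unsigned Width; float CostPerIteration; };

VFChoice selectVF(const std::vector<float> &CostAtVF /* index i => VF = 2^i */) {
  VFChoice Best{1, CostAtVF[0]}; // VF = 1: scalar loop cost
  unsigned VF = 2;
  for (std::size_t I = 1; I < CostAtVF.size(); ++I, VF *= 2) {
    float Normalized = CostAtVF[I] / static_cast<float>(VF);
    if (Normalized < Best.CostPerIteration) {
      Best.CostPerIteration = Normalized;
      Best.Width = VF;
    }
  }
  return Best;
}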
4905
std::pair<unsigned, unsigned>
4906
19.9k
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4907
19.9k
  unsigned MinWidth = -1U;
4908
19.9k
  unsigned MaxWidth = 8;
4909
19.9k
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4910
19.9k
4911
19.9k
  // For each block.
4912
22.0k
  for (BasicBlock *BB : TheLoop->blocks()) {
4913
22.0k
    // For each instruction in the loop.
4914
251k
    for (Instruction &I : BB->instructionsWithoutDebug()) {
4915
251k
      Type *T = I.getType();
4916
251k
4917
251k
      // Skip ignored values.
4918
251k
      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
4919
9
        continue;
4920
251k
4921
251k
      // Only examine Loads, Stores and PHINodes.
4922
251k
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4923
183k
        continue;
4924
68.4k
4925
68.4k
      // Examine PHI nodes that are reduction variables. Update the type to
4926
68.4k
      // account for the recurrence type.
4927
68.4k
      if (auto *PN = dyn_cast<PHINode>(&I)) {
4928
27.9k
        if (!Legal->isReductionVariable(PN))
4929
24.4k
          continue;
4930
3.43k
        RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
4931
3.43k
        T = RdxDesc.getRecurrenceType();
4932
3.43k
      }
4933
68.4k
4934
68.4k
      // Examine the stored values.
4935
68.4k
      
      if (auto *ST = dyn_cast<StoreInst>(&I))
4936
20.7k
        T = ST->getValueOperand()->getType();
4937
43.9k
4938
43.9k
      // Ignore loaded pointer types and stored pointer types that are not
4939
43.9k
      // vectorizable.
4940
43.9k
      //
4941
43.9k
      // FIXME: The check here attempts to predict whether a load or store will
4942
43.9k
      //        be vectorized. We only know this for certain after a VF has
4943
43.9k
      //        been selected. Here, we assume that if an access can be
4944
43.9k
      //        vectorized, it will be. We should also look at extending this
4945
43.9k
      //        optimization to non-pointer types.
4946
43.9k
      //
4947
43.9k
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
4948
43.9k
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
4949
258
        continue;
4950
43.7k
4951
43.7k
      MinWidth = std::min(MinWidth,
4952
43.7k
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4953
43.7k
      MaxWidth = std::max(MaxWidth,
4954
43.7k
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4955
43.7k
    }
4956
22.0k
  }
4957
19.9k
4958
19.9k
  return {MinWidth, MaxWidth};
4959
19.9k
}
4960
4961
unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
4962
                                                           unsigned VF,
4963
19.8k
                                                           unsigned LoopCost) {
4964
19.8k
  // -- The interleave heuristics --
4965
19.8k
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
4966
19.8k
  // There are many micro-architectural considerations that we can't predict
4967
19.8k
  // at this level. For example, frontend pressure (on decode or fetch) due to
4968
19.8k
  // code size, or the number and capabilities of the execution ports.
4969
19.8k
  //
4970
19.8k
  // We use the following heuristics to select the interleave count:
4971
19.8k
  // 1. If the code has reductions, then we interleave to break the cross
4972
19.8k
  // iteration dependency.
4973
19.8k
  // 2. If the loop is really small, then we interleave to reduce the loop
4974
19.8k
  // overhead.
4975
19.8k
  // 3. We don't interleave if we think that we will spill registers to memory
4976
19.8k
  // due to the increased register pressure.
4977
19.8k
4978
19.8k
  // When we optimize for size, we don't interleave.
4979
19.8k
  if (OptForSize)
4980
111
    return 1;
4981
19.7k
4982
19.7k
  // We used the distance for the interleave count.
4983
19.7k
  if (Legal->getMaxSafeDepDistBytes() != -1U)
4984
140
    return 1;
4985
19.6k
4986
19.6k
  // Do not interleave loops with a relatively small trip count.
4987
19.6k
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4988
19.6k
  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
4989
496
    return 1;
4990
19.1k
4991
19.1k
  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
4992
19.1k
  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4993
19.1k
                    << " registers\n");
4994
19.1k
4995
19.1k
  if (VF == 1) {
4996
2.91k
    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4997
1
      TargetNumRegisters = ForceTargetNumScalarRegs;
4998
16.1k
  } else {
4999
16.1k
    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5000
0
      TargetNumRegisters = ForceTargetNumVectorRegs;
5001
16.1k
  }
5002
19.1k
5003
19.1k
  RegisterUsage R = calculateRegisterUsage({VF})[0];
5004
19.1k
  // We divide by these constants so assume that we have at least one
5005
19.1k
  // instruction that uses at least one register.
5006
19.1k
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
5007
19.1k
5008
19.1k
  // We calculate the interleave count using the following formula.
5009
19.1k
  // Subtract the number of loop invariants from the number of available
5010
19.1k
  // registers. These registers are used by all of the interleaved instances.
5011
19.1k
  // Next, divide the remaining registers by the number of registers that is
5012
19.1k
  // required by the loop, in order to estimate how many parallel instances
5013
19.1k
  // fit without causing spills. All of this is rounded down if necessary to be
5014
19.1k
  // a power of two. We want power of two interleave count to simplify any
5015
19.1k
  // addressing operations or alignment considerations.
5016
19.1k
  // We also want power of two interleave counts to ensure that the induction
5017
19.1k
  // variable of the vector loop wraps to zero, when tail is folded by masking;
5018
19.1k
  // this currently happens when OptForSize, in which case IC is set to 1 above.
5019
19.1k
  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
5020
19.1k
                              R.MaxLocalUsers);
5021
19.1k
5022
19.1k
  // Don't count the induction variable as interleaved.
5023
19.1k
  if (EnableIndVarRegisterHeur)
5024
19.1k
    IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
5025
19.1k
                       std::max(1U, (R.MaxLocalUsers - 1)));
5026
19.1k
5027
19.1k
  // Clamp the interleave ranges to reasonable counts.
5028
19.1k
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5029
19.1k
5030
19.1k
  // Check if the user has overridden the max.
5031
19.1k
  if (VF == 1) {
5032
2.91k
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5033
1
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5034
16.1k
  } else {
5035
16.1k
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5036
0
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5037
16.1k
  }
5038
19.1k
5039
19.1k
  // If we did not calculate the cost for VF (because the user selected the VF)
5040
19.1k
  // then we calculate the cost of VF here.
5041
19.1k
  if (LoopCost == 0)
5042
583
    LoopCost = expectedCost(VF).first;
5043
19.1k
5044
19.1k
  assert(LoopCost && "Non-zero loop cost expected");
5045
19.1k
5046
19.1k
  // Clamp the calculated IC to be between the 1 and the max interleave count
5047
19.1k
  // that the target allows.
5048
19.1k
  if (IC > MaxInterleaveCount)
5049
18.5k
    IC = MaxInterleaveCount;
5050
605
  else if (IC < 1)
5051
118
    IC = 1;
5052
19.1k
5053
19.1k
  // Interleave if we vectorized this loop and there is a reduction that could
5054
19.1k
  // benefit from interleaving.
5055
19.1k
  if (VF > 1 && !Legal->getReductionVars()->empty()) {
5056
2.17k
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5057
2.17k
    return IC;
5058
2.17k
  }
5059
16.9k
5060
16.9k
  // Note that if we've already vectorized the loop we will have done the
5061
16.9k
  // runtime check and so interleaving won't require further checks.
5062
16.9k
  bool InterleavingRequiresRuntimePointerCheck =
5063
16.9k
      (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5064
16.9k
5065
16.9k
  // We want to interleave small loops in order to reduce the loop overhead and
5066
16.9k
  // potentially expose ILP opportunities.
5067
16.9k
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5068
16.9k
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5069
15.6k
    // We assume that the cost overhead is 1 and we use the cost model
5070
15.6k
    // to estimate the cost of the loop and interleave until the cost of the
5071
15.6k
    // loop overhead is about 5% of the cost of the loop.
5072
15.6k
    unsigned SmallIC =
5073
15.6k
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5074
15.6k
5075
15.6k
    // Interleave until store/load ports (estimated by max interleave count) are
5076
15.6k
    // saturated.
5077
15.6k
    unsigned NumStores = Legal->getNumStores();
5078
15.6k
    unsigned NumLoads = Legal->getNumLoads();
5079
15.6k
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5080
15.6k
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5081
15.6k
5082
15.6k
    // If we have a scalar reduction (vector reductions are already dealt with
5083
15.6k
    // by this point), we can increase the critical path length if the loop
5084
15.6k
    // we're interleaving is inside another loop. Limit, by default to 2, so the
5085
15.6k
    // critical path only gets increased by one reduction operation.
5086
15.6k
    if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5087
150
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5088
150
      SmallIC = std::min(SmallIC, F);
5089
150
      StoresIC = std::min(StoresIC, F);
5090
150
      LoadsIC = std::min(LoadsIC, F);
5091
150
    }
5092
15.6k
5093
15.6k
    if (EnableLoadStoreRuntimeInterleave &&
5094
15.6k
        std::max(StoresIC, LoadsIC) > SmallIC) {
5095
3.07k
      LLVM_DEBUG(
5096
3.07k
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5097
3.07k
      return std::max(StoresIC, LoadsIC);
5098
3.07k
    }
5099
12.5k
5100
12.5k
    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5101
12.5k
    return SmallIC;
5102
12.5k
  }
5103
1.31k
5104
1.31k
  // Interleave if this is a large loop (small loops are already dealt with by
5105
1.31k
  // this point) that could benefit from interleaving.
5106
1.31k
  bool HasReductions = !Legal->getReductionVars()->empty();
5107
1.31k
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
5108
1
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5109
1
    return IC;
5110
1
  }
5111
1.31k
5112
1.31k
  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5113
1.31k
  return 1;
5114
1.31k
}
5115
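A standalone sketch of the register-pressure formula above; the helper names are hypothetical. Registers left after the loop-invariant values are pinned are divided by the loop's per-copy demand, rounded down to a power of two, and clamped between 1 and the target's maximum interleave factor.

#include <algorithm>

static unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P <= X / 2)
    P *= 2;
  return P; // returns 1 for X < 2; the final clamp below covers that case
}

unsigned interleaveCount(unsigned TargetNumRegisters, unsigned LoopInvariantRegs,
                         unsigned MaxLocalUsers, unsigned MaxInterleaveCount) {
  unsigned Spare = TargetNumRegisters > LoopInvariantRegs
                       ? TargetNumRegisters - LoopInvariantRegs
                       : 0;
  unsigned IC = powerOf2Floor(Spare / std::max(1u, MaxLocalUsers));
  return std::min(std::max(IC, 1u), MaxInterleaveCount);
}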
5116
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5117
19.1k
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5118
19.1k
  // This function calculates the register usage by measuring the highest number
5119
19.1k
  // of values that are alive at a single location. Obviously, this is a very
5120
19.1k
  // rough estimation. We scan the loop in topological order and
5121
19.1k
  // assign a number to each instruction. We use RPO to ensure that defs are
5122
19.1k
  // met before their users. We assume that each instruction that has in-loop
5123
19.1k
  // users starts an interval. We record every time that an in-loop value is
5124
19.1k
  // used, so we have a list of the first and last occurrences of each
5125
19.1k
  // instruction. Next, we transpose this data structure into a multi map that
5126
19.1k
  // holds the list of intervals that *end* at a specific location. This multi
5127
19.1k
  // map allows us to perform a linear search. We scan the instructions linearly
5128
19.1k
  // and record each time that a new interval starts, by placing it in a set.
5129
19.1k
  // If we find this value in the multi-map then we remove it from the set.
5130
19.1k
  // The max register usage is the maximum size of the set.
5131
19.1k
  // We also search for instructions that are defined outside the loop, but are
5132
19.1k
  // used inside the loop. We need this number separately from the max-interval
5133
19.1k
  // usage number because when we unroll, loop-invariant values do not take
5134
19.1k
  // more registers.
5135
19.1k
  LoopBlocksDFS DFS(TheLoop);
5136
19.1k
  DFS.perform(LI);
5137
19.1k
5138
19.1k
  RegisterUsage RU;
5139
19.1k
5140
19.1k
  // Each 'key' in the map opens a new interval. The values
5141
19.1k
  // of the map are the index of the 'last seen' usage of the
5142
19.1k
  // instruction that is the key.
5143
19.1k
  using IntervalMap = DenseMap<Instruction *, unsigned>;
5144
19.1k
5145
19.1k
  // Maps instruction to its index.
5146
19.1k
  SmallVector<Instruction *, 64> IdxToInstr;
5147
19.1k
  // Marks the end of each interval.
5148
19.1k
  IntervalMap EndPoint;
5149
19.1k
  // Saves the list of instruction indices that are used in the loop.
5150
19.1k
  SmallPtrSet<Instruction *, 8> Ends;
5151
19.1k
  // Saves the list of values that are used in the loop but are
5152
19.1k
  // defined outside the loop, such as arguments and constants.
5153
19.1k
  SmallPtrSet<Value *, 8> LoopInvariants;
5154
19.1k
5155
20.2k
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5156
227k
    for (Instruction &I : BB->instructionsWithoutDebug()) {
5157
227k
      IdxToInstr.push_back(&I);
5158
227k
5159
227k
      // Save the end location of each USE.
5160
455k
      for (Value *U : I.operands()) {
5161
455k
        auto *Instr = dyn_cast<Instruction>(U);
5162
455k
5163
455k
        // Ignore non-instruction values such as arguments, constants, etc.
5164
455k
        if (!Instr)
5165
162k
          continue;
5166
293k
5167
293k
        // If this instruction is outside the loop then record it and continue.
5168
293k
        if (!TheLoop->contains(Instr)) {
5169
33.3k
          LoopInvariants.insert(Instr);
5170
33.3k
          continue;
5171
33.3k
        }
5172
259k
5173
259k
        // Overwrite previous end points.
5174
259k
        EndPoint[Instr] = IdxToInstr.size();
5175
259k
        Ends.insert(Instr);
5176
259k
      }
5177
227k
    }
5178
20.2k
  }
5179
19.1k
5180
19.1k
  // Saves the list of intervals that end with the index in 'key'.
5181
19.1k
  using InstrList = SmallVector<Instruction *, 2>;
5182
19.1k
  DenseMap<unsigned, InstrList> TransposeEnds;
5183
19.1k
5184
19.1k
  // Transpose the EndPoints to a list of values that end at each index.
5185
19.1k
  for (auto &Interval : EndPoint)
5186
187k
    TransposeEnds[Interval.second].push_back(Interval.first);
5187
19.1k
5188
19.1k
  SmallPtrSet<Instruction *, 8> OpenIntervals;
5189
19.1k
5190
19.1k
  // Get the size of the widest register.
5191
19.1k
  unsigned MaxSafeDepDist = -1U;
5192
19.1k
  if (Legal->getMaxSafeDepDistBytes() != -1U)
5193
0
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5194
19.1k
  unsigned WidestRegister =
5195
19.1k
      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5196
19.1k
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5197
19.1k
5198
19.1k
  SmallVector<RegisterUsage, 8> RUs(VFs.size());
5199
19.1k
  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
5200
19.1k
5201
19.1k
  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5202
19.1k
5203
19.1k
  // A lambda that gets the register usage for the given type and VF.
5204
228k
  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5205
228k
    if (Ty->isTokenTy())
5206
1
      return 0U;
5207
228k
    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5208
228k
    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5209
228k
  };
5210
19.1k
5211
246k
  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5212
227k
    Instruction *I = IdxToInstr[i];
5213
227k
5214
227k
    // Remove all of the instructions that end at this location.
5215
227k
    InstrList &List = TransposeEnds[i];
5216
227k
    for (Instruction *ToRemove : List)
5217
168k
      OpenIntervals.erase(ToRemove);
5218
227k
5219
227k
    // Ignore instructions that are never used within the loop.
5220
227k
    if (Ends.find(I) == Ends.end())
5221
39.1k
      continue;
5222
187k
5223
187k
    // Skip ignored values.
5224
187k
    if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5225
4
      continue;
5226
187k
5227
187k
    // For each VF find the maximum usage of registers.
5228
375k
    
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5229
187k
      if (VFs[j] == 1) {
5230
42.4k
        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
5231
42.4k
        continue;
5232
42.4k
      }
5233
145k
      collectUniformsAndScalars(VFs[j]);
5234
145k
      // Count the number of live intervals.
5235
145k
      unsigned RegUsage = 0;
5236
354k
      for (auto Inst : OpenIntervals) {
5237
354k
        // Skip ignored values for VF > 1.
5238
354k
        if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5239
354k
            isScalarAfterVectorization(Inst, VFs[j]))
5240
150k
          continue;
5241
204k
        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5242
204k
      }
5243
145k
      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5244
145k
    }
5245
187k
5246
187k
    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5247
187k
                      << OpenIntervals.size() << '\n');
5248
187k
5249
187k
    // Add the current instruction to the list of open intervals.
5250
187k
    OpenIntervals.insert(I);
5251
187k
  }
5252
19.1k
5253
38.2k
  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5254
19.1k
    unsigned Invariant = 0;
5255
19.1k
    if (VFs[i] == 1)
5256
2.91k
      Invariant = LoopInvariants.size();
5257
16.1k
    else {
5258
16.1k
      for (auto Inst : LoopInvariants)
5259
24.0k
        Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5260
16.1k
    }
5261
19.1k
5262
19.1k
    LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5263
19.1k
    LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5264
19.1k
    LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5265
19.1k
                      << '\n');
5266
19.1k
5267
19.1k
    RU.LoopInvariantRegs = Invariant;
5268
19.1k
    RU.MaxLocalUsers = MaxUsages[i];
5269
19.1k
    RUs[i] = RU;
5270
19.1k
  }
5271
19.1k
5272
19.1k
  return RUs;
5273
19.1k
}
5274
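A self-contained sketch of the interval counting above, with vector indices standing in for instructions rather than the LLVM data structures. Each definition opens an interval, the interval closes after its last in-loop use, and the register estimate is the largest number of intervals open at once.

#include <algorithm>
#include <cstddef>
#include <set>
#include <vector>

unsigned maxOpenIntervals(const std::vector<int> &LastUse) {
  // LastUse[i] is the index of instruction i's final in-loop user (assumed to
  // be smaller than LastUse.size()), or -1 if the value is never used in-loop.
  std::size_t N = LastUse.size();
  std::vector<std::vector<int>> EndsAt(N); // transpose: intervals ending at idx
  for (std::size_t I = 0; I != N; ++I)
    if (LastUse[I] >= 0)
      EndsAt[static_cast<std::size_t>(LastUse[I])].push_back(static_cast<int>(I));

  std::set<int> Open;
  std::size_t MaxUsage = 0;
  for (std::size_t I = 0; I != N; ++I) {
    for (int Dead : EndsAt[I])
      Open.erase(Dead);                  // close intervals ending here
    if (LastUse[I] < 0)
      continue;                          // never used: opens no interval
    MaxUsage = std::max(MaxUsage, Open.size());
    Open.insert(static_cast<int>(I));    // this definition starts an interval
  }
  return static_cast<unsigned>(MaxUsage);
}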
5275
4.67k
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5276
4.67k
  // TODO: Cost model for emulated masked load/store is completely
5277
4.67k
  // broken. This hack guides the cost model to use an artificially
5278
4.67k
  // high enough value to practically disable vectorization with such
5279
4.67k
  // operations, except where the previously deployed legality hack allowed
5280
4.67k
  // using very low cost values. This is to avoid regressions coming simply
5281
4.67k
  // from moving "masked load/store" check from legality to cost model.
5282
4.67k
  // Masked Load/Gather emulation was previously never allowed.
5283
4.67k
  // Limited number of Masked Store/Scatter emulation was allowed.
5284
4.67k
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5285
4.67k
  return isa<LoadInst>(I) ||
5286
4.67k
         (isa<StoreInst>(I) &&
5287
2.19k
          NumPredStores > NumberOfStoresToPredicate);
5288
4.67k
}
5289
5290
32.4k
void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5291
32.4k
  // If we aren't vectorizing the loop, or if we've already collected the
5292
32.4k
  // instructions to scalarize, there's nothing to do. Collection may already
5293
32.4k
  // have occurred if we have a user-selected VF and are now computing the
5294
32.4k
  // expected cost for interleaving.
5295
32.4k
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5296
36
    return;
5297
32.3k
5298
32.3k
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5299
32.3k
  // not profitable to scalarize any instructions, the presence of VF in the
5300
32.3k
  // map will indicate that we've analyzed it already.
5301
32.3k
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5302
32.3k
5303
32.3k
  // Find all the instructions that are scalar with predication in the loop and
5304
32.3k
  // determine if it would be better to not if-convert the blocks they are in.
5305
32.3k
  // If so, we also record the instructions to scalarize.
5306
35.8k
  for (BasicBlock *BB : TheLoop->blocks()) {
5307
35.8k
    if (!blockNeedsPredication(BB))
5308
33.4k
      continue;
5309
2.33k
    for (Instruction &I : *BB)
5310
12.3k
      if (isScalarWithPredication(&I)) {
5311
2.33k
        ScalarCostsTy ScalarCosts;
5312
2.33k
        // Do not apply discount logic if hacked cost is needed
5313
2.33k
        // for emulated masked memrefs.
5314
2.33k
        if (!useEmulatedMaskMemRefHack(&I) &&
5315
2.33k
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5316
323
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5317
2.33k
        // Remember that BB will remain after vectorization.
5318
2.33k
        PredicatedBBsAfterVectorization.insert(BB);
5319
2.33k
      }
5320
2.33k
  }
5321
32.3k
}
5322
5323
int LoopVectorizationCostModel::computePredInstDiscount(
5324
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5325
355
    unsigned VF) {
5326
355
  assert(!isUniformAfterVectorization(PredInst, VF) &&
5327
355
         "Instruction marked uniform-after-vectorization will be predicated");
5328
355
5329
355
  // Initialize the discount to zero, meaning that the scalar version and the
5330
355
  // vector version cost the same.
5331
355
  int Discount = 0;
5332
355
5333
355
  // Holds instructions to analyze. The instructions we visit are mapped in
5334
355
  // ScalarCosts. Those instructions are the ones that would be scalarized if
5335
355
  // we find that the scalar version costs less.
5336
355
  SmallVector<Instruction *, 8> Worklist;
5337
355
5338
355
  // Returns true if the given instruction can be scalarized.
5339
979
  auto canBeScalarized = [&](Instruction *I) -> bool {
5340
979
    // We only attempt to scalarize instructions forming a single-use chain
5341
979
    // from the original predicated block that would otherwise be vectorized.
5342
979
    // Although not strictly necessary, we give up on instructions we know will
5343
979
    // already be scalar to avoid traversing chains that are unlikely to be
5344
979
    // beneficial.
5345
979
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5346
979
        isScalarAfterVectorization(I, VF))
5347
595
      return false;
5348
384
5349
384
    // If the instruction is scalar with predication, it will be analyzed
5350
384
    // separately. We ignore it within the context of PredInst.
5351
384
    if (isScalarWithPredication(I))
5352
151
      return false;
5353
233
5354
233
    // If any of the instruction's operands are uniform after vectorization,
5355
233
    // the instruction cannot be scalarized. This prevents, for example, a
5356
233
    // masked load from being scalarized.
5357
233
    //
5358
233
    // We assume we will only emit a value for lane zero of an instruction
5359
233
    // marked uniform after vectorization, rather than VF identical values.
5360
233
    // Thus, if we scalarize an instruction that uses a uniform, we would
5361
233
    // create uses of values corresponding to the lanes we aren't emitting code
5362
233
    // for. This behavior can be changed by allowing getScalarValue to clone
5363
233
    // the lane zero values for uniforms rather than asserting.
5364
233
    for (Use &U : I->operands())
5365
430
      if (auto *J = dyn_cast<Instruction>(U.get()))
5366
384
        if (isUniformAfterVectorization(J, VF))
5367
6
          return false;
5368
233
5369
233
    // Otherwise, we can scalarize the instruction.
5370
233
    
    return true;
5371
233
  };
5372
355
5373
355
  // Compute the expected cost discount from scalarizing the entire expression
5374
355
  // feeding the predicated instruction. We currently only consider expressions
5375
355
  // that are single-use instruction chains.
5376
355
  Worklist.push_back(PredInst);
5377
937
  while (!Worklist.empty()) {
5378
582
    Instruction *I = Worklist.pop_back_val();
5379
582
5380
582
    // If we've already analyzed the instruction, there's nothing to do.
5381
582
    if (ScalarCosts.find(I) != ScalarCosts.end())
5382
0
      continue;
5383
582
5384
582
    // Compute the cost of the vector instruction. Note that this cost already
5385
582
    // includes the scalarization overhead of the predicated instruction.
5386
582
    unsigned VectorCost = getInstructionCost(I, VF).first;
5387
582
5388
582
    // Compute the cost of the scalarized instruction. This cost is the cost of
5389
582
    // the instruction as if it wasn't if-converted and instead remained in the
5390
582
    // predicated block. We will scale this cost by block probability after
5391
582
    // computing the scalarization overhead.
5392
582
    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5393
582
5394
582
    // Compute the scalarization overhead of needed insertelement instructions
5395
582
    // and phi nodes.
5396
582
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5397
76
      ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5398
76
                                                 true, false);
5399
76
      ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5400
76
    }
5401
582
5402
582
    // Compute the scalarization overhead of needed extractelement
5403
582
    // instructions. For each of the instruction's operands, if the operand can
5404
582
    // be scalarized, add it to the worklist; otherwise, account for the
5405
582
    // overhead.
5406
582
    for (Use &U : I->operands())
5407
1.13k
      if (auto *J = dyn_cast<Instruction>(U.get())) {
5408
979
        assert(VectorType::isValidElementType(J->getType()) &&
5409
979
               "Instruction has non-scalar type");
5410
979
        if (canBeScalarized(J))
5411
227
          Worklist.push_back(J);
5412
752
        else if (needsExtract(J, VF))
5413
384
          ScalarCost += TTI.getScalarizationOverhead(
5414
384
                              ToVectorTy(J->getType(),VF), false, true);
5415
979
      }
5416
582
5417
582
    // Scale the total scalar cost by block probability.
5418
582
    ScalarCost /= getReciprocalPredBlockProb();
5419
582
5420
582
    // Compute the discount. A non-negative discount means the vector version
5421
582
    // of the instruction costs more, and scalarizing would be beneficial.
5422
582
    Discount += VectorCost - ScalarCost;
5423
582
    ScalarCosts[I] = ScalarCost;
5424
582
  }
5425
355
5426
355
  return Discount;
5427
355
}
5428
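A minimal sketch of the discount accumulation above, with placeholder costs rather than the TTI queries. The scalar cost of each chain member is VF scalar copies, scaled down by the reciprocal probability of the predicated block executing; a non-negative total means scalarizing the chain is at least as cheap as keeping it vectorized.

#include <vector>

struct ChainCost { unsigned VectorCost; unsigned ScalarCostPerLane; };

int predInstDiscount(const std::vector<ChainCost> &Chain, unsigned VF,
                     unsigned ReciprocalPredBlockProb /* assumed 2 */) {
  int Discount = 0;
  for (const ChainCost &C : Chain) {
    unsigned ScalarCost = (VF * C.ScalarCostPerLane) / ReciprocalPredBlockProb;
    Discount += static_cast<int>(C.VectorCost) - static_cast<int>(ScalarCost);
  }
  return Discount;
}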
5429
LoopVectorizationCostModel::VectorizationCostTy
5430
51.5k
LoopVectorizationCostModel::expectedCost(unsigned VF) {
5431
51.5k
  VectorizationCostTy Cost;
5432
51.5k
5433
51.5k
  // For each block.
5434
56.8k
  for (BasicBlock *BB : TheLoop->blocks()) {
5435
56.8k
    VectorizationCostTy BlockCost;
5436
56.8k
5437
56.8k
    // For each instruction in the old loop.
5438
631k
    for (Instruction &I : BB->instructionsWithoutDebug()) {
5439
631k
      // Skip ignored values.
5440
631k
      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5441
631k
          (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5442
69
        continue;
5443
631k
5444
631k
      VectorizationCostTy C = getInstructionCost(&I, VF);
5445
631k
5446
631k
      // Check if we should override the cost.
5447
631k
      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5448
10
        C.first = ForceTargetInstructionCost;
5449
631k
5450
631k
      BlockCost.first += C.first;
5451
631k
      BlockCost.second |= C.second;
5452
631k
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5453
631k
                        << " for VF " << VF << " For instruction: " << I
5454
631k
                        << '\n');
5455
631k
    }
5456
56.8k
5457
56.8k
    // If we are vectorizing a predicated block, it will have been
5458
56.8k
    // if-converted. This means that the block's instructions (aside from
5459
56.8k
    // stores and instructions that may divide by zero) will now be
5460
56.8k
    // unconditionally executed. For the scalar case, we may not always execute
5461
56.8k
    // the predicated block. Thus, scale the block's cost by the probability of
5462
56.8k
    // executing it.
5463
56.8k
    if (VF == 1 && blockNeedsPredication(BB))
5464
1.28k
      BlockCost.first /= getReciprocalPredBlockProb();
5465
56.8k
5466
56.8k
    Cost.first += BlockCost.first;
5467
56.8k
    Cost.second |= BlockCost.second;
5468
56.8k
  }
5469
51.5k
5470
51.5k
  return Cost;
5471
51.5k
}
5472
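A small sketch of the scaling above: in the scalar (VF == 1) cost, an if-converted block only executes part of the time, so its cost is divided by the reciprocal block probability (assumed here to be 2, i.e. roughly a 50/50 branch). The helper name is illustrative, not the cost model's interface.

unsigned scaledBlockCost(unsigned BlockCost, bool BlockNeedsPredication,
                         unsigned VF, unsigned ReciprocalPredBlockProb = 2) {
  if (VF == 1 && BlockNeedsPredication)
    return BlockCost / ReciprocalPredBlockProb;
  return BlockCost; // vectorized (if-converted) blocks execute unconditionally
}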
5473
/// Gets Address Access SCEV after verifying that the access pattern
5474
/// is loop invariant except the induction variable dependence.
5475
///
5476
/// This SCEV can be sent to the Target in order to estimate the address
5477
/// calculation cost.
5478
static const SCEV *getAddressAccessSCEV(
5479
              Value *Ptr,
5480
              LoopVectorizationLegality *Legal,
5481
              PredicatedScalarEvolution &PSE,
5482
16.5k
              const Loop *TheLoop) {
5483
16.5k
5484
16.5k
  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5485
16.5k
  if (!Gep)
5486
1.27k
    return nullptr;
5487
15.2k
5488
15.2k
  // We are looking for a gep with all loop invariant indices except for one
5489
15.2k
  // which should be an induction variable.
5490
15.2k
  auto SE = PSE.getSE();
5491
15.2k
  unsigned NumOperands = Gep->getNumOperands();
5492
42.6k
  for (unsigned i = 1; i < NumOperands; 
++i27.4k
) {
5493
33.7k
    Value *Opd = Gep->getOperand(i);
5494
33.7k
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5495
33.7k
        !Legal->isInductionVariable(Opd))
5496
6.33k
      return nullptr;
5497
33.7k
  }
5498
15.2k
5499
15.2k
  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5500
15.2k
  
  return PSE.getSCEV(Ptr);
5501
15.2k
}
5502
5503
7.42k
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5504
7.42k
  return Legal->hasStride(I->getOperand(0)) ||
5505
7.42k
         Legal->hasStride(I->getOperand(1));
5506
7.42k
}
5507
5508
unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5509
16.5k
                                                                 unsigned VF) {
5510
16.5k
  assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5511
16.5k
  Type *ValTy = getMemInstValueType(I);
5512
16.5k
  auto SE = PSE.getSE();
5513
16.5k
5514
16.5k
  unsigned Alignment = getLoadStoreAlignment(I);
5515
16.5k
  unsigned AS = getLoadStoreAddressSpace(I);
5516
16.5k
  Value *Ptr = getLoadStorePointerOperand(I);
5517
16.5k
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5518
16.5k
5519
16.5k
  // Figure out whether the access is strided and get the stride value
5520
16.5k
  // if it's known in compile time
5521
16.5k
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5522
16.5k
5523
16.5k
  // Get the cost of the scalar memory instruction and address computation.
5524
16.5k
  unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5525
16.5k
5526
16.5k
  // Don't pass *I here, since it is scalar but will actually be part of a
5527
16.5k
  // vectorized loop where the user of it is a vectorized instruction.
5528
16.5k
  Cost += VF *
5529
16.5k
          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5530
16.5k
                              AS);
5531
16.5k
5532
16.5k
  // Get the overhead of the extractelement and insertelement instructions
5533
16.5k
  // we might create due to scalarization.
5534
16.5k
  Cost += getScalarizationOverhead(I, VF);
5535
16.5k
5536
16.5k
  // If we have a predicated store, it may not be executed for each vector
5537
16.5k
  // lane. Scale the cost by the probability of executing the predicated
5538
16.5k
  // block.
5539
16.5k
  if (isPredicatedInst(I)) {
5540
2.34k
    Cost /= getReciprocalPredBlockProb();
5541
2.34k
5542
2.34k
    if (useEmulatedMaskMemRefHack(I))
5543
1.84k
      // Artificially setting to a high enough value to practically disable
5544
1.84k
      // vectorization with such operations.
5545
1.84k
      Cost = 3000000;
5546
2.34k
  }
5547
16.5k
5548
16.5k
  return Cost;
5549
16.5k
}
5550
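A rough model of the sum above with hypothetical per-lane inputs instead of the TTI queries: VF copies of the address computation and of the scalar memory operation, plus insert/extract overhead, all divided by the reciprocal block probability when the access is predicated.

unsigned scalarizedMemCost(unsigned VF, unsigned AddrCost, unsigned MemOpCost,
                           unsigned ScalarizationOverhead, bool Predicated,
                           unsigned ReciprocalPredBlockProb = 2) {
  unsigned Cost = VF * AddrCost + VF * MemOpCost + ScalarizationOverhead;
  if (Predicated)
    Cost /= ReciprocalPredBlockProb; // a predicated lane may not execute
  return Cost;
}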
5551
unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5552
43.4k
                                                             unsigned VF) {
5553
43.4k
  Type *ValTy = getMemInstValueType(I);
5554
43.4k
  Type *VectorTy = ToVectorTy(ValTy, VF);
5555
43.4k
  unsigned Alignment = getLoadStoreAlignment(I);
5556
43.4k
  Value *Ptr = getLoadStorePointerOperand(I);
5557
43.4k
  unsigned AS = getLoadStoreAddressSpace(I);
5558
43.4k
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5559
43.4k
5560
43.4k
  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5561
43.4k
         "Stride should be 1 or -1 for consecutive memory access");
5562
43.4k
  unsigned Cost = 0;
5563
43.4k
  if (Legal->isMaskRequired(I))
5564
185
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5565
43.2k
  else
5566
43.2k
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5567
43.4k
5568
43.4k
  bool Reverse = ConsecutiveStride < 0;
5569
43.4k
  if (Reverse)
5570
2.98k
    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5571
43.4k
  return Cost;
5572
43.4k
}
5573
5574
unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5575
636
                                                         unsigned VF) {
5576
636
  Type *ValTy = getMemInstValueType(I);
5577
636
  Type *VectorTy = ToVectorTy(ValTy, VF);
5578
636
  unsigned Alignment = getLoadStoreAlignment(I);
5579
636
  unsigned AS = getLoadStoreAddressSpace(I);
5580
636
  if (isa<LoadInst>(I)) {
5581
621
    return TTI.getAddressComputationCost(ValTy) +
5582
621
           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5583
621
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5584
621
  }
5585
15
  StoreInst *SI = cast<StoreInst>(I);
5586
15
5587
15
  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5588
15
  return TTI.getAddressComputationCost(ValTy) +
5589
15
         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5590
15
         (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5591
7
                                               Instruction::ExtractElement,
5592
7
                                               VectorTy, VF - 1));
5593
15
}
5594
5595
unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5596
132
                                                          unsigned VF) {
5597
132
  Type *ValTy = getMemInstValueType(I);
5598
132
  Type *VectorTy = ToVectorTy(ValTy, VF);
5599
132
  unsigned Alignment = getLoadStoreAlignment(I);
5600
132
  Value *Ptr = getLoadStorePointerOperand(I);
5601
132
5602
132
  return TTI.getAddressComputationCost(VectorTy) +
5603
132
         TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5604
132
                                    Legal->isMaskRequired(I), Alignment);
5605
132
}
5606
5607
unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5608
2.26k
                                                            unsigned VF) {
5609
2.26k
  Type *ValTy = getMemInstValueType(I);
5610
2.26k
  Type *VectorTy = ToVectorTy(ValTy, VF);
5611
2.26k
  unsigned AS = getLoadStoreAddressSpace(I);
5612
2.26k
5613
2.26k
  auto Group = getInterleavedAccessGroup(I);
5614
2.26k
  assert(Group && "Fail to get an interleaved access group.");
5615
2.26k
5616
2.26k
  unsigned InterleaveFactor = Group->getFactor();
5617
2.26k
  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5618
2.26k
5619
2.26k
  // Holds the indices of existing members in an interleaved load group.
5620
2.26k
  // An interleaved store group doesn't need this as it doesn't allow gaps.
5621
2.26k
  SmallVector<unsigned, 4> Indices;
5622
2.26k
  if (isa<LoadInst>(I)) {
5623
5.49k
    for (unsigned i = 0; i < InterleaveFactor; i++)
5624
4.17k
      if (Group->getMember(i))
5625
3.29k
        Indices.push_back(i);
5626
1.32k
  }
5627
2.26k
5628
2.26k
  // Calculate the cost of the whole interleaved group.
5629
2.26k
  bool UseMaskForGaps = 
5630
2.26k
      Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
5631
2.26k
  unsigned Cost = TTI.getInterleavedMemoryOpCost(
5632
2.26k
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5633
2.26k
      Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5634
2.26k
5635
2.26k
  if (Group->isReverse()) {
5636
298
    // TODO: Add support for reversed masked interleaved access.
5637
298
    assert(!Legal->isMaskRequired(I) &&
5638
298
           "Reverse masked interleaved access not supported.");
5639
298
    Cost += Group->getNumMembers() *
5640
298
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5641
298
  }
5642
2.26k
  return Cost;
5643
2.26k
}
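
The interleave-group costing above charges one wide access covering VF * Factor elements and, for reversed groups, one reverse shuffle per present member. A standalone sketch of that shape, with illustrative parameter names standing in for the TTI queries:

#include <cstdint>
#include <vector>

uint64_t interleaveGroupCost(uint64_t WideInterleavedAccessCost,
                             const std::vector<unsigned> &PresentMemberIndices,
                             bool Reversed, uint64_t ReverseShuffleCost) {
  // The wide access already accounts for all VF * Factor elements, including
  // gaps for load groups (hence only present member indices are tracked).
  uint64_t Cost = WideInterleavedAccessCost;
  if (Reversed)
    Cost += PresentMemberIndices.size() * ReverseShuffleCost;
  return Cost;
}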
5644
5645
unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5646
103k
                                                              unsigned VF) {
5647
103k
  // Calculate scalar cost only. Vectorization cost should be ready at this
5648
103k
  // moment.
5649
103k
  if (VF == 1) {
5650
39.4k
    Type *ValTy = getMemInstValueType(I);
5651
39.4k
    unsigned Alignment = getLoadStoreAlignment(I);
5652
39.4k
    unsigned AS = getLoadStoreAddressSpace(I);
5653
39.4k
5654
39.4k
    return TTI.getAddressComputationCost(ValTy) +
5655
39.4k
           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5656
39.4k
  }
5657
64.1k
  return getWideningCost(I, VF);
5658
64.1k
}
5659
5660
LoopVectorizationCostModel::VectorizationCostTy
5661
632k
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5662
632k
  // If we know that this instruction will remain uniform, check the cost of
5663
632k
  // the scalar version.
5664
632k
  if (isUniformAfterVectorization(I, VF))
5665
373k
    VF = 1;
5666
632k
5667
632k
  if (VF > 1 && isProfitableToScalarize(I, VF))
5668
447
    return VectorizationCostTy(InstsToScalarize[VF][I], false);
5669
632k
5670
632k
  // Forced scalars do not have any scalarization overhead.
5671
632k
  auto ForcedScalar = ForcedScalars.find(VF);
5672
632k
  if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5673
123
    auto InstSet = ForcedScalar->second;
5674
123
    if (InstSet.find(I) != InstSet.end())
5675
9
      return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5676
632k
  }
5677
632k
5678
632k
  Type *VectorTy;
5679
632k
  unsigned C = getInstructionCost(I, VF, VectorTy);
5680
632k
5681
632k
  bool TypeNotScalarized =
5682
632k
      VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5683
632k
  return VectorizationCostTy(C, TypeNotScalarized);
5684
632k
}
5685
5686
unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5687
18.2k
                                                              unsigned VF) {
5688
18.2k
5689
18.2k
  if (VF == 1)
5690
3
    return 0;
5691
18.2k
5692
18.2k
  unsigned Cost = 0;
5693
18.2k
  Type *RetTy = ToVectorTy(I->getType(), VF);
5694
18.2k
  if (!RetTy->isVoidTy() &&
5695
18.2k
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5696
11.7k
    Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5697
18.2k
5698
18.2k
  // Some targets keep addresses scalar.
5699
18.2k
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5700
12
    return Cost;
5701
18.2k
5702
18.2k
  // Some targets support efficient element stores.
5703
18.2k
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5704
2
    return Cost;
5705
18.1k
5706
18.1k
  // Collect operands to consider.
5707
18.1k
  CallInst *CI = dyn_cast<CallInst>(I);
5708
18.1k
  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5709
18.1k
5710
18.1k
  // Skip operands that do not require extraction/scalarization and do not incur
5711
18.1k
  // any overhead.
5712
18.1k
  return Cost + TTI.getOperandsScalarizationOverhead(
5713
18.1k
                    filterExtractingOperands(Ops, VF), VF);
5714
18.1k
}
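
The scalarization-overhead accounting above charges insertelements to rebuild a scalarized result and extractelements to unpack vector operands, with target hooks skipping cases the target handles efficiently. A standalone sketch of that accounting, assuming per-element insert/extract costs as plain parameters:

#include <cstdint>

uint64_t scalarizationOverheadSketch(uint64_t VF, bool HasNonVoidResult,
                                     unsigned NumVectorOperands,
                                     uint64_t InsertElementCost,
                                     uint64_t ExtractElementCost) {
  if (VF == 1)
    return 0; // nothing to pack or unpack in a scalar loop
  uint64_t Cost = 0;
  if (HasNonVoidResult)
    Cost += VF * InsertElementCost;                     // re-pack the result vector
  Cost += uint64_t(NumVectorOperands) * VF * ExtractElementCost; // unpack operands
  return Cost;
}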
5715
5716
32.3k
void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5717
32.3k
  if (VF == 1)
5718
0
    return;
5719
32.3k
  NumPredStores = 0;
5720
35.8k
  for (BasicBlock *BB : TheLoop->blocks()) {
5721
35.8k
    // For each instruction in the old loop.
5722
389k
    for (Instruction &I : *BB) {
5723
389k
      Value *Ptr =  getLoadStorePointerOperand(&I);
5724
389k
      if (!Ptr)
5725
324k
        continue;
5726
64.2k
5727
64.2k
      // TODO: We should generate better code and update the cost model for
5728
64.2k
      // predicated uniform stores. Today they are treated as any other
5729
64.2k
      // predicated store (see added test cases in
5730
64.2k
      // invariant-store-vectorization.ll).
5731
64.2k
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5732
1.03k
        NumPredStores++;
5733
64.2k
5734
64.2k
      if (Legal->isUniform(Ptr) &&
5735
64.2k
          // Conditional loads and stores should be scalarized and predicated.
5736
64.2k
          // isScalarWithPredication cannot be used here since masked
5737
64.2k
          // gather/scatters are not considered scalar with predication.
5738
64.2k
          !Legal->blockNeedsPredication(I.getParent())) {
5739
636
        // TODO: Avoid replicating loads and stores instead of
5740
636
        // relying on instcombine to remove them.
5741
636
        // Load: Scalar load + broadcast
5742
636
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5743
636
        unsigned Cost = getUniformMemOpCost(&I, VF);
5744
636
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
5745
636
        continue;
5746
636
      }
5747
63.6k
5748
63.6k
      // We assume that widening is the best solution when possible.
5749
63.6k
      if (memoryInstructionCanBeWidened(&I, VF)) {
5750
43.4k
        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5751
43.4k
        int ConsecutiveStride =
5752
43.4k
               Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5753
43.4k
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5754
43.4k
               "Expected consecutive stride.");
5755
43.4k
        InstWidening Decision =
5756
43.4k
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5757
43.4k
        setWideningDecision(&I, VF, Decision, Cost);
5758
43.4k
        continue;
5759
43.4k
      }
5760
20.1k
5761
20.1k
      // Choose between Interleaving, Gather/Scatter or Scalarization.
5762
20.1k
      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5763
20.1k
      unsigned NumAccesses = 1;
5764
20.1k
      if (isAccessInterleaved(&I)) {
5765
5.90k
        auto Group = getInterleavedAccessGroup(&I);
5766
5.90k
        assert(Group && "Fail to get an interleaved access group.");
5767
5.90k
5768
5.90k
        // Make one decision for the whole group.
5769
5.90k
        if (getWideningDecision(&I, VF) != CM_Unknown)
5770
3.64k
          continue;
5771
2.26k
5772
2.26k
        NumAccesses = Group->getNumMembers();
5773
2.26k
        if (interleavedAccessCanBeWidened(&I, VF))
5774
2.26k
          InterleaveCost = getInterleaveGroupCost(&I, VF);
5775
2.26k
      }
5776
20.1k
5777
20.1k
      unsigned GatherScatterCost =
5778
16.5k
          isLegalGatherOrScatter(&I)
5779
16.5k
              ? getGatherScatterCost(&I, VF) * NumAccesses
5780
16.5k
              : std::numeric_limits<unsigned>::max();
5781
16.5k
5782
16.5k
      unsigned ScalarizationCost =
5783
16.5k
          getMemInstScalarizationCost(&I, VF) * NumAccesses;
5784
16.5k
5785
16.5k
      // Choose better solution for the current VF,
5786
16.5k
      // write down this decision and use it during vectorization.
5787
16.5k
      unsigned Cost;
5788
16.5k
      InstWidening Decision;
5789
16.5k
      if (InterleaveCost <= GatherScatterCost &&
5790
16.5k
          InterleaveCost < ScalarizationCost) {
5791
2.20k
        Decision = CM_Interleave;
5792
2.20k
        Cost = InterleaveCost;
5793
14.3k
      } else if (GatherScatterCost < ScalarizationCost) {
5794
105
        Decision = CM_GatherScatter;
5795
105
        Cost = GatherScatterCost;
5796
14.2k
      } else {
5797
14.2k
        Decision = CM_Scalarize;
5798
14.2k
        Cost = ScalarizationCost;
5799
14.2k
      }
5800
16.5k
      // If the instruction belongs to an interleave group, the whole group
5801
16.5k
      // receives the same decision. The whole group receives the cost, but
5802
16.5k
      // the cost will actually be assigned to one instruction.
5803
16.5k
      if (auto Group = getInterleavedAccessGroup(&I))
5804
2.26k
        setWideningDecision(Group, VF, Decision, Cost);
5805
14.2k
      else
5806
14.2k
        setWideningDecision(&I, VF, Decision, Cost);
5807
16.5k
    }
5808
35.8k
  }
5809
32.3k
5810
32.3k
  // Make sure that any load of address and any other address computation
5811
32.3k
  // remains scalar unless there is gather/scatter support. This avoids
5812
32.3k
  // inevitable extracts into address registers, and also has the benefit of
5813
32.3k
  // activating LSR more, since that pass can't optimize vectorized
5814
32.3k
  // addresses.
5815
32.3k
  if (TTI.prefersVectorizedAddressing())
5816
32.3k
    return;
5817
17
5818
17
  // Start with all scalar pointer uses.
5819
17
  SmallPtrSet<Instruction *, 8> AddrDefs;
5820
17
  for (BasicBlock *BB : TheLoop->blocks())
5821
227
    for (Instruction &I : *BB) {
5822
227
      Instruction *PtrDef =
5823
227
        dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5824
227
      if (PtrDef && TheLoop->contains(PtrDef) &&
5825
227
          getWideningDecision(&I, VF) != CM_GatherScatter)
5826
66
        AddrDefs.insert(PtrDef);
5827
227
    }
5828
17
5829
17
  // Add all instructions used to generate the addresses.
5830
17
  SmallVector<Instruction *, 4> Worklist;
5831
17
  for (auto *I : AddrDefs)
5832
55
    Worklist.push_back(I);
5833
78
  while (!Worklist.empty()) {
5834
61
    Instruction *I = Worklist.pop_back_val();
5835
61
    for (auto &Op : I->operands())
5836
121
      if (auto *InstOp = dyn_cast<Instruction>(Op))
5837
61
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5838
61
            AddrDefs.insert(InstOp).second)
5839
6
          Worklist.push_back(InstOp);
5840
61
  }
5841
17
5842
61
  for (auto *I : AddrDefs) {
5843
61
    if (isa<LoadInst>(I)) {
5844
1
      // Setting the desired widening decision should ideally be handled
5845
1
      // by cost functions, but since this involves the task of finding out
5846
1
      // if the loaded register is involved in an address computation, it is
5847
1
      // instead changed here when we know this is the case.
5848
1
      InstWidening Decision = getWideningDecision(I, VF);
5849
1
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5850
1
        // Scalarize a widened load of address.
5851
1
        setWideningDecision(I, VF, CM_Scalarize,
5852
1
                            (VF * getMemoryInstructionCost(I, 1)));
5853
0
      else if (auto Group = getInterleavedAccessGroup(I)) {
5854
0
        // Scalarize an interleave group of address loads.
5855
0
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
5856
0
          if (Instruction *Member = Group->getMember(I))
5857
0
            setWideningDecision(Member, VF, CM_Scalarize,
5858
0
                                (VF * getMemoryInstructionCost(Member, 1)));
5859
0
        }
5860
0
      }
5861
1
    } else
5862
60
      // Make sure I gets scalarized and a cost estimate without
5863
60
      // scalarization overhead.
5864
60
      ForcedScalars[VF].insert(I);
5865
61
  }
5866
17
}
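
The three-way comparison at the heart of the function above picks the cheapest of interleaving, gather/scatter, and scalarization, with interleaving winning ties against gather/scatter. A standalone sketch of just that selection logic (names are illustrative, not the pass's types):

#include <cstdint>

enum class WideningChoice { Interleave, GatherScatter, Scalarize };

WideningChoice chooseWidening(uint64_t InterleaveCost, uint64_t GatherScatterCost,
                              uint64_t ScalarizationCost) {
  // Same comparison order as the listing: interleave must also be strictly
  // cheaper than scalarization; gather/scatter must be strictly cheaper too.
  if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost)
    return WideningChoice::Interleave;
  if (GatherScatterCost < ScalarizationCost)
    return WideningChoice::GatherScatter;
  return WideningChoice::Scalarize;
}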
5867
5868
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
5869
                                                        unsigned VF,
5870
632k
                                                        Type *&VectorTy) {
5871
632k
  Type *RetTy = I->getType();
5872
632k
  if (canTruncateToMinimalBitwidth(I, VF))
5873
618
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5874
632k
  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
5875
632k
  auto SE = PSE.getSE();
5876
632k
5877
632k
  // TODO: We need to estimate the cost of intrinsic calls.
5878
632k
  switch (I->getOpcode()) {
5879
632k
  case Instruction::GetElementPtr:
5880
98.5k
    // We mark this instruction as zero-cost because the cost of GEPs in
5881
98.5k
    // vectorized code depends on whether the corresponding memory instruction
5882
98.5k
    // is scalarized or not. Therefore, we handle GEPs with the memory
5883
98.5k
    // instruction cost.
5884
98.5k
    return 0;
5885
632k
  case Instruction::Br: {
5886
56.8k
    // In cases of scalarized and predicated instructions, there will be VF
5887
56.8k
    // predicated blocks in the vectorized loop. Each branch around these
5888
56.8k
    // blocks requires also an extract of its vector compare i1 element.
5889
56.8k
    bool ScalarPredicatedBB = false;
5890
56.8k
    BranchInst *BI = cast<BranchInst>(I);
5891
56.8k
    if (VF > 1 && BI->isConditional() &&
5892
56.8k
        (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
5893
33.9k
             PredicatedBBsAfterVectorization.end() ||
5894
33.9k
         PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
5895
33.4k
             PredicatedBBsAfterVectorization.end()))
5896
926
      ScalarPredicatedBB = true;
5897
56.8k
5898
56.8k
    if (ScalarPredicatedBB) {
5899
926
      // Return cost for branches around scalarized and predicated blocks.
5900
926
      Type *Vec_i1Ty =
5901
926
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
5902
926
      return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
5903
926
              (TTI.getCFInstrCost(Instruction::Br) * VF));
5904
55.9k
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
5905
53.4k
      // The back-edge branch will remain, as will all scalar branches.
5906
53.4k
      return TTI.getCFInstrCost(Instruction::Br);
5907
2.51k
    else
5908
2.51k
      // This branch will be eliminated by if-conversion.
5909
2.51k
      return 0;
5910
0
    // Note: We currently assume zero cost for an unconditional branch inside
5911
0
    // a predicated block since it will become a fall-through, although we
5912
0
    // may decide in the future to call TTI for all branches.
5913
0
  }
5914
71.8k
  case Instruction::PHI: {
5915
71.8k
    auto *Phi = cast<PHINode>(I);
5916
71.8k
5917
71.8k
    // First-order recurrences are replaced by vector shuffles inside the loop.
5918
71.8k
    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
5919
71.8k
    if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
5920
125
      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
5921
125
                                VectorTy, VF - 1, VectorType::get(RetTy, 1));
5922
71.7k
5923
71.7k
    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
5924
71.7k
    // converted into select instructions. We require N - 1 selects per phi
5925
71.7k
    // node, where N is the number of incoming values.
5926
71.7k
    if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
5927
867
      return (Phi->getNumIncomingValues() - 1) *
5928
867
             TTI.getCmpSelInstrCost(
5929
867
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
5930
867
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
5931
70.9k
5932
70.9k
    return TTI.getCFInstrCost(Instruction::PHI);
5933
70.9k
  }
5934
70.9k
  case Instruction::UDiv:
5935
1.14k
  case Instruction::SDiv:
5936
1.14k
  case Instruction::URem:
5937
1.14k
  case Instruction::SRem:
5938
1.14k
    // If we have a predicated instruction, it may not be executed for each
5939
1.14k
    // vector lane. Get the scalarization cost and scale this amount by the
5940
1.14k
    // probability of executing the predicated block. If the instruction is not
5941
1.14k
    // predicated, we fall through to the next case.
5942
1.14k
    if (VF > 1 && isScalarWithPredication(I)) {
5943
86
      unsigned Cost = 0;
5944
86
5945
86
      // These instructions have a non-void type, so account for the phi nodes
5946
86
      // that we will create. This cost is likely to be zero. The phi node
5947
86
      // cost, if any, should be scaled by the block probability because it
5948
86
      // models a copy at the end of each predicated block.
5949
86
      Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
5950
86
5951
86
      // The cost of the non-predicated instruction.
5952
86
      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
5953
86
5954
86
      // The cost of insertelement and extractelement instructions needed for
5955
86
      // scalarization.
5956
86
      Cost += getScalarizationOverhead(I, VF);
5957
86
5958
86
      // Scale the cost by the probability of executing the predicated blocks.
5959
86
      // This assumes the predicated block for each vector lane is equally
5960
86
      // likely.
5961
86
      return Cost / getReciprocalPredBlockProb();
5962
86
    }
5963
1.05k
    LLVM_FALLTHROUGH;
5964
166k
  case Instruction::Add:
5965
166k
  case Instruction::FAdd:
5966
166k
  case Instruction::Sub:
5967
166k
  case Instruction::FSub:
5968
166k
  case Instruction::Mul:
5969
166k
  case Instruction::FMul:
5970
166k
  case Instruction::FDiv:
5971
166k
  case Instruction::FRem:
5972
166k
  case Instruction::Shl:
5973
166k
  case Instruction::LShr:
5974
166k
  case Instruction::AShr:
5975
166k
  case Instruction::And:
5976
166k
  case Instruction::Or:
5977
166k
  case Instruction::Xor: {
5978
166k
    // Since we will replace the stride by 1 the multiplication should go away.
5979
166k
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
5980
64
      return 0;
5981
166k
    // Certain instructions can be cheaper to vectorize if they have a constant
5982
166k
    // second vector operand. One example of this are shifts on x86.
5983
166k
    Value *Op2 = I->getOperand(1);
5984
166k
    TargetTransformInfo::OperandValueProperties Op2VP;
5985
166k
    TargetTransformInfo::OperandValueKind Op2VK =
5986
166k
        TTI.getOperandInfo(Op2, Op2VP);
5987
166k
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
5988
4.71k
      Op2VK = TargetTransformInfo::OK_UniformValue;
5989
166k
5990
166k
    SmallVector<const Value *, 4> Operands(I->operand_values());
5991
166k
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
5992
166k
    return N * TTI.getArithmeticInstrCost(
5993
166k
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
5994
166k
                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
5995
166k
  }
5996
166k
  case Instruction::FNeg: {
5997
1
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
5998
1
    return N * TTI.getArithmeticInstrCost(
5999
1
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6000
1
                   TargetTransformInfo::OK_AnyValue,
6001
1
                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6002
1
                   I->getOperand(0));
6003
166k
  }
6004
166k
  case Instruction::Select: {
6005
3.97k
    SelectInst *SI = cast<SelectInst>(I);
6006
3.97k
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6007
3.97k
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6008
3.97k
    Type *CondTy = SI->getCondition()->getType();
6009
3.97k
    if (!ScalarCond)
6010
3.97k
      CondTy = VectorType::get(CondTy, VF);
6011
3.97k
6012
3.97k
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6013
166k
  }
6014
166k
  case Instruction::ICmp:
6015
58.9k
  case Instruction::FCmp: {
6016
58.9k
    Type *ValTy = I->getOperand(0)->getType();
6017
58.9k
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6018
58.9k
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6019
0
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6020
58.9k
    VectorTy = ToVectorTy(ValTy, VF);
6021
58.9k
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6022
58.9k
  }
6023
103k
  case Instruction::Store:
6024
103k
  case Instruction::Load: {
6025
103k
    unsigned Width = VF;
6026
103k
    if (Width > 1) {
6027
64.1k
      InstWidening Decision = getWideningDecision(I, Width);
6028
64.1k
      assert(Decision != CM_Unknown &&
6029
64.1k
             "CM decision should be taken at this point");
6030
64.1k
      if (Decision == CM_Scalarize)
6031
14.9k
        Width = 1;
6032
64.1k
    }
6033
103k
    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6034
103k
    return getMemoryInstructionCost(I, VF);
6035
103k
  }
6036
103k
  case Instruction::ZExt:
6037
71.2k
  case Instruction::SExt:
6038
71.2k
  case Instruction::FPToUI:
6039
71.2k
  case Instruction::FPToSI:
6040
71.2k
  case Instruction::FPExt:
6041
71.2k
  case Instruction::PtrToInt:
6042
71.2k
  case Instruction::IntToPtr:
6043
71.2k
  case Instruction::SIToFP:
6044
71.2k
  case Instruction::UIToFP:
6045
71.2k
  case Instruction::Trunc:
6046
71.2k
  case Instruction::FPTrunc:
6047
71.2k
  case Instruction::BitCast: {
6048
71.2k
    // We optimize the truncation of induction variables having constant
6049
71.2k
    // integer steps. The cost of these truncations is the same as the scalar
6050
71.2k
    // operation.
6051
71.2k
    if (isOptimizableIVTruncate(I, VF)) {
6052
5.40k
      auto *Trunc = cast<TruncInst>(I);
6053
5.40k
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6054
5.40k
                                  Trunc->getSrcTy(), Trunc);
6055
5.40k
    }
6056
65.8k
6057
65.8k
    Type *SrcScalarTy = I->getOperand(0)->getType();
6058
65.8k
    Type *SrcVecTy =
6059
65.8k
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6060
65.8k
    if (canTruncateToMinimalBitwidth(I, VF)) {
6061
275
      // This cast is going to be shrunk. This may remove the cast or it might
6062
275
      // turn it into slightly different cast. For example, if MinBW == 16,
6063
275
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6064
275
      //
6065
275
      // Calculate the modified src and dest types.
6066
275
      Type *MinVecTy = VectorTy;
6067
275
      if (I->getOpcode() == Instruction::Trunc) {
6068
99
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6069
99
        VectorTy =
6070
99
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6071
176
      } else if (I->getOpcode() == Instruction::ZExt ||
6072
176
                 I->getOpcode() == Instruction::SExt) {
6073
176
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6074
176
        VectorTy =
6075
176
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6076
176
      }
6077
275
    }
6078
65.8k
6079
65.8k
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6080
65.8k
    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6081
65.8k
  }
6082
65.8k
  case Instruction::Call: {
6083
989
    bool NeedToScalarize;
6084
989
    CallInst *CI = cast<CallInst>(I);
6085
989
    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6086
989
    if (getVectorIntrinsicIDForCall(CI, TLI))
6087
825
      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6088
164
    return CallCost;
6089
164
  }
6090
164
  default:
6091
6
    // The cost of executing VF copies of the scalar instruction. This opcode
6092
6
    // is unknown. Assume that it is the same as 'mul'.
6093
6
    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6094
6
           getScalarizationOverhead(I, VF);
6095
632k
  } // end of switch.
6096
632k
}
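
A pattern that recurs throughout the switch above: an instruction that stays scalar after vectorization pays its scalar cost VF times, while a widened instruction pays the vector cost once. A trivial standalone sketch of that rule, with illustrative parameters rather than real TTI queries:

#include <cstdint>

uint64_t replicatedOrWidenedCost(bool ScalarAfterVectorization, uint64_t VF,
                                 uint64_t ScalarCost, uint64_t VectorCost) {
  // N copies of the scalar instruction vs. a single wide instruction.
  return ScalarAfterVectorization ? VF * ScalarCost : VectorCost;
}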
6097
6098
char LoopVectorize::ID = 0;
6099
6100
static const char lv_name[] = "Loop Vectorization";
6101
6102
48.9k
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6103
48.9k
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6104
48.9k
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6105
48.9k
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6106
48.9k
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6107
48.9k
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6108
48.9k
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6109
48.9k
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6110
48.9k
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6111
48.9k
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6112
48.9k
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6113
48.9k
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6114
48.9k
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6115
48.9k
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6116
48.9k
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6117
6118
namespace llvm {
6119
6120
0
Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6121
6122
Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6123
13.4k
                              bool VectorizeOnlyWhenForced) {
6124
13.4k
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6125
13.4k
}
6126
6127
} // end namespace llvm
6128
6129
627
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6130
627
  // Check if the pointer operand of a load or store instruction is
6131
627
  // consecutive.
6132
627
  if (auto *Ptr = getLoadStorePointerOperand(Inst))
6133
627
    return Legal->isConsecutivePtr(Ptr);
6134
0
  return false;
6135
0
}
6136
6137
19.9k
void LoopVectorizationCostModel::collectValuesToIgnore() {
6138
19.9k
  // Ignore ephemeral values.
6139
19.9k
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6140
19.9k
6141
19.9k
  // Ignore type-promoting instructions we identified during reduction
6142
19.9k
  // detection.
6143
19.9k
  for (auto &Reduction : *Legal->getReductionVars()) {
6144
3.44k
    RecurrenceDescriptor &RedDes = Reduction.second;
6145
3.44k
    SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6146
3.44k
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6147
3.44k
  }
6148
19.9k
  // Ignore type-casting instructions we identified during induction
6149
19.9k
  // detection.
6150
23.8k
  for (auto &Induction : *Legal->getInductionVars()) {
6151
23.8k
    InductionDescriptor &IndDes = Induction.second;
6152
23.8k
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6153
23.8k
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6154
23.8k
  }
6155
19.9k
}
6156
6157
// TODO: we could return a pair of values that specify the max VF and
6158
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6159
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6160
// doesn't have a cost model that can choose which plan to execute if
6161
// more than one is generated.
6162
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6163
5
                                 LoopVectorizationCostModel &CM) {
6164
5
  unsigned WidestType;
6165
5
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6166
5
  return WidestVectorRegBits / WidestType;
6167
5
}
6168
6169
VectorizationFactor
6170
LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
6171
7
                                                unsigned UserVF) {
6172
7
  unsigned VF = UserVF;
6173
7
  // Outer loop handling: They may require CFG and instruction level
6174
7
  // transformations before even evaluating whether vectorization is profitable.
6175
7
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6176
7
  // the vectorization pipeline.
6177
7
  if (!OrigLoop->empty()) {
6178
7
    // If the user doesn't provide a vectorization factor, determine a
6179
7
    // reasonable one.
6180
7
    if (!UserVF) {
6181
5
      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6182
5
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6183
5
6184
5
      // Make sure we have a VF > 1 for stress testing.
6185
5
      if (VPlanBuildStressTest && VF < 2) {
6186
0
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6187
0
                          << "overriding computed VF.\n");
6188
0
        VF = 4;
6189
0
      }
6190
5
    }
6191
7
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6192
7
    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6193
7
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6194
7
                      << " to build VPlans.\n");
6195
7
    buildVPlans(VF, VF);
6196
7
6197
7
    // For VPlan build stress testing, we bail out after VPlan construction.
6198
7
    if (VPlanBuildStressTest)
6199
0
      return VectorizationFactor::Disabled();
6200
7
6201
7
    return {VF, 0};
6202
7
  }
6203
0
6204
0
  LLVM_DEBUG(
6205
0
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6206
0
                "VPlan-native path.\n");
6207
0
  return VectorizationFactor::Disabled();
6208
0
}
6209
6210
Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize,
6211
19.9k
                                                             unsigned UserVF) {
6212
19.9k
  assert(OrigLoop->empty() && "Inner loop expected.");
6213
19.9k
  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
6214
19.9k
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6215
103
    return None;
6216
19.8k
6217
19.8k
  // Invalidate interleave groups if all blocks of loop will be predicated.
6218
19.8k
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6219
19.8k
      !useMaskedInterleavedAccesses(*TTI)) {
6220
21
    LLVM_DEBUG(
6221
21
        dbgs()
6222
21
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6223
21
           "which requires masked-interleaved support.\n");
6224
21
    CM.InterleaveInfo.reset();
6225
21
  }
6226
19.8k
6227
19.8k
  if (UserVF) {
6228
649
    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6229
649
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6230
649
    // Collect the instructions (and their associated costs) that will be more
6231
649
    // profitable to scalarize.
6232
649
    CM.selectUserVectorizationFactor(UserVF);
6233
649
    buildVPlansWithVPRecipes(UserVF, UserVF);
6234
649
    LLVM_DEBUG(printPlans(dbgs()));
6235
649
    return {{UserVF, 0}};
6236
649
  }
6237
19.2k
6238
19.2k
  unsigned MaxVF = MaybeMaxVF.getValue();
6239
19.2k
  assert(MaxVF != 0 && "MaxVF is zero.");
6240
19.2k
6241
70.1k
  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6242
50.9k
    // Collect Uniform and Scalar instructions after vectorization with VF.
6243
50.9k
    CM.collectUniformsAndScalars(VF);
6244
50.9k
6245
50.9k
    // Collect the instructions (and their associated costs) that will be more
6246
50.9k
    // profitable to scalarize.
6247
50.9k
    if (VF > 1)
6248
31.7k
      CM.collectInstsToScalarize(VF);
6249
50.9k
  }
6250
19.2k
6251
19.2k
  buildVPlansWithVPRecipes(1, MaxVF);
6252
19.2k
  LLVM_DEBUG(printPlans(dbgs()));
6253
19.2k
  if (MaxVF == 1)
6254
15
    return VectorizationFactor::Disabled();
6255
19.1k
6256
19.1k
  // Select the optimal vectorization factor.
6257
19.1k
  return CM.selectVectorizationFactor(MaxVF);
6258
19.1k
}
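
The planning loop above walks the candidate vectorization factors as powers of two from 1 up to MaxVF, running the per-VF analyses for each. A standalone sketch of that walk, where the callback stands in for the collectUniformsAndScalars/collectInstsToScalarize calls:

#include <functional>

void forEachCandidateVF(unsigned MaxVF,
                        const std::function<void(unsigned)> &AnalyzeVF) {
  // Candidate VFs are 1, 2, 4, ... up to and including MaxVF.
  for (unsigned VF = 1; VF <= MaxVF; VF *= 2)
    AnalyzeVF(VF);
}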
6259
6260
17.0k
void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6261
17.0k
  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6262
17.0k
                    << '\n');
6263
17.0k
  BestVF = VF;
6264
17.0k
  BestUF = UF;
6265
17.0k
6266
32.9k
  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6267
32.9k
    return !Plan->hasVF(VF);
6268
32.9k
  });
6269
17.0k
  assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6270
17.0k
}
6271
6272
void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6273
17.0k
                                           DominatorTree *DT) {
6274
17.0k
  // Perform the actual loop transformation.
6275
17.0k
6276
17.0k
  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6277
17.0k
  VPCallbackILV CallbackILV(ILV);
6278
17.0k
6279
17.0k
  VPTransformState State{BestVF, BestUF,      LI,
6280
17.0k
                         DT,     ILV.Builder, ILV.VectorLoopValueMap,
6281
17.0k
                         &ILV,   CallbackILV};
6282
17.0k
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6283
17.0k
  State.TripCount = ILV.getOrCreateTripCount(nullptr);
6284
17.0k
6285
17.0k
  //===------------------------------------------------===//
6286
17.0k
  //
6287
17.0k
  // Notice: any optimization or new instruction that goes
6288
17.0k
  // into the code below should also be implemented in
6289
17.0k
  // the cost-model.
6290
17.0k
  //
6291
17.0k
  //===------------------------------------------------===//
6292
17.0k
6293
17.0k
  // 2. Copy and widen instructions from the old loop into the new loop.
6294
17.0k
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6295
17.0k
  VPlans.front()->execute(&State);
6296
17.0k
6297
17.0k
  // 3. Fix the vectorized code: take care of header phi's, live-outs,
6298
17.0k
  //    predication, updating analyses.
6299
17.0k
  ILV.fixVectorizedLoop();
6300
17.0k
}
6301
6302
void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6303
19.8k
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6304
19.8k
  BasicBlock *Latch = OrigLoop->getLoopLatch();
6305
19.8k
6306
19.8k
  // We create new control-flow for the vectorized loop, so the original
6307
19.8k
  // condition will be dead after vectorization if it's only used by the
6308
19.8k
  // branch.
6309
19.8k
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6310
19.8k
  if (Cmp && Cmp->hasOneUse())
6311
19.8k
    DeadInstructions.insert(Cmp);
6312
19.8k
6313
19.8k
  // We create new "steps" for induction variable updates to which the original
6314
19.8k
  // induction variables map. An original update instruction will be dead if
6315
19.8k
  // all its users except the induction variable are dead.
6316
23.7k
  for (auto &Induction : *Legal->getInductionVars()) {
6317
23.7k
    PHINode *Ind = Induction.first;
6318
23.7k
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6319
49.7k
    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6320
49.7k
          return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6321
28.1k
                                 DeadInstructions.end();
6322
49.7k
        }))
6323
14.3k
      DeadInstructions.insert(IndUpdate);
6324
23.7k
6325
23.7k
    // We record as "Dead" also the type-casting instructions we had identified
6326
23.7k
    // during induction analysis. We don't need any handling for them in the
6327
23.7k
    // vectorized loop because we have proven that, under a proper runtime
6328
23.7k
    // test guarding the vectorized loop, the value of the phi, and the casted
6329
23.7k
    // value of the phi, are the same. The last instruction in this casting chain
6330
23.7k
    // will get its scalar/vector/widened def from the scalar/vector/widened def
6331
23.7k
    // of the respective phi node. Any other casts in the induction def-use chain
6332
23.7k
    // have no other uses outside the phi update chain, and will be ignored.
6333
23.7k
    InductionDescriptor &IndDes = Induction.second;
6334
23.7k
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6335
23.7k
    DeadInstructions.insert(Casts.begin(), Casts.end());
6336
23.7k
  }
6337
19.8k
}
6338
6339
0
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6340
6341
1.84k
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6342
6343
Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6344
3.64k
                                        Instruction::BinaryOps BinOp) {
6345
3.64k
  // When unrolling and the VF is 1, we only need to add a simple scalar.
6346
3.64k
  Type *Ty = Val->getType();
6347
3.64k
  assert(!Ty->isVectorTy() && "Val must be a scalar");
6348
3.64k
6349
3.64k
  if (Ty->isFloatingPointTy()) {
6350
12
    Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6351
12
6352
12
    // Floating point operations had to be 'fast' to enable the unrolling.
6353
12
    Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6354
12
    return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6355
12
  }
6356
3.63k
  Constant *C = ConstantInt::get(Ty, StartIdx);
6357
3.63k
  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6358
3.63k
}
6359
6360
12.6k
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6361
12.6k
  SmallVector<Metadata *, 4> MDs;
6362
12.6k
  // Reserve first location for self reference to the LoopID metadata node.
6363
12.6k
  MDs.push_back(nullptr);
6364
12.6k
  bool IsUnrollMetadata = false;
6365
12.6k
  MDNode *LoopID = L->getLoopID();
6366
12.6k
  if (LoopID) {
6367
273
    // First find existing loop unrolling disable metadata.
6368
802
    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; 
++i529
) {
6369
529
      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6370
529
      if (MD) {
6371
529
        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6372
529
        IsUnrollMetadata =
6373
529
            S && S->getString().startswith("llvm.loop.unroll.disable");
6374
529
      }
6375
529
      MDs.push_back(LoopID->getOperand(i));
6376
529
    }
6377
273
  }
6378
12.6k
6379
12.6k
  if (!IsUnrollMetadata) {
6380
12.6k
    // Add runtime unroll disable metadata.
6381
12.6k
    LLVMContext &Context = L->getHeader()->getContext();
6382
12.6k
    SmallVector<Metadata *, 1> DisableOperands;
6383
12.6k
    DisableOperands.push_back(
6384
12.6k
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6385
12.6k
    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6386
12.6k
    MDs.push_back(DisableNode);
6387
12.6k
    MDNode *NewLoopID = MDNode::get(Context, MDs);
6388
12.6k
    // Set operand 0 to refer to the loop id itself.
6389
12.6k
    NewLoopID->replaceOperandWith(0, NewLoopID);
6390
12.6k
    L->setLoopID(NewLoopID);
6391
12.6k
  }
6392
12.6k
}
6393
6394
bool LoopVectorizationPlanner::getDecisionAndClampRange(
6395
1.11M
    const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6396
1.11M
  assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6397
1.11M
  bool PredicateAtRangeStart = Predicate(Range.Start);
6398
1.11M
6399
1.43M
  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; 
TmpVF *= 2322k
)
6400
340k
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
6401
18.6k
      Range.End = TmpVF;
6402
18.6k
      break;
6403
18.6k
    }
6404
1.11M
6405
1.11M
  return PredicateAtRangeStart;
6406
1.11M
}
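
The range-clamping helper above evaluates a per-VF predicate at the start of a power-of-two VF range and shrinks the range's end to the first VF where the answer flips, so a single decision covers the whole remaining sub-range. A standalone sketch under those assumptions (VFRangeSketch is an illustrative mirror of VFRange, not the pass's type):

#include <functional>

struct VFRangeSketch { unsigned Start; unsigned End; }; // half-open [Start, End)

bool decideAndClampRange(const std::function<bool(unsigned)> &Predicate,
                         VFRangeSketch &Range) {
  bool AtStart = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != AtStart) {
      Range.End = VF; // decisions from VF onward differ; end the range here
      break;
    }
  return AtStart;
}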
6407
6408
/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6409
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6410
/// of VF's starting at a given VF and extending it as much as possible. Each
6411
/// vectorization decision can potentially shorten this sub-range during
6412
/// buildVPlan().
6413
7
void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6414
14
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6415
7
    VFRange SubRange = {VF, MaxVF + 1};
6416
7
    VPlans.push_back(buildVPlan(SubRange));
6417
7
    VF = SubRange.End;
6418
7
  }
6419
7
}
6420
6421
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6422
5.79k
                                         VPlanPtr &Plan) {
6423
5.79k
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6424
5.79k
6425
5.79k
  // Look for cached value.
6426
5.79k
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6427
5.79k
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6428
5.79k
  if (ECEntryIt != EdgeMaskCache.end())
6429
1.17k
    return ECEntryIt->second;
6430
4.62k
6431
4.62k
  VPValue *SrcMask = createBlockInMask(Src, Plan);
6432
4.62k
6433
4.62k
  // The terminator has to be a branch inst!
6434
4.62k
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6435
4.62k
  assert(BI && "Unexpected terminator found");
6436
4.62k
6437
4.62k
  if (!BI->isConditional())
6438
1.31k
    return EdgeMaskCache[Edge] = SrcMask;
6439
3.30k
6440
3.30k
  VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6441
3.30k
  assert(EdgeMask && "No Edge Mask found for condition");
6442
3.30k
6443
3.30k
  if (BI->getSuccessor(0) != Dst)
6444
1.51k
    EdgeMask = Builder.createNot(EdgeMask);
6445
3.30k
6446
3.30k
  if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6447
1.54k
    EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6448
3.30k
6449
3.30k
  return EdgeMaskCache[Edge] = EdgeMask;
6450
3.30k
}
6451
6452
7.52k
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6453
7.52k
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6454
7.52k
6455
7.52k
  // Look for cached value.
6456
7.52k
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6457
7.52k
  if (BCEntryIt != BlockMaskCache.end())
6458
3.62k
    return BCEntryIt->second;
6459
3.89k
6460
3.89k
  // All-one mask is modelled as no-mask following the convention for masked
6461
3.89k
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
6462
3.89k
  VPValue *BlockMask = nullptr;
6463
3.89k
6464
3.89k
  if (OrigLoop->getHeader() == BB) {
6465
1.11k
    if (!CM.blockNeedsPredication(BB))
6466
1.07k
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6467
35
6468
35
    // Introduce the early-exit compare IV <= BTC to form header block mask.
6469
35
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
6470
35
    VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6471
35
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6472
35
    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6473
35
    return BlockMaskCache[BB] = BlockMask;
6474
35
  }
6475
2.78k
6476
2.78k
  // This is the block mask. We OR all incoming edges.
6477
3.17k
  for (auto *Predecessor : predecessors(BB)) {
6478
3.17k
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6479
3.17k
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6480
0
      return BlockMaskCache[BB] = EdgeMask;
6481
3.17k
6482
3.17k
    if (!BlockMask) { // BlockMask has its initialized nullptr value.
6483
2.78k
      BlockMask = EdgeMask;
6484
2.78k
      continue;
6485
2.78k
    }
6486
392
6487
392
    BlockMask = Builder.createOr(BlockMask, EdgeMask);
6488
392
  }
6489
2.78k
6490
2.78k
  return BlockMaskCache[BB] = BlockMask;
6491
2.78k
}
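
The masking convention used above: a null mask means "all-ones", the loop header's mask (only when the loop is fold-tail predicated) is the IV <= BackedgeTakenCount compare, and any other block's mask is the OR of its incoming edge masks. A standalone sketch of that convention, modelling mask values as strings purely for illustration:

#include <optional>
#include <string>
#include <vector>

using MaskExpr = std::optional<std::string>; // std::nullopt == all-ones mask

MaskExpr blockInMaskSketch(bool IsHeader, bool NeedsPredication,
                           const std::vector<MaskExpr> &IncomingEdgeMasks) {
  if (IsHeader)
    return NeedsPredication ? MaskExpr("icmp ule IV, BTC") : std::nullopt;
  MaskExpr BlockMask; // starts as all-ones
  for (const MaskExpr &Edge : IncomingEdgeMasks) {
    if (!Edge)
      return std::nullopt; // one all-ones edge makes the whole block all-ones
    BlockMask = BlockMask ? MaskExpr("or(" + *BlockMask + ", " + *Edge + ")")
                          : Edge;
  }
  return BlockMask;
}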
6492
6493
VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6494
                                                           VFRange &Range,
6495
380k
                                                           VPlanPtr &Plan) {
6496
380k
  const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6497
380k
  if (!IG)
6498
374k
    return nullptr;
6499
5.28k
6500
5.28k
  // Now check if IG is relevant for VF's in the given range.
6501
5.28k
  auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6502
6.61k
    return [=](unsigned VF) -> bool {
6503
6.61k
      return (VF >= 2 && // Query is illegal for VF == 1
6504
6.61k
              CM.getWideningDecision(I, VF) ==
6505
2.92k
                  LoopVectorizationCostModel::CM_Interleave);
6506
6.61k
    };
6507
5.28k
  };
6508
5.28k
  if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6509
3.80k
    return nullptr;
6510
1.48k
6511
1.48k
  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
6512
1.48k
  // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
6513
1.48k
  // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
6514
1.48k
  assert(I == IG->getInsertPos() &&
6515
1.48k
         "Generating a recipe for an adjunct member of an interleave group");
6516
1.48k
6517
1.48k
  VPValue *Mask = nullptr;
6518
1.48k
  if (Legal->isMaskRequired(I))
6519
11
    Mask = createBlockInMask(I->getParent(), Plan);
6520
1.48k
6521
1.48k
  return new VPInterleaveRecipe(IG, Mask);
6522
1.48k
}
6523
6524
VPWidenMemoryInstructionRecipe *
6525
VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6526
378k
                                  VPlanPtr &Plan) {
6527
378k
  if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6528
302k
    return nullptr;
6529
75.8k
6530
105k
  auto willWiden = [&](unsigned VF) -> bool {
6531
105k
    if (VF == 1)
6532
39.1k
      return false;
6533
66.4k
    if (CM.isScalarAfterVectorization(I, VF) ||
6534
66.4k
        CM.isProfitableToScalarize(I, VF))
6535
258
      return false;
6536
66.1k
    LoopVectorizationCostModel::InstWidening Decision =
6537
66.1k
        CM.getWideningDecision(I, VF);
6538
66.1k
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6539
66.1k
           "CM decision should be taken at this point.");
6540
66.1k
    assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
6541
66.1k
           "Interleave memory opportunity should be caught earlier.");
6542
66.1k
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
6543
66.1k
  };
6544
75.8k
6545
75.8k
  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6546
48.1k
    return nullptr;
6547
27.6k
6548
27.6k
  VPValue *Mask = nullptr;
6549
27.6k
  if (Legal->isMaskRequired(I))
6550
106
    Mask = createBlockInMask(I->getParent(), Plan);
6551
27.6k
6552
27.6k
  return new VPWidenMemoryInstructionRecipe(*I, Mask);
6553
27.6k
}
6554
6555
VPWidenIntOrFpInductionRecipe *
6556
351k
VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6557
351k
  if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6558
53.8k
    // Check if this is an integer or fp induction. If so, build the recipe that
6559
53.8k
    // produces its scalar and vector values.
6560
53.8k
    InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6561
53.8k
    if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6562
53.8k
        II.getKind() == InductionDescriptor::IK_FpInduction)
6563
37.7k
      return new VPWidenIntOrFpInductionRecipe(Phi);
6564
16.0k
6565
16.0k
    return nullptr;
6566
16.0k
  }
6567
297k
6568
297k
  // Optimize the special case where the source is a constant integer
6569
297k
  // induction variable. Notice that we can only optimize the 'trunc' case
6570
297k
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6571
297k
  // (c) other casts depend on pointer size.
6572
297k
6573
297k
  // Determine whether \p K is a truncation based on an induction variable that
6574
297k
  // can be optimized.
6575
297k
  auto isOptimizableIVTruncate =
6576
297k
      [&](Instruction *K) -> std::function<bool(unsigned)> {
6577
21.3k
    return
6578
28.4k
        [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6579
21.3k
  };
6580
297k
6581
297k
  if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6582
21.3k
                               isOptimizableIVTruncate(I), Range))
6583
5.01k
    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6584
5.01k
                                             cast<TruncInst>(I));
6585
292k
  return nullptr;
6586
292k
}
6587
6588
308k
VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6589
308k
  PHINode *Phi = dyn_cast<PHINode>(I);
6590
308k
  if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6591
307k
    return nullptr;
6592
1.08k
6593
1.08k
  // We know that all PHIs in non-header blocks are converted into selects, so
6594
1.08k
  // we don't have to worry about the insertion order and we can just use the
6595
1.08k
  // builder. At this point we generate the predication tree. There may be
6596
1.08k
  // duplications since this is a simple recursive scan, but future
6597
1.08k
  // optimizations will clean it up.
6598
1.08k
6599
1.08k
  SmallVector<VPValue *, 2> Masks;
6600
1.08k
  unsigned NumIncoming = Phi->getNumIncomingValues();
6601
3.71k
  for (unsigned In = 0; In < NumIncoming; In++) {
6602
2.62k
    VPValue *EdgeMask =
6603
2.62k
      createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6604
2.62k
    assert((EdgeMask || NumIncoming == 1) &&
6605
2.62k
           "Multiple predecessors with one having a full mask");
6606
2.62k
    if (EdgeMask)
6607
2.62k
      Masks.push_back(EdgeMask);
6608
2.62k
  }
6609
1.08k
  return new VPBlendRecipe(Phi, Masks);
6610
1.08k
}
6611
6612
bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6613
292k
                                 VFRange &Range) {
6614
292k
6615
292k
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6616
394k
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6617
292k
6618
292k
  if (IsPredicated)
6619
2.78k
    return false;
6620
289k
6621
289k
  auto IsVectorizableOpcode = [](unsigned Opcode) {
6622
289k
    switch (Opcode) {
6623
289k
    case Instruction::Add:
6624
289k
    case Instruction::And:
6625
289k
    case Instruction::AShr:
6626
289k
    case Instruction::BitCast:
6627
289k
    case Instruction::Br:
6628
289k
    case Instruction::Call:
6629
289k
    case Instruction::FAdd:
6630
289k
    case Instruction::FCmp:
6631
289k
    case Instruction::FDiv:
6632
289k
    case Instruction::FMul:
6633
289k
    case Instruction::FNeg:
6634
289k
    case Instruction::FPExt:
6635
289k
    case Instruction::FPToSI:
6636
289k
    case Instruction::FPToUI:
6637
289k
    case Instruction::FPTrunc:
6638
289k
    case Instruction::FRem:
6639
289k
    case Instruction::FSub:
6640
289k
    case Instruction::GetElementPtr:
6641
289k
    case Instruction::ICmp:
6642
289k
    case Instruction::IntToPtr:
6643
289k
    case Instruction::Load:
6644
289k
    case Instruction::LShr:
6645
289k
    case Instruction::Mul:
6646
289k
    case Instruction::Or:
6647
289k
    case Instruction::PHI:
6648
289k
    case Instruction::PtrToInt:
6649
289k
    case Instruction::SDiv:
6650
289k
    case Instruction::Select:
6651
289k
    case Instruction::SExt:
6652
289k
    case Instruction::Shl:
6653
289k
    case Instruction::SIToFP:
6654
289k
    case Instruction::SRem:
6655
289k
    case Instruction::Store:
6656
289k
    case Instruction::Sub:
6657
289k
    case Instruction::Trunc:
6658
289k
    case Instruction::UDiv:
6659
289k
    case Instruction::UIToFP:
6660
289k
    case Instruction::URem:
6661
289k
    case Instruction::Xor:
6662
289k
    case Instruction::ZExt:
6663
289k
      return true;
6664
6
    }
6665
6
    return false;
6666
6
  };
6667
289k
6668
289k
  if (!IsVectorizableOpcode(I->getOpcode()))
6669
6
    return false;
6670
289k
6671
289k
  if (CallInst *CI = dyn_cast<CallInst>(I)) {
6672
850
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6673
850
    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6674
686
               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6675
19
      return false;
6676
289k
  }
6677
289k
6678
387k
  auto willWiden = [&](unsigned VF) -> bool {
6679
387k
    if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6680
387k
                             CM.isProfitableToScalarize(I, VF)))
6681
249k
      return false;
6682
137k
    if (CallInst *CI = dyn_cast<CallInst>(I)) {
6683
653
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6684
653
      // The following case may be scalarized depending on the VF.
6685
653
      // The flag shows whether we use Intrinsic or a usual Call for vectorized
6686
653
      // version of the instruction.
6687
653
      // Is it beneficial to perform intrinsic call compared to lib call?
6688
653
      bool NeedToScalarize;
6689
653
      unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6690
653
      bool UseVectorIntrinsic =
6691
653
          ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6692
653
      return UseVectorIntrinsic || !NeedToScalarize;
6693
653
    }
6694
137k
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6695
12.8k
      assert(CM.getWideningDecision(I, VF) ==
6696
12.8k
                 LoopVectorizationCostModel::CM_Scalarize &&
6697
12.8k
             "Memory widening decisions should have been taken care by now");
6698
12.8k
      return false;
6699
12.8k
    }
6700
124k
    return true;
6701
124k
  };
6702
289k
6703
289k
  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6704
210k
    return false;
6705
79.0k
6706
79.0k
  // Success: widen this instruction. We optimize the common case where
6707
79.0k
  // consecutive instructions can be represented by a single recipe.
6708
79.0k
  if (!VPBB->empty()) {
6709
77.6k
    VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
6710
77.6k
    if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
6711
53.3k
      return true;
6712
25.7k
  }
6713
25.7k
6714
25.7k
  VPBB->appendRecipe(new VPWidenRecipe(I));
6715
25.7k
  return true;
6716
25.7k
}
6717
6718
VPBasicBlock *VPRecipeBuilder::handleReplication(
6719
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6720
    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6721
213k
    VPlanPtr &Plan) {
6722
213k
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6723
265k
      [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6724
213k
      Range);
6725
213k
6726
213k
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6727
264k
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6728
213k
6729
213k
  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6730
213k
6731
213k
  // Find if I uses a predicated instruction. If so, it will use its scalar
6732
213k
  // value. Avoid hoisting the insert-element which packs the scalar value into
6733
213k
  // a vector value, as that happens iff all users use the vector value.
6734
213k
  for (auto &Op : I->operands())
6735
436k
    if (auto *PredInst = dyn_cast<Instruction>(Op))
6736
290k
      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6737
1.08k
        PredInst2Recipe[PredInst]->setAlsoPack(false);
6738
213k
6739
213k
  // Finalize the recipe for Instr, first if it is not predicated.
6740
213k
  if (!IsPredicated) {
6741
210k
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6742
210k
    VPBB->appendRecipe(Recipe);
6743
210k
    return VPBB;
6744
210k
  }
6745
2.78k
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6746
2.78k
  assert(VPBB->getSuccessors().empty() &&
6747
2.78k
         "VPBB has successors when handling predicated replication.");
6748
2.78k
  // Record predicated instructions for above packing optimizations.
6749
2.78k
  PredInst2Recipe[I] = Recipe;
6750
2.78k
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6751
2.78k
  VPBlockUtils::insertBlockAfter(Region, VPBB);
6752
2.78k
  auto *RegSucc = new VPBasicBlock();
6753
2.78k
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
6754
2.78k
  return RegSucc;
6755
2.78k
}
6756
6757
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6758
                                                      VPRecipeBase *PredRecipe,
6759
2.78k
                                                      VPlanPtr &Plan) {
6760
2.78k
  // Instructions marked for predication are replicated and placed under an
6761
2.78k
  // if-then construct to prevent side-effects.
6762
2.78k
6763
2.78k
  // Generate recipes to compute the block mask for this region.
6764
2.78k
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6765
2.78k
6766
2.78k
  // Build the triangular if-then region.
6767
2.78k
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6768
2.78k
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
6769
2.78k
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6770
2.78k
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6771
2.78k
  auto *PHIRecipe =
6772
2.78k
      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6773
2.78k
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6774
2.78k
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6775
2.78k
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6776
2.78k
6777
2.78k
  // Note: first set Entry as region entry and then connect successors starting
6778
2.78k
  // from it in order, to propagate the "parent" of each VPBasicBlock.
6779
2.78k
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6780
2.78k
  VPBlockUtils::connectBlocks(Pred, Exit);
6781
2.78k
6782
2.78k
  return Region;
6783
2.78k
}
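
createReplicateRegion wraps each predicated replicate recipe in a "pred.<opcode>" triangle: an entry block holding the branch-on-mask, an "if" block holding the predicated recipe, and a "continue" block holding the optional PHI. As an illustrative input (not taken from this file), a guarded integer division is a typical instruction that ends up scalarized with predication and therefore inside such a region:

// Hypothetical source loop: the sdiv may trap, so under a mask it is
// scalarized with predication and wrapped in a region named
// pred.sdiv.entry -> pred.sdiv.if -> pred.sdiv.continue, following the
// RegionName = "pred." + opcode scheme above.
void scaleByDivisor(int *A, const int *B, int N) {
  for (int i = 0; i < N; ++i)
    if (B[i] != 0)          // becomes the block-in mask of the region
      A[i] = A[i] / B[i];   // predicated, scalarized division
}
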
6784
6785
bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6786
380k
                                        VPlanPtr &Plan, VPBasicBlock *VPBB) {
6787
380k
  VPRecipeBase *Recipe = nullptr;
6788
380k
  // Check if Instr should belong to an interleave memory recipe, or already
6789
380k
  // does. In the latter case Instr is irrelevant.
6790
380k
  if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6791
1.48k
    VPBB->appendRecipe(Recipe);
6792
1.48k
    return true;
6793
1.48k
  }
6794
378k
6795
378k
  // Check if Instr is a memory operation that should be widened.
6796
378k
  if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6797
27.6k
    VPBB->appendRecipe(Recipe);
6798
27.6k
    return true;
6799
27.6k
  }
6800
351k
6801
351k
  // Check if Instr should form some PHI recipe.
6802
351k
  if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6803
42.8k
    VPBB->appendRecipe(Recipe);
6804
42.8k
    return true;
6805
42.8k
  }
6806
308k
  if ((Recipe = tryToBlend(Instr, Plan))) {
6807
1.08k
    VPBB->appendRecipe(Recipe);
6808
1.08k
    return true;
6809
1.08k
  }
6810
307k
  if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6811
14.9k
    VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6812
14.9k
    return true;
6813
14.9k
  }
6814
292k
6815
292k
  // Check if Instr is to be widened by a general VPWidenRecipe, after
6816
292k
  // having first checked for specific widening recipes that deal with
6817
292k
  // Interleave Groups, Inductions and Phi nodes.
6818
292k
  if (tryToWiden(Instr, VPBB, Range))
6819
79.0k
    return true;
6820
213k
6821
213k
  return false;
6822
213k
}
6823
6824
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6825
19.8k
                                                        unsigned MaxVF) {
6826
19.8k
  assert(OrigLoop->empty() && "Inner loop expected.");
6827
19.8k
6828
19.8k
  // Collect conditions feeding internal conditional branches; they need to be
6829
19.8k
  // represented in VPlan for it to model masking.
6830
19.8k
  SmallPtrSet<Value *, 1> NeedDef;
6831
19.8k
6832
19.8k
  auto *Latch = OrigLoop->getLoopLatch();
6833
21.9k
  for (BasicBlock *BB : OrigLoop->blocks()) {
6834
21.9k
    if (BB == Latch)
6835
19.8k
      continue;
6836
2.07k
    BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6837
2.07k
    if (Branch && Branch->isConditional())
6838
1.01k
      NeedDef.insert(Branch->getCondition());
6839
2.07k
  }
6840
19.8k
6841
19.8k
  // If the tail is to be folded by masking, the primary induction variable
6842
19.8k
  // needs to be represented in VPlan for it to model early-exit masking.
6843
19.8k
  if (CM.foldTailByMasking())
6844
26
    NeedDef.insert(Legal->getPrimaryInduction());
6845
19.8k
6846
19.8k
  // Collect instructions from the original loop that will become trivially dead
6847
19.8k
  // in the vectorized loop. We don't need to vectorize these instructions. For
6848
19.8k
  // example, original induction update instructions can become dead because we
6849
19.8k
  // separately emit induction "steps" when generating code for the new loop.
6850
19.8k
  // Similarly, we create a new latch condition when setting up the structure
6851
19.8k
  // of the new loop, so the old one can become dead.
6852
19.8k
  SmallPtrSet<Instruction *, 4> DeadInstructions;
6853
19.8k
  collectTriviallyDeadInstructions(DeadInstructions);
6854
19.8k
6855
58.3k
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6856
38.5k
    VFRange SubRange = {VF, MaxVF + 1};
6857
38.5k
    VPlans.push_back(
6858
38.5k
        buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
6859
38.5k
    VF = SubRange.End;
6860
38.5k
  }
6861
19.8k
}
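
buildVPlansWithVPRecipes partitions [MinVF, MaxVF] into consecutive sub-ranges, building one VPlan per sub-range; each call may clamp SubRange.End, and the loop resumes from that clamped end. A small self-contained sketch of the partitioning (the clamping rule here is an assumption for illustration only):

// Sketch of the sub-range loop above. buildForSubRange is a stand-in that
// clamps End the way recipe construction would; here it pretends decisions
// change after two VFs.
#include <cstdio>

struct SketchVFRange { unsigned Start, End; };

static SketchVFRange buildForSubRange(SketchVFRange R) {
  unsigned FirstDisagreeingVF = R.Start * 4; // assumed for the example
  if (FirstDisagreeingVF < R.End)
    R.End = FirstDisagreeingVF;
  return R;
}

int main() {
  unsigned MinVF = 2, MaxVF = 16;
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    SketchVFRange SubRange = {VF, MaxVF + 1};
    SubRange = buildForSubRange(SubRange); // one VPlan per sub-range
    std::printf("VPlan covers VF in [%u, %u)\n", SubRange.Start, SubRange.End);
    VF = SubRange.End;                     // continue from the clamped end
  }
  return 0; // prints [2, 8) and [8, 17) under the assumption above
}
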
6862
6863
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
6864
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
6865
38.5k
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6866
38.5k
  // Hold a mapping from predicated instructions to their recipes, in order to
6867
38.5k
  // fix their AlsoPack behavior if a user is determined to replicate and use a
6868
38.5k
  // scalar instead of vector value.
6869
38.5k
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
6870
38.5k
6871
38.5k
  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
6872
38.5k
  DenseMap<Instruction *, Instruction *> SinkAfterInverse;
6873
38.5k
6874
38.5k
  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
6875
38.5k
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
6876
38.5k
  auto Plan = llvm::make_unique<VPlan>(VPBB);
6877
38.5k
6878
38.5k
  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
6879
38.5k
  // Represent values that will have defs inside VPlan.
6880
38.5k
  for (Value *V : NeedDef)
6881
1.96k
    Plan->addVPValue(V);
6882
38.5k
6883
38.5k
  // Scan the body of the loop in a topological order to visit each basic block
6884
38.5k
  // after having visited its predecessor basic blocks.
6885
38.5k
  LoopBlocksDFS DFS(OrigLoop);
6886
38.5k
  DFS.perform(LI);
6887
38.5k
6888
42.4k
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6889
42.4k
    // Relevant instructions from basic block BB will be grouped into VPRecipe
6890
42.4k
    // ingredients and fill a new VPBasicBlock.
6891
42.4k
    unsigned VPBBsForBB = 0;
6892
42.4k
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
6893
42.4k
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
6894
42.4k
    VPBB = FirstVPBBForBB;
6895
42.4k
    Builder.setInsertPoint(VPBB);
6896
42.4k
6897
42.4k
    std::vector<Instruction *> Ingredients;
6898
42.4k
6899
42.4k
    // Organize the ingredients to vectorize from current basic block in the
6900
42.4k
    // right order.
6901
491k
    for (Instruction &I : BB->instructionsWithoutDebug()) {
6902
491k
      Instruction *Instr = &I;
6903
491k
6904
491k
      // First filter out irrelevant instructions, to ensure no recipes are
6905
491k
      // built for them.
6906
491k
      if (isa<BranchInst>(Instr) ||
6907
491k
          DeadInstructions.find(Instr) != DeadInstructions.end())
6908
108k
        continue;
6909
382k
6910
382k
      // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
6911
382k
      // member of the IG, do not construct any Recipe for it.
6912
382k
      const InterleaveGroup<Instruction> *IG =
6913
382k
          CM.getInterleavedAccessGroup(Instr);
6914
382k
      if (IG && Instr != IG->getInsertPos() &&
6915
382k
          Range.Start >= 2 && // Query is illegal for VF == 1
6916
382k
          CM.getWideningDecision(Instr, Range.Start) ==
6917
2.32k
              LoopVectorizationCostModel::CM_Interleave) {
6918
2.26k
        auto SinkCandidate = SinkAfterInverse.find(Instr);
6919
2.26k
        if (SinkCandidate != SinkAfterInverse.end())
6920
1
          Ingredients.push_back(SinkCandidate->second);
6921
2.26k
        continue;
6922
2.26k
      }
6923
380k
6924
380k
      // Move instructions to handle first-order recurrences, step 1: avoid
6925
380k
      // handling this instruction until after we've handled the instruction it
6926
380k
      // should follow.
6927
380k
      auto SAIt = SinkAfter.find(Instr);
6928
380k
      if (SAIt != SinkAfter.end()) {
6929
31
        LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
6930
31
                          << *SAIt->second
6931
31
                          << " to vectorize a 1st order recurrence.\n");
6932
31
        SinkAfterInverse[SAIt->second] = Instr;
6933
31
        continue;
6934
31
      }
6935
380k
6936
380k
      Ingredients.push_back(Instr);
6937
380k
6938
380k
      // Move instructions to handle first-order recurrences, step 2: push the
6939
380k
      // instruction to be sunk at its insertion point.
6940
380k
      auto SAInvIt = SinkAfterInverse.find(Instr);
6941
380k
      if (SAInvIt != SinkAfterInverse.end())
6942
30
        Ingredients.push_back(SAInvIt->second);
6943
380k
    }
6944
42.4k
6945
42.4k
    // Introduce each ingredient into VPlan.
6946
380k
    for (Instruction *Instr : Ingredients) {
6947
380k
      if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
6948
167k
        continue;
6949
213k
6950
213k
      // Otherwise, if all widening options failed, Instruction is to be
6951
213k
      // replicated. This may create a successor for VPBB.
6952
213k
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
6953
213k
          Instr, Range, VPBB, PredInst2Recipe, Plan);
6954
213k
      if (NextVPBB != VPBB) {
6955
2.78k
        VPBB = NextVPBB;
6956
2.78k
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
6957
2.78k
                                    : "");
6958
2.78k
      }
6959
213k
    }
6960
42.4k
  }
6961
38.5k
6962
38.5k
  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
6963
38.5k
  // may also be empty, such as the last VPBB, reflecting original
6964
38.5k
  // basic-blocks with no recipes.
6965
38.5k
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
6966
38.5k
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
6967
38.5k
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
6968
38.5k
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
6969
38.5k
  delete PreEntry;
6970
38.5k
6971
38.5k
  std::string PlanName;
6972
38.5k
  raw_string_ostream RSO(PlanName);
6973
38.5k
  unsigned VF = Range.Start;
6974
38.5k
  Plan->addVF(VF);
6975
38.5k
  RSO << "Initial VPlan for VF={" << VF;
6976
51.6k
  for (VF *= 2; VF < Range.End; VF *= 2) {
6977
13.0k
    Plan->addVF(VF);
6978
13.0k
    RSO << "," << VF;
6979
13.0k
  }
6980
38.5k
  RSO << "},UF>=1";
6981
38.5k
  RSO.flush();
6982
38.5k
  Plan->setName(PlanName);
6983
38.5k
6984
38.5k
  return Plan;
6985
38.5k
}
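
The raw_string_ostream loop at the end simply lists every VF the plan covers in its name. A worked example with plain std::string standing in for raw_string_ostream:

// For Range = {4, 17} the plan is named "Initial VPlan for VF={4,8,16},UF>=1".
#include <string>

static std::string planName(unsigned Start, unsigned End) {
  std::string Name = "Initial VPlan for VF={" + std::to_string(Start);
  for (unsigned VF = Start * 2; VF < End; VF *= 2)
    Name += "," + std::to_string(VF);
  return Name + "},UF>=1";
}
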
6986
6987
7
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
6988
7
  // Outer loop handling: They may require CFG and instruction level
6989
7
  // transformations before even evaluating whether vectorization is profitable.
6990
7
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6991
7
  // the vectorization pipeline.
6992
7
  assert(!OrigLoop->empty());
6993
7
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6994
7
6995
7
  // Create new empty VPlan
6996
7
  auto Plan = llvm::make_unique<VPlan>();
6997
7
6998
7
  // Build hierarchical CFG
6999
7
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7000
7
  HCFGBuilder.buildHierarchicalCFG();
7001
7
7002
14
  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7003
7
    Plan->addVF(VF);
7004
7
7005
7
  if (EnableVPlanPredication) {
7006
0
    VPlanPredicator VPP(*Plan);
7007
0
    VPP.predicate();
7008
0
7009
0
    // Avoid running transformation to recipes until masked code generation in
7010
0
    // VPlan-native path is in place.
7011
0
    return Plan;
7012
0
  }
7013
7
7014
7
  SmallPtrSet<Instruction *, 1> DeadInstructions;
7015
7
  VPlanHCFGTransforms::VPInstructionsToVPRecipes(
7016
7
      Plan, Legal->getInductionVars(), DeadInstructions);
7017
7
7018
7
  return Plan;
7019
7
}
7020
7021
Value* LoopVectorizationPlanner::VPCallbackILV::
7022
1.34k
getOrCreateVectorValues(Value *V, unsigned Part) {
7023
1.34k
      return ILV.getOrCreateVectorValue(V, Part);
7024
1.34k
}
7025
7026
0
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7027
0
  O << " +\n"
7028
0
    << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7029
0
  IG->getInsertPos()->printAsOperand(O, false);
7030
0
  if (User) {
7031
0
    O << ", ";
7032
0
    User->getOperand(0)->printAsOperand(O);
7033
0
  }
7034
0
  O << "\\l\"";
7035
0
  for (unsigned i = 0; i < IG->getFactor(); ++i)
7036
0
    if (Instruction *I = IG->getMember(i))
7037
0
      O << " +\n"
7038
0
        << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7039
0
}
7040
7041
14.2k
void VPWidenRecipe::execute(VPTransformState &State) {
7042
14.2k
  for (auto &Instr : make_range(Begin, End))
7043
50.4k
    State.ILV->widenInstruction(Instr);
7044
14.2k
}
7045
7046
19.2k
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7047
19.2k
  assert(!State.Instance && "Int or FP induction being replicated.");
7048
19.2k
  State.ILV->widenIntOrFpInduction(IV, Trunc);
7049
19.2k
}
7050
7051
5.05k
void VPWidenPHIRecipe::execute(VPTransformState &State) {
7052
5.05k
  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7053
5.05k
}
7054
7055
168
void VPBlendRecipe::execute(VPTransformState &State) {
7056
168
  State.ILV->setDebugLocFromInst(State.Builder, Phi);
7057
168
  // We know that all PHIs in non-header blocks are converted into
7058
168
  // selects, so we don't have to worry about the insertion order and we
7059
168
  // can just use the builder.
7060
168
  // At this point we generate the predication tree. There may be
7061
168
  // duplications since this is a simple recursive scan, but future
7062
168
  // optimizations will clean it up.
7063
168
7064
168
  unsigned NumIncoming = Phi->getNumIncomingValues();
7065
168
7066
168
  assert((User || NumIncoming == 1) &&
7067
168
         "Multiple predecessors with predecessors having a full mask");
7068
168
  // Generate a sequence of selects of the form:
7069
168
  // SELECT(Mask3, In3,
7070
168
  //      SELECT(Mask2, In2,
7071
168
  //                   ( ...)))
7072
168
  InnerLoopVectorizer::VectorParts Entry(State.UF);
7073
536
  for (unsigned In = 0; In < NumIncoming; ++In) {
7074
980
    for (unsigned Part = 0; Part < State.UF; ++Part) {
7075
612
      // We might have single edge PHIs (blocks) - use an identity
7076
612
      // 'select' for the first PHI operand.
7077
612
      Value *In0 =
7078
612
          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7079
612
      if (In == 0)
7080
277
        Entry[Part] = In0; // Initialize with the first incoming value.
7081
335
      else {
7082
335
        // Select between the current value and the previous incoming edge
7083
335
        // based on the incoming mask.
7084
335
        Value *Cond = State.get(User->getOperand(In), Part);
7085
335
        Entry[Part] =
7086
335
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7087
335
      }
7088
612
    }
7089
368
  }
7090
445
  for (unsigned Part = 0; Part < State.UF; ++Part)
7091
277
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7092
168
}
7093
7094
669
void VPInterleaveRecipe::execute(VPTransformState &State) {
7095
669
  assert(!State.Instance && "Interleave group being replicated.");
7096
669
  if (!User)
7097
658
    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7098
11
7099
11
  // Last (and currently only) operand is a mask.
7100
11
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7101
11
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7102
22
  for (unsigned Part = 0; Part < State.UF; ++Part)
7103
11
    MaskValues[Part] = State.get(Mask, Part);
7104
11
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7105
11
}
7106
7107
37.2k
void VPReplicateRecipe::execute(VPTransformState &State) {
7108
37.2k
  if (State.Instance) { // Generate a single instance.
7109
948
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7110
948
    // Insert scalar instance packing it into a vector.
7111
948
    if (AlsoPack && State.VF > 1) {
7112
182
      // If we're constructing lane 0, initialize to start from undef.
7113
182
      if (State.Instance->Lane == 0) {
7114
46
        Value *Undef =
7115
46
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7116
46
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7117
46
      }
7118
182
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7119
182
    }
7120
948
    return;
7121
948
  }
7122
36.2k
7123
36.2k
  // Generate scalar instances for all VF lanes of all UF parts, unless the
7124
36.2k
  // instruction is uniform, in which case generate only the first lane for each
7125
36.2k
  // of the UF parts.
7126
36.2k
  unsigned EndLane = IsUniform ? 1 : State.VF;
7127
104k
  for (unsigned Part = 0; Part < State.UF; ++Part)
7128
139k
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7129
70.7k
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7130
36.2k
}
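
In the non-single-instance case VPReplicateRecipe::execute emits one scalar clone per (Part, Lane) pair, and uniform ingredients collapse EndLane to 1 so only lane 0 of each part is produced. A pure-arithmetic sketch of the resulting instance count:

// How many scalarizeInstruction calls the loop above performs: UF * VF in
// general, UF when the ingredient is uniform. No VPlan types involved.
static unsigned numScalarInstances(unsigned UF, unsigned VF, bool IsUniform) {
  unsigned EndLane = IsUniform ? 1 : VF;
  unsigned Count = 0;
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      ++Count; // one scalar copy per (Part, Lane)
  return Count; // e.g. UF=2, VF=4: 8 copies, or 2 if uniform
}
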
7131
7132
948
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7133
948
  assert(State.Instance && "Branch on Mask works only on single instance.");
7134
948
7135
948
  unsigned Part = State.Instance->Part;
7136
948
  unsigned Lane = State.Instance->Lane;
7137
948
7138
948
  Value *ConditionBit = nullptr;
7139
948
  if (!User) // Block in mask is all-one.
7140
0
    ConditionBit = State.Builder.getTrue();
7141
948
  else {
7142
948
    VPValue *BlockInMask = User->getOperand(0);
7143
948
    ConditionBit = State.get(BlockInMask, Part);
7144
948
    if (ConditionBit->getType()->isVectorTy())
7145
560
      ConditionBit = State.Builder.CreateExtractElement(
7146
560
          ConditionBit, State.Builder.getInt32(Lane));
7147
948
  }
7148
948
7149
948
  // Replace the temporary unreachable terminator with a new conditional branch,
7150
948
  // whose two destinations will be set later when they are created.
7151
948
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7152
948
  assert(isa<UnreachableInst>(CurrentTerminator) &&
7153
948
         "Expected to replace unreachable terminator with conditional branch.");
7154
948
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7155
948
  CondBr->setSuccessor(0, nullptr);
7156
948
  ReplaceInstWithInst(CurrentTerminator, CondBr);
7157
948
}
7158
7159
512
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7160
512
  assert(State.Instance && "Predicated instruction PHI works per instance.");
7161
512
  Instruction *ScalarPredInst = cast<Instruction>(
7162
512
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
7163
512
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7164
512
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7165
512
  assert(PredicatingBB && "Predicated block has no single predecessor.");
7166
512
7167
512
  // By current pack/unpack logic we need to generate only a single phi node: if
7168
512
  // a vector value for the predicated instruction exists at this point it means
7169
512
  // the instruction has vector users only, and a phi for the vector value is
7170
512
  // needed. In this case the recipe of the predicated instruction is marked to
7171
512
  // also do that packing, thereby "hoisting" the insert-element sequence.
7172
512
  // Otherwise, a phi node for the scalar value is needed.
7173
512
  unsigned Part = State.Instance->Part;
7174
512
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7175
182
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7176
182
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7177
182
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7178
182
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7179
182
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7180
182
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7181
330
  } else {
7182
330
    Type *PredInstType = PredInst->getType();
7183
330
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7184
330
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7185
330
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
7186
330
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7187
330
  }
7188
512
}
7189
7190
21.6k
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7191
21.6k
  if (!User)
7192
21.5k
    return State.ILV->vectorizeMemoryInstruction(&Instr);
7193
95
7194
95
  // Last (and currently only) operand is a mask.
7195
95
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7196
95
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7197
316
  for (unsigned Part = 0; Part < State.UF; ++Part)
7198
221
    MaskValues[Part] = State.get(Mask, Part);
7199
95
  State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7200
95
}
7201
7202
// Process the loop in the VPlan-native vectorization path. This path builds
7203
// VPlan upfront in the vectorization pipeline, which allows applying
7204
// VPlan-to-VPlan transformations from the very beginning without modifying the
7205
// input LLVM IR.
7206
static bool processLoopInVPlanNativePath(
7207
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7208
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7209
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7210
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7211
7
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7212
7
7213
7
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7214
7
  Function *F = L->getHeader()->getParent();
7215
7
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7216
7
  LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7217
7
                                &Hints, IAI);
7218
7
  // Use the planner for outer loop vectorization.
7219
7
  // TODO: CM is not used at this point inside the planner. Turn CM into an
7220
7
  // optional argument if we don't need it in the future.
7221
7
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
7222
7
7223
7
  // Get user vectorization factor.
7224
7
  const unsigned UserVF = Hints.getWidth();
7225
7
7226
7
  // Check the function attributes and profiles to find out if this function
7227
7
  // should be optimized for size.
7228
7
  bool OptForSize =
7229
7
      Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7230
7
      
(0
F->hasOptSize()0
||
7231
0
       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
7232
7
7233
7
  // Plan how to best vectorize, return the best VF and its cost.
7234
7
  const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
7235
7
7236
7
  // If we are stress testing VPlan builds, do not attempt to generate vector
7237
7
  // code. Masked vector code generation support will follow soon.
7238
7
  // Also, do not attempt to vectorize if no vector code will be produced.
7239
7
  if (VPlanBuildStressTest || EnableVPlanPredication ||
7240
7
      VectorizationFactor::Disabled() == VF)
7241
0
    return false;
7242
7
7243
7
  LVP.setBestPlan(VF.Width, 1);
7244
7
7245
7
  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7246
7
                         &CM);
7247
7
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7248
7
                    << L->getHeader()->getParent()->getName() << "\"\n");
7249
7
  LVP.executePlan(LB, DT);
7250
7
7251
7
  // Mark the loop as already vectorized to avoid vectorizing again.
7252
7
  Hints.setAlreadyVectorized();
7253
7
7254
7
  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7255
7
  return true;
7256
7
}
7257
7258
146k
bool LoopVectorizePass::processLoop(Loop *L) {
7259
146k
  assert((EnableVPlanNativePath || L->empty()) &&
7260
146k
         "VPlan-native path is not enabled. Only process inner loops.");
7261
146k
7262
#ifndef NDEBUG
7263
  const std::string DebugLocStr = getDebugLocString(L);
7264
#endif /* NDEBUG */
7265
7266
146k
  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7267
146k
                    << L->getHeader()->getParent()->getName() << "\" from "
7268
146k
                    << DebugLocStr << "\n");
7269
146k
7270
146k
  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7271
146k
7272
146k
  LLVM_DEBUG(
7273
146k
      dbgs() << "LV: Loop hints:"
7274
146k
             << " force="
7275
146k
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7276
146k
                     ? "disabled"
7277
146k
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7278
146k
                            ? "enabled"
7279
146k
                            : "?"))
7280
146k
             << " width=" << Hints.getWidth()
7281
146k
             << " unroll=" << Hints.getInterleave() << "\n");
7282
146k
7283
146k
  // Function containing loop
7284
146k
  Function *F = L->getHeader()->getParent();
7285
146k
7286
146k
  // Looking at the diagnostic output is the only way to determine if a loop
7287
146k
  // was vectorized (other than looking at the IR or machine code), so it
7288
146k
  // is important to generate an optimization remark for each loop. Most of
7289
146k
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
7290
146k
  // generated as OptimizationRemark and OptimizationRemarkMissed are
7291
146k
  // less verbose reporting vectorized loops and unvectorized loops that may
7292
146k
  // benefit from vectorization, respectively.
7293
146k
7294
146k
  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7295
5.70k
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7296
5.70k
    return false;
7297
5.70k
  }
7298
141k
7299
141k
  PredicatedScalarEvolution PSE(*SE, *L);
7300
141k
7301
141k
  // Check if it is legal to vectorize the loop.
7302
141k
  LoopVectorizationRequirements Requirements(*ORE);
7303
141k
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7304
141k
                                &Requirements, &Hints, DB, AC);
7305
141k
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
7306
121k
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7307
121k
    Hints.emitRemarkWithHints();
7308
121k
    return false;
7309
121k
  }
7310
19.9k
7311
19.9k
  // Check the function attributes and profiles to find out if this function
7312
19.9k
  // should be optimized for size.
7313
19.9k
  bool OptForSize =
7314
19.9k
      Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7315
19.9k
      (F->hasOptSize() ||
7316
19.9k
       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
7317
19.9k
7318
19.9k
  // Entrance to the VPlan-native vectorization path. Outer loops are processed
7319
19.9k
  // here. They may require CFG and instruction level transformations before
7320
19.9k
  // even evaluating whether vectorization is profitable. Since we cannot modify
7321
19.9k
  // the incoming IR, we need to build VPlan upfront in the vectorization
7322
19.9k
  // pipeline.
7323
19.9k
  if (!L->empty())
7324
7
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7325
7
                                        ORE, BFI, PSI, Hints);
7326
19.9k
7327
19.9k
  assert(L->empty() && "Inner loop expected.");
7328
19.9k
  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7329
19.9k
  // count by optimizing for size, to minimize overheads.
7330
19.9k
  // Prefer constant trip counts over profile data, over upper bound estimate.
7331
19.9k
  unsigned ExpectedTC = 0;
7332
19.9k
  bool HasExpectedTC = false;
7333
19.9k
  if (const SCEVConstant *ConstExits =
7334
11.1k
      dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
7335
11.1k
    const APInt &ExitsCount = ConstExits->getAPInt();
7336
11.1k
    // We are interested in small values for ExpectedTC. Skip over those that
7337
11.1k
    // can't fit an unsigned.
7338
11.1k
    if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
7339
11.1k
      ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
7340
11.1k
      HasExpectedTC = true;
7341
11.1k
    }
7342
11.1k
  }
7343
19.9k
  // ExpectedTC may be large because it's bound by a variable. Check
7344
19.9k
  // profiling information to validate we should vectorize.
7345
19.9k
  if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
7346
8.82k
    auto EstimatedTC = getLoopEstimatedTripCount(L);
7347
8.82k
    if (EstimatedTC) {
7348
11
      ExpectedTC = *EstimatedTC;
7349
11
      HasExpectedTC = true;
7350
11
    }
7351
8.82k
  }
7352
19.9k
  if (!HasExpectedTC) {
7353
8.81k
    ExpectedTC = SE->getSmallConstantMaxTripCount(L);
7354
8.81k
    HasExpectedTC = (ExpectedTC > 0);
7355
8.81k
  }
7356
19.9k
7357
19.9k
  if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
7358
184
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7359
184
                      << "This loop is worth vectorizing only if no scalar "
7360
184
                      << "iteration overheads are incurred.");
7361
184
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7362
184
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7363
184
    else {
7364
182
      LLVM_DEBUG(dbgs() << "\n");
7365
182
      // Loops with a very small trip count are considered for vectorization
7366
182
      // under OptForSize, thereby making sure the cost of their loop body is
7367
182
      // dominant, free of runtime guards and scalar iteration overheads.
7368
182
      OptForSize = true;
7369
182
    }
7370
184
  }
7371
19.9k
7372
19.9k
  // Check the function attributes to see if implicit floats are allowed.
7373
19.9k
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
7374
19.9k
  // an integer loop and the vector instructions selected are purely integer
7375
19.9k
  // vector instructions?
7376
19.9k
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7377
1
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
7378
1
                         "attribute is used.\n");
7379
1
    ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
7380
1
                                     "NoImplicitFloat", L)
7381
1
              << "loop not vectorized due to NoImplicitFloat attribute");
7382
1
    Hints.emitRemarkWithHints();
7383
1
    return false;
7384
1
  }
7385
19.9k
7386
19.9k
  // Check if the target supports potentially unsafe FP vectorization.
7387
19.9k
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
7388
19.9k
  // for the target we're vectorizing for, to make sure none of the
7389
19.9k
  // additional fp-math flags can help.
7390
19.9k
  if (Hints.isPotentiallyUnsafe() &&
7391
19.9k
      TTI->isFPVectorizationPotentiallyUnsafe()) {
7392
0
    LLVM_DEBUG(
7393
0
        dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
7394
0
    ORE->emit(
7395
0
        createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
7396
0
        << "loop not vectorized due to unsafe FP support.");
7397
0
    Hints.emitRemarkWithHints();
7398
0
    return false;
7399
0
  }
7400
19.9k
7401
19.9k
  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7402
19.9k
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7403
19.9k
7404
19.9k
  // If an override option has been passed in for interleaved accesses, use it.
7405
19.9k
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7406
72
    UseInterleaved = EnableInterleavedMemAccesses;
7407
19.9k
7408
19.9k
  // Analyze interleaved memory accesses.
7409
19.9k
  if (UseInterleaved) {
7410
19.4k
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7411
19.4k
  }
7412
19.9k
7413
19.9k
  // Use the cost model.
7414
19.9k
  LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
7415
19.9k
                                &Hints, IAI);
7416
19.9k
  CM.collectValuesToIgnore();
7417
19.9k
7418
19.9k
  // Use the planner for vectorization.
7419
19.9k
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
7420
19.9k
7421
19.9k
  // Get user vectorization factor.
7422
19.9k
  unsigned UserVF = Hints.getWidth();
7423
19.9k
7424
19.9k
  // Plan how to best vectorize, return the best VF and its cost.
7425
19.9k
  Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF);
7426
19.9k
7427
19.9k
  VectorizationFactor VF = VectorizationFactor::Disabled();
7428
19.9k
  unsigned IC = 1;
7429
19.9k
  unsigned UserIC = Hints.getInterleave();
7430
19.9k
7431
19.9k
  if (MaybeVF) {
7432
19.8k
    VF = *MaybeVF;
7433
19.8k
    // Select the interleave count.
7434
19.8k
    IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
7435
19.8k
  }
7436
19.9k
7437
19.9k
  // Identify the diagnostic messages that should be produced.
7438
19.9k
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7439
19.9k
  bool VectorizeLoop = true, InterleaveLoop = true;
7440
19.9k
  if (Requirements.doesNotMeet(F, L, Hints)) {
7441
1.49k
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7442
1.49k
                         "requirements.\n");
7443
1.49k
    Hints.emitRemarkWithHints();
7444
1.49k
    return false;
7445
1.49k
  }
7446
18.4k
7447
18.4k
  if (VF.Width == 1) {
7448
3.15k
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7449
3.15k
    VecDiagMsg = std::make_pair(
7450
3.15k
        "VectorizationNotBeneficial",
7451
3.15k
        "the cost-model indicates that vectorization is not beneficial");
7452
3.15k
    VectorizeLoop = false;
7453
3.15k
  }
7454
18.4k
7455
18.4k
  if (!MaybeVF && UserIC > 1) {
7456
1
    // Tell the user interleaving was avoided up-front, despite being explicitly
7457
1
    // requested.
7458
1
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7459
1
                         "interleaving should be avoided up front\n");
7460
1
    IntDiagMsg = std::make_pair(
7461
1
        "InterleavingAvoided",
7462
1
        "Ignoring UserIC, because interleaving was avoided up front");
7463
1
    InterleaveLoop = false;
7464
18.4k
  } else if (IC == 1 && UserIC <= 1) {
7465
2.81k
    // Tell the user interleaving is not beneficial.
7466
2.81k
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7467
2.81k
    IntDiagMsg = std::make_pair(
7468
2.81k
        "InterleavingNotBeneficial",
7469
2.81k
        "the cost-model indicates that interleaving is not beneficial");
7470
2.81k
    InterleaveLoop = false;
7471
2.81k
    if (UserIC == 1) {
7472
411
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7473
411
      IntDiagMsg.second +=
7474
411
          " and is explicitly disabled or interleave count is set to 1";
7475
411
    }
7476
15.6k
  } else if (IC > 1 && UserIC == 1) {
7477
118
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
7478
118
    LLVM_DEBUG(
7479
118
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7480
118
    IntDiagMsg = std::make_pair(
7481
118
        "InterleavingBeneficialButDisabled",
7482
118
        "the cost-model indicates that interleaving is beneficial "
7483
118
        "but is explicitly disabled or interleave count is set to 1");
7484
118
    InterleaveLoop = false;
7485
118
  }
7486
18.4k
7487
18.4k
  // Override IC if user provided an interleave count.
7488
18.4k
  IC = UserIC > 0 ? UserIC : IC;
7489
18.4k
7490
18.4k
  // Emit diagnostic messages, if any.
7491
18.4k
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
7492
18.4k
  if (!VectorizeLoop && !InterleaveLoop) {
7493
1.40k
    // Do not vectorize or interleave the loop.
7494
1.40k
    ORE->emit([&]() {
7495
5
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7496
5
                                      L->getStartLoc(), L->getHeader())
7497
5
             << VecDiagMsg.second;
7498
5
    });
7499
1.40k
    ORE->emit([&]() {
7500
5
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7501
5
                                      L->getStartLoc(), L->getHeader())
7502
5
             << IntDiagMsg.second;
7503
5
    });
7504
1.40k
    return false;
7505
17.0k
  } else if (!VectorizeLoop && InterleaveLoop) {
7506
1.74k
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7507
1.74k
    ORE->emit([&]() {
7508
2
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7509
2
                                        L->getStartLoc(), L->getHeader())
7510
2
             << VecDiagMsg.second;
7511
2
    });
7512
15.3k
  } else if (VectorizeLoop && !InterleaveLoop) {
7513
1.52k
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7514
1.52k
                      << ") in " << DebugLocStr << '\n');
7515
1.52k
    ORE->emit([&]() {
7516
5
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7517
5
                                        L->getStartLoc(), L->getHeader())
7518
5
             << IntDiagMsg.second;
7519
5
    });
7520
13.7k
  } else if (VectorizeLoop && InterleaveLoop) {
7521
13.7k
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7522
13.7k
                      << ") in " << DebugLocStr << '\n');
7523
13.7k
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7524
13.7k
  }
7525
18.4k
7526
18.4k
  LVP.setBestPlan(VF.Width, IC);
7527
17.0k
7528
17.0k
  using namespace ore;
7529
17.0k
  bool DisableRuntimeUnroll = false;
7530
17.0k
  MDNode *OrigLoopID = L->getLoopID();
7531
17.0k
7532
17.0k
  if (!VectorizeLoop) {
7533
1.74k
    assert(IC > 1 && "interleave count should not be 1 or 0");
7534
1.74k
    // If we decided that it is not legal to vectorize the loop, then
7535
1.74k
    // interleave it.
7536
1.74k
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7537
1.74k
                               &CM);
7538
1.74k
    LVP.executePlan(Unroller, DT);
7539
1.74k
7540
1.74k
    ORE->emit([&]() {
7541
2
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7542
2
                                L->getHeader())
7543
2
             << "interleaved loop (interleaved count: "
7544
2
             << NV("InterleaveCount", IC) << ")";
7545
2
    });
7546
15.3k
  } else {
7547
15.3k
    // If we decided that it is *legal* to vectorize the loop, then do it.
7548
15.3k
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7549
15.3k
                           &LVL, &CM);
7550
15.3k
    LVP.executePlan(LB, DT);
7551
15.3k
    ++LoopsVectorized;
7552
15.3k
7553
15.3k
    // Add metadata to disable runtime unrolling a scalar loop when there are
7554
15.3k
    // no runtime checks about strides and memory. A scalar loop that is
7555
15.3k
    // rarely used is not worth unrolling.
7556
15.3k
    if (!LB.areSafetyChecksAdded())
7557
12.6k
      DisableRuntimeUnroll = true;
7558
15.3k
7559
15.3k
    // Report the vectorization decision.
7560
15.3k
    ORE->emit([&]() {
7561
12
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7562
12
                                L->getHeader())
7563
12
             << "vectorized loop (vectorization width: "
7564
12
             << NV("VectorizationFactor", VF.Width)
7565
12
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7566
12
    });
7567
15.3k
  }
7568
17.0k
7569
17.0k
  Optional<MDNode *> RemainderLoopID =
7570
17.0k
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7571
17.0k
                                      LLVMLoopVectorizeFollowupEpilogue});
7572
17.0k
  if (RemainderLoopID.hasValue()) {
7573
1
    L->setLoopID(RemainderLoopID.getValue());
7574
17.0k
  } else {
7575
17.0k
    if (DisableRuntimeUnroll)
7576
12.6k
      AddRuntimeUnrollDisableMetaData(L);
7577
17.0k
7578
17.0k
    // Mark the loop as already vectorized to avoid vectorizing again.
7579
17.0k
    Hints.setAlreadyVectorized();
7580
17.0k
  }
7581
17.0k
7582
17.0k
  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7583
17.0k
  return true;
7584
18.4k
}
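
processLoop obtains the user's width and interleave count from LoopVectorizeHints, which reads llvm.loop metadata such as that produced by clang's loop pragmas. As an illustrative source-level input (the function itself is hypothetical), the following loop would reach this code with Hints.getWidth() == 4 and Hints.getInterleave() == 2:

// Hypothetical input: the pragma lowers to llvm.loop metadata that
// LoopVectorizeHints reports as width=4, interleave=2.
void saxpy(int N, float A, const float *X, float *Y) {
#pragma clang loop vectorize_width(4) interleave_count(2)
  for (int i = 0; i < N; ++i)
    Y[i] = A * X[i] + Y[i];
}
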
7585
7586
bool LoopVectorizePass::runImpl(
7587
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7588
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7589
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7590
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7591
280k
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7592
280k
  SE = &SE_;
7593
280k
  LI = &LI_;
7594
280k
  TTI = &TTI_;
7595
280k
  DT = &DT_;
7596
280k
  BFI = &BFI_;
7597
280k
  TLI = TLI_;
7598
280k
  AA = &AA_;
7599
280k
  AC = &AC_;
7600
280k
  GetLAA = &GetLAA_;
7601
280k
  DB = &DB_;
7602
280k
  ORE = &ORE_;
7603
280k
  PSI = PSI_;
7604
280k
7605
280k
  // Don't attempt if
7606
280k
  // 1. the target claims to have no vector registers, and
7607
280k
  // 2. interleaving won't help ILP.
7608
280k
  //
7609
280k
  // The second condition is necessary because, even if the target has no
7610
280k
  // vector registers, loop vectorization may still enable scalar
7611
280k
  // interleaving.
7612
280k
  if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7613
2.17k
    return false;
7614
278k
7615
278k
  bool Changed = false;
7616
278k
7617
278k
  // The vectorizer requires loops to be in simplified form.
7618
278k
  // Since simplification may add new inner loops, it has to run before the
7619
278k
  // legality and profitability checks. This means running the loop vectorizer
7620
278k
  // will simplify all loops, regardless of whether anything end up being
7621
278k
  // vectorized.
7622
278k
  for (auto &L : *LI)
7623
129k
    Changed |=
7624
129k
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7625
278k
7626
278k
  // Build up a worklist of inner-loops to vectorize. This is necessary as
7627
278k
  // the act of vectorizing or partially unrolling a loop creates new loops
7628
278k
  // and can invalidate iterators across the loops.
7629
278k
  SmallVector<Loop *, 8> Worklist;
7630
278k
7631
278k
  for (Loop *L : *LI)
7632
129k
    collectSupportedLoops(*L, LI, ORE, Worklist);
7633
278k
7634
278k
  LoopsAnalyzed += Worklist.size();
7635
278k
7636
278k
  // Now walk the identified inner loops.
7637
425k
  while (!Worklist.empty()) {
7638
146k
    Loop *L = Worklist.pop_back_val();
7639
146k
7640
146k
    // For the inner loops we actually process, form LCSSA to simplify the
7641
146k
    // transform.
7642
146k
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7643
146k
7644
146k
    Changed |= processLoop(L);
7645
146k
  }
7646
278k
7647
278k
  // Process each loop nest in the function.
7648
278k
  return Changed;
7649
278k
}
7650
7651
PreservedAnalyses LoopVectorizePass::run(Function &F,
7652
887
                                         FunctionAnalysisManager &AM) {
7653
887
    auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7654
887
    auto &LI = AM.getResult<LoopAnalysis>(F);
7655
887
    auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7656
887
    auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7657
887
    auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7658
887
    auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7659
887
    auto &AA = AM.getResult<AAManager>(F);
7660
887
    auto &AC = AM.getResult<AssumptionAnalysis>(F);
7661
887
    auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7662
887
    auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7663
887
    MemorySSA *MSSA = EnableMSSALoopDependency
7664
887
                          ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7665
887
                          : nullptr;
7666
887
7667
887
    auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7668
887
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7669
887
        [&](Loop &L) -> const LoopAccessInfo & {
7670
23
      LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7671
23
      return LAM.getResult<LoopAccessAnalysis>(L, AR);
7672
23
    };
7673
887
    const ModuleAnalysisManager &MAM =
7674
887
        AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7675
887
    ProfileSummaryInfo *PSI =
7676
887
        MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
7677
887
    bool Changed =
7678
887
        runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
7679
887
    if (!Changed)
7680
868
      return PreservedAnalyses::all();
7681
19
    PreservedAnalyses PA;
7682
19
7683
19
    // We currently do not preserve loopinfo/dominator analyses with outer loop
7684
19
    // vectorization. Until this is addressed, mark these analyses as preserved
7685
19
    // only for non-VPlan-native path.
7686
19
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7687
19
    if (!EnableVPlanNativePath) {
7688
19
      PA.preserve<LoopAnalysis>();
7689
19
      PA.preserve<DominatorTreeAnalysis>();
7690
19
    }
7691
19
    PA.preserve<BasicAA>();
7692
19
    PA.preserve<GlobalsAA>();
7693
19
    return PA;
7694
19
}
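
LoopVectorizePass::run is the new-pass-manager entry point; the usual way to exercise it in isolation is `opt -passes=loop-vectorize`. The C++ sketch below schedules it through a PassBuilder-built pipeline and assumes the default-constructible registration used by this LLVM version:

// Minimal sketch, under the assumptions stated above: build the standard
// analysis managers, add LoopVectorizePass to a FunctionPassManager, and run
// it over a module.
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

using namespace llvm;

void runLoopVectorize(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(LoopVectorizePass()); // the pass implemented in this file

  ModulePassManager MPM;
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);
}
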