Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Line | Count | Source
1
//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10
// and generates target-independent LLVM-IR.
11
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12
// of instructions in order to estimate the profitability of vectorization.
13
//
14
// The loop vectorizer combines consecutive loop iterations into a single
15
// 'wide' iteration. After this transformation the index is incremented
16
// by the SIMD vector width, and not by one.
17
//
18
// This pass has four parts:
19
// 1. The main loop pass that drives the different parts.
20
// 2. LoopVectorizationLegality - A unit that checks for the legality
21
//    of the vectorization.
22
// 3. InnerLoopVectorizer - A unit that performs the actual
23
//    widening of instructions.
24
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25
//    of vectorization. It decides on the optimal vector width, which
26
//    can be one, if vectorization is not profitable.
27
//
28
// There is an ongoing development effort to migrate the loop vectorizer to the
29
// VPlan infrastructure and to introduce outer loop vectorization support (see
30
// docs/Proposal/VectorizationPlan.rst and
31
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32
// purpose, we temporarily introduced the VPlan-native vectorization path: an
33
// alternative vectorization path that is natively implemented on top of the
34
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35
//
36
//===----------------------------------------------------------------------===//
37
//
38
// The reduction-variable vectorization is based on the paper:
39
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40
//
41
// Variable uniformity checks are inspired by:
42
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43
//
44
// The interleaved access vectorization is based on the paper:
45
//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46
//  Data for SIMD
47
//
48
// Other ideas/concepts are from:
49
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50
//
51
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52
//  Vectorizing Compilers.
53
//
54
//===----------------------------------------------------------------------===//
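//
// [Editor's illustrative sketch -- not part of the original file.] With VF = 4,
// a scalar loop such as
//
//     for (int i = 0; i < n; ++i)
//       a[i] = b[i] + 1;
//
// becomes one 'wide' iteration that loads b[i..i+3], adds the splat
// <1, 1, 1, 1>, stores a[i..i+3], and advances the induction variable by 4.
// Iterations left over when n is not a multiple of 4 run in the scalar
// epilogue loop handled by InnerLoopVectorizer below.
//
//===----------------------------------------------------------------------===//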
55
56
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57
#include "LoopVectorizationPlanner.h"
58
#include "VPRecipeBuilder.h"
59
#include "VPlan.h"
60
#include "VPlanHCFGBuilder.h"
61
#include "VPlanHCFGTransforms.h"
62
#include "VPlanPredicator.h"
63
#include "llvm/ADT/APInt.h"
64
#include "llvm/ADT/ArrayRef.h"
65
#include "llvm/ADT/DenseMap.h"
66
#include "llvm/ADT/DenseMapInfo.h"
67
#include "llvm/ADT/Hashing.h"
68
#include "llvm/ADT/MapVector.h"
69
#include "llvm/ADT/None.h"
70
#include "llvm/ADT/Optional.h"
71
#include "llvm/ADT/STLExtras.h"
72
#include "llvm/ADT/SetVector.h"
73
#include "llvm/ADT/SmallPtrSet.h"
74
#include "llvm/ADT/SmallVector.h"
75
#include "llvm/ADT/Statistic.h"
76
#include "llvm/ADT/StringRef.h"
77
#include "llvm/ADT/Twine.h"
78
#include "llvm/ADT/iterator_range.h"
79
#include "llvm/Analysis/AssumptionCache.h"
80
#include "llvm/Analysis/BasicAliasAnalysis.h"
81
#include "llvm/Analysis/BlockFrequencyInfo.h"
82
#include "llvm/Analysis/CFG.h"
83
#include "llvm/Analysis/CodeMetrics.h"
84
#include "llvm/Analysis/DemandedBits.h"
85
#include "llvm/Analysis/GlobalsModRef.h"
86
#include "llvm/Analysis/LoopAccessAnalysis.h"
87
#include "llvm/Analysis/LoopAnalysisManager.h"
88
#include "llvm/Analysis/LoopInfo.h"
89
#include "llvm/Analysis/LoopIterator.h"
90
#include "llvm/Analysis/MemorySSA.h"
91
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
92
#include "llvm/Analysis/ProfileSummaryInfo.h"
93
#include "llvm/Analysis/ScalarEvolution.h"
94
#include "llvm/Analysis/ScalarEvolutionExpander.h"
95
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96
#include "llvm/Analysis/TargetLibraryInfo.h"
97
#include "llvm/Analysis/TargetTransformInfo.h"
98
#include "llvm/Analysis/VectorUtils.h"
99
#include "llvm/IR/Attributes.h"
100
#include "llvm/IR/BasicBlock.h"
101
#include "llvm/IR/CFG.h"
102
#include "llvm/IR/Constant.h"
103
#include "llvm/IR/Constants.h"
104
#include "llvm/IR/DataLayout.h"
105
#include "llvm/IR/DebugInfoMetadata.h"
106
#include "llvm/IR/DebugLoc.h"
107
#include "llvm/IR/DerivedTypes.h"
108
#include "llvm/IR/DiagnosticInfo.h"
109
#include "llvm/IR/Dominators.h"
110
#include "llvm/IR/Function.h"
111
#include "llvm/IR/IRBuilder.h"
112
#include "llvm/IR/InstrTypes.h"
113
#include "llvm/IR/Instruction.h"
114
#include "llvm/IR/Instructions.h"
115
#include "llvm/IR/IntrinsicInst.h"
116
#include "llvm/IR/Intrinsics.h"
117
#include "llvm/IR/LLVMContext.h"
118
#include "llvm/IR/Metadata.h"
119
#include "llvm/IR/Module.h"
120
#include "llvm/IR/Operator.h"
121
#include "llvm/IR/Type.h"
122
#include "llvm/IR/Use.h"
123
#include "llvm/IR/User.h"
124
#include "llvm/IR/Value.h"
125
#include "llvm/IR/ValueHandle.h"
126
#include "llvm/IR/Verifier.h"
127
#include "llvm/Pass.h"
128
#include "llvm/Support/Casting.h"
129
#include "llvm/Support/CommandLine.h"
130
#include "llvm/Support/Compiler.h"
131
#include "llvm/Support/Debug.h"
132
#include "llvm/Support/ErrorHandling.h"
133
#include "llvm/Support/MathExtras.h"
134
#include "llvm/Support/raw_ostream.h"
135
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
136
#include "llvm/Transforms/Utils/LoopSimplify.h"
137
#include "llvm/Transforms/Utils/LoopUtils.h"
138
#include "llvm/Transforms/Utils/LoopVersioning.h"
139
#include "llvm/Transforms/Utils/SizeOpts.h"
140
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141
#include <algorithm>
142
#include <cassert>
143
#include <cstdint>
144
#include <cstdlib>
145
#include <functional>
146
#include <iterator>
147
#include <limits>
148
#include <memory>
149
#include <string>
150
#include <tuple>
151
#include <utility>
152
#include <vector>
153
154
using namespace llvm;
155
156
24
#define LV_NAME "loop-vectorize"
157
#define DEBUG_TYPE LV_NAME
158
159
/// @{
160
/// Metadata attribute names
161
static const char *const LLVMLoopVectorizeFollowupAll =
162
    "llvm.loop.vectorize.followup_all";
163
static const char *const LLVMLoopVectorizeFollowupVectorized =
164
    "llvm.loop.vectorize.followup_vectorized";
165
static const char *const LLVMLoopVectorizeFollowupEpilogue =
166
    "llvm.loop.vectorize.followup_epilogue";
167
/// @}
168
169
STATISTIC(LoopsVectorized, "Number of loops vectorized");
170
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171
172
/// Loops with a known constant trip count below this number are vectorized only
173
/// if no scalar iteration overheads are incurred.
174
static cl::opt<unsigned> TinyTripCountVectorThreshold(
175
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176
    cl::desc("Loops with a constant trip count that is smaller than this "
177
             "value are vectorized only if no scalar iteration overheads "
178
             "are incurred."));
179
180
static cl::opt<bool> MaximizeBandwidth(
181
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
182
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
183
             "will be determined by the smallest type in loop."));
184
185
static cl::opt<bool> EnableInterleavedMemAccesses(
186
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
187
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
188
189
/// An interleave-group may need masking if it resides in a block that needs
190
/// predication, or in order to mask away gaps. 
191
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
192
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
193
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
194
195
/// We don't interleave loops with a known constant trip count below this
196
/// number.
197
static const unsigned TinyTripCountInterleaveThreshold = 128;
198
199
static cl::opt<unsigned> ForceTargetNumScalarRegs(
200
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
201
    cl::desc("A flag that overrides the target's number of scalar registers."));
202
203
static cl::opt<unsigned> ForceTargetNumVectorRegs(
204
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
205
    cl::desc("A flag that overrides the target's number of vector registers."));
206
207
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
208
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
209
    cl::desc("A flag that overrides the target's max interleave factor for "
210
             "scalar loops."));
211
212
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
213
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
214
    cl::desc("A flag that overrides the target's max interleave factor for "
215
             "vectorized loops."));
216
217
static cl::opt<unsigned> ForceTargetInstructionCost(
218
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
219
    cl::desc("A flag that overrides the target's expected cost for "
220
             "an instruction to a single constant value. Mostly "
221
             "useful for getting consistent testing."));
222
223
static cl::opt<unsigned> SmallLoopCost(
224
    "small-loop-cost", cl::init(20), cl::Hidden,
225
    cl::desc(
226
        "The cost of a loop that is considered 'small' by the interleaver."));
227
228
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
229
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
230
    cl::desc("Enable the use of the block frequency analysis to access PGO "
231
             "heuristics minimizing code growth in cold regions and being more "
232
             "aggressive in hot regions."));
233
234
// Runtime interleave loops for load/store throughput.
235
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
236
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
237
    cl::desc(
238
        "Enable runtime interleaving until load/store ports are saturated"));
239
240
/// The number of stores in a loop that are allowed to need predication.
241
static cl::opt<unsigned> NumberOfStoresToPredicate(
242
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
243
    cl::desc("Max number of stores to be predicated behind an if."));
244
245
static cl::opt<bool> EnableIndVarRegisterHeur(
246
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
247
    cl::desc("Count the induction variable only once when interleaving"));
248
249
static cl::opt<bool> EnableCondStoresVectorization(
250
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
251
    cl::desc("Enable if predication of stores during vectorization."));
252
253
static cl::opt<unsigned> MaxNestedScalarReductionIC(
254
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
255
    cl::desc("The maximum interleave count to use when interleaving a scalar "
256
             "reduction in a nested loop."));
257
258
cl::opt<bool> EnableVPlanNativePath(
259
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
260
    cl::desc("Enable VPlan-native vectorization path with "
261
             "support for outer loop vectorization."));
262
263
// FIXME: Remove this switch once we have divergence analysis. Currently we
264
// assume divergent non-backedge branches when this switch is true.
265
cl::opt<bool> EnableVPlanPredication(
266
    "enable-vplan-predication", cl::init(false), cl::Hidden,
267
    cl::desc("Enable VPlan-native vectorization path predicator with "
268
             "support for outer loop vectorization."));
269
270
// This flag enables the stress testing of the VPlan H-CFG construction in the
271
// VPlan-native vectorization path. It must be used in conjunction with
272
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
273
// verification of the H-CFGs built.
274
static cl::opt<bool> VPlanBuildStressTest(
275
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
276
    cl::desc(
277
        "Build VPlan for every supported loop nest in the function and bail "
278
        "out right after the build (stress test the VPlan H-CFG construction "
279
        "in the VPlan-native vectorization path)."));
280
281
cl::opt<bool> llvm::EnableLoopInterleaving(
282
    "interleave-loops", cl::init(true), cl::Hidden,
283
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
284
cl::opt<bool> llvm::EnableLoopVectorization(
285
    "vectorize-loops", cl::init(true), cl::Hidden,
286
    cl::desc("Run the Loop vectorization passes"));
287
288
/// A helper function for converting Scalar types to vector types.
289
/// If the incoming type is void, we return void. If the VF is 1, we return
290
/// the scalar type.
291
633k
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
292
633k
  if (Scalar->isVoidTy() || VF == 1)
293
226k
    return Scalar;
294
406k
  return VectorType::get(Scalar, VF);
295
406k
}
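// [Editor's illustrative examples, not part of the original file.]
//   ToVectorTy(Type::getInt32Ty(Ctx), 4)  ==> <4 x i32>
//   ToVectorTy(Type::getInt32Ty(Ctx), 1)  ==> i32  (returned unchanged)
//   ToVectorTy(Type::getVoidTy(Ctx), 4)   ==> void (returned unchanged)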
296
297
/// A helper function that returns the type of loaded or stored value.
298
260k
static Type *getMemInstValueType(Value *I) {
299
260k
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
300
260k
         "Expected Load or Store instruction");
301
260k
  if (auto *LI = dyn_cast<LoadInst>(I))
302
125k
    return LI->getType();
303
134k
  return cast<StoreInst>(I)->getValueOperand()->getType();
304
134k
}
305
306
/// A helper function that returns true if the given type is irregular. The
307
/// type is irregular if its allocated size doesn't equal the store size of an
308
/// element of the corresponding vector type at the given vectorization factor.
309
45.6k
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
310
45.6k
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
311
45.6k
  // with a <VF x Ty> vector.
312
45.6k
  if (VF > 1) {
313
45.6k
    auto *VectorTy = VectorType::get(Ty, VF);
314
45.6k
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
315
45.6k
  }
316
0
317
0
  // If the vectorization factor is one, we just check if an array of type Ty
318
0
  // requires padding between elements.
319
0
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
320
0
}
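// [Editor's illustrative example, not part of the original file.] For i64 at
// VF = 4: 4 * alloc-size(8 bytes) equals the 32-byte store size of <4 x i64>,
// so the type is regular. For i1 at VF = 4: 4 * alloc-size(1 byte) = 4 bytes,
// but <4 x i1> has a store size of 1 byte, so the type is irregular and a
// plain bitcast between the array and the vector form would not be valid.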
321
322
/// A helper function that returns the reciprocal of the block probability of
323
/// predicated blocks. If we return X, we are assuming the predicated block
324
/// will execute once for every X iterations of the loop header.
325
///
326
/// TODO: We should use actual block probability here, if available. Currently,
327
///       we always assume predicated blocks have a 50% chance of executing.
328
4.29k
static unsigned getReciprocalPredBlockProb() { return 2; }
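// [Editor's note: illustrative use, not from the original file.] The cost
// model divides the cost of a predicated block by this value, i.e. it assumes
// the block runs on roughly half of the iterations:
//   BlockCost /= getReciprocalPredBlockProb(); // 'BlockCost' is hypothetical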
329
330
/// A helper function that adds a 'fast' flag to floating-point operations.
331
151k
static Value *addFastMathFlag(Value *V) {
332
151k
  if (isa<FPMathOperator>(V))
333
68
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
334
151k
  return V;
335
151k
}
336
337
1.19k
static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
338
1.19k
  if (isa<FPMathOperator>(V))
339
47
    cast<Instruction>(V)->setFastMathFlags(FMF);
340
1.19k
  return V;
341
1.19k
}
342
343
/// A helper function that returns an integer or floating-point constant with
344
/// value C.
345
70.4k
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
346
70.4k
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
347
70.4k
                           : ConstantFP::get(Ty, C);
348
70.4k
}
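// [Editor's illustrative examples, not part of the original file.]
//   getSignedIntOrFpConstant(Type::getInt32Ty(Ctx), -1) ==> i32 -1
//   getSignedIntOrFpConstant(Type::getFloatTy(Ctx), -1) ==> float -1.0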
349
350
namespace llvm {
351
352
/// InnerLoopVectorizer vectorizes loops which contain only one basic
353
/// block to a specified vectorization factor (VF).
354
/// This class performs the widening of scalars into vectors, or multiple
355
/// scalars. This class also implements the following features:
356
/// * It inserts an epilogue loop for handling loops that don't have iteration
357
///   counts that are known to be a multiple of the vectorization factor.
358
/// * It handles the code generation for reduction variables.
359
/// * Scalarization (implementation using scalars) of un-vectorizable
360
///   instructions.
361
/// InnerLoopVectorizer does not perform any vectorization-legality
362
/// checks, and relies on the caller to check for the different legality
363
/// aspects. The InnerLoopVectorizer relies on the
364
/// LoopVectorizationLegality class to provide information about the induction
365
/// and reduction variables that were found to a given vectorization factor.
366
class InnerLoopVectorizer {
367
public:
368
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
369
                      LoopInfo *LI, DominatorTree *DT,
370
                      const TargetLibraryInfo *TLI,
371
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
372
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
373
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
374
                      LoopVectorizationCostModel *CM)
375
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
376
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
377
        Builder(PSE.getSE()->getContext()),
378
17.0k
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
379
17.0k
  virtual ~InnerLoopVectorizer() = default;
380
381
  /// Create a new empty loop. Unlink the old loop and connect the new one.
382
  /// Return the pre-header block of the new loop.
383
  BasicBlock *createVectorizedLoopSkeleton();
384
385
  /// Widen a single instruction within the innermost loop.
386
  void widenInstruction(Instruction &I);
387
388
  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
389
  void fixVectorizedLoop();
390
391
  // Return true if any runtime check is added.
392
15.3k
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }
393
394
  /// A type for vectorized values in the new loop. Each value from the
395
  /// original loop, when vectorized, is represented by UF vector values in the
396
  /// new unrolled loop, where UF is the unroll factor.
397
  using VectorParts = SmallVector<Value *, 2>;
398
399
  /// Vectorize a single PHINode in a block. This method handles the induction
400
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
401
  /// arbitrary length vectors.
402
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
403
404
  /// A helper function to scalarize a single Instruction in the innermost loop.
405
  /// Generates a sequence of scalar instances for each lane between \p MinLane
406
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
407
  /// inclusive.
408
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
409
                            bool IfPredicateInstr);
410
411
  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
412
  /// is provided, the integer induction variable will first be truncated to
413
  /// the corresponding type.
414
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
415
416
  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
417
  /// vector or scalar value on-demand if one is not yet available. When
418
  /// vectorizing a loop, we visit the definition of an instruction before its
419
  /// uses. When visiting the definition, we either vectorize or scalarize the
420
  /// instruction, creating an entry for it in the corresponding map. (In some
421
  /// cases, such as induction variables, we will create both vector and scalar
422
  /// entries.) Then, as we encounter uses of the definition, we derive values
423
  /// for each scalar or vector use unless such a value is already available.
424
  /// For example, if we scalarize a definition and one of its uses is vector,
425
  /// we build the required vector on-demand with an insertelement sequence
426
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
427
  /// existing scalar definition.
428
  ///
429
  /// Return a value in the new loop corresponding to \p V from the original
430
  /// loop at unroll index \p Part. If the value has already been vectorized,
431
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
432
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
433
  /// a new vector value on-demand by inserting the scalar values into a vector
434
  /// with an insertelement sequence. If the value has been neither vectorized
435
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
436
  /// value into a vector.
437
  Value *getOrCreateVectorValue(Value *V, unsigned Part);
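  // [Editor's illustrative IR sketch, not part of the original file.] If a
  // definition %d was scalarized for VF = 4 and a vector use is encountered,
  // the vector value is built on demand roughly as:
  //   %v.0 = insertelement <4 x i32> undef, i32 %d.lane0, i32 0
  //   %v.1 = insertelement <4 x i32> %v.0,  i32 %d.lane1, i32 1
  //   %v.2 = insertelement <4 x i32> %v.1,  i32 %d.lane2, i32 2
  //   %v.3 = insertelement <4 x i32> %v.2,  i32 %d.lane3, i32 3
  // (the %d.laneN names are hypothetical; they stand for the per-lane scalars.)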
438
439
  /// Return a value in the new loop corresponding to \p V from the original
440
  /// loop at unroll and vector indices \p Instance. If the value has been
441
  /// vectorized but not scalarized, the necessary extractelement instruction
442
  /// will be generated.
443
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
444
445
  /// Construct the vector value of a scalarized value \p V one lane at a time.
446
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
447
448
  /// Try to vectorize the interleaved access group that \p Instr belongs to,
449
  /// optionally masking the vector operations if \p BlockInMask is non-null.
450
  void vectorizeInterleaveGroup(Instruction *Instr,
451
                                VectorParts *BlockInMask = nullptr);
452
453
  /// Vectorize Load and Store instructions, optionally masking the vector
454
  /// operations if \p BlockInMask is non-null.
455
  void vectorizeMemoryInstruction(Instruction *Instr,
456
                                  VectorParts *BlockInMask = nullptr);
457
458
  /// Set the debug location in the builder using the debug location in
459
  /// the instruction.
460
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
461
462
  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
463
  void fixNonInductionPHIs(void);
464
465
protected:
466
  friend class LoopVectorizationPlanner;
467
468
  /// A small list of PHINodes.
469
  using PhiVector = SmallVector<PHINode *, 4>;
470
471
  /// A type for scalarized values in the new loop. Each value from the
472
  /// original loop, when scalarized, is represented by UF x VF scalar values
473
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
474
  /// vectorization factor.
475
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
476
477
  /// Set up the values of the IVs correctly when exiting the vector loop.
478
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
479
                    Value *CountRoundDown, Value *EndValue,
480
                    BasicBlock *MiddleBlock);
481
482
  /// Create a new induction variable inside L.
483
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
484
                                   Value *Step, Instruction *DL);
485
486
  /// Handle all cross-iteration phis in the header.
487
  void fixCrossIterationPHIs();
488
489
  /// Fix a first-order recurrence. This is the second phase of vectorizing
490
  /// this phi node.
491
  void fixFirstOrderRecurrence(PHINode *Phi);
492
493
  /// Fix a reduction cross-iteration phi. This is the second phase of
494
  /// vectorizing this phi node.
495
  void fixReduction(PHINode *Phi);
496
497
  /// The Loop exit block may have single value PHI nodes with some
498
  /// incoming value. While vectorizing we only handled real values
499
  /// that were defined inside the loop and we should have one value for
500
  /// each predecessor of its parent basic block. See PR14725.
501
  void fixLCSSAPHIs();
502
503
  /// Iteratively sink the scalarized operands of a predicated instruction into
504
  /// the block that was created for it.
505
  void sinkScalarOperands(Instruction *PredInst);
506
507
  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
508
  /// represented as.
509
  void truncateToMinimalBitwidths();
510
511
  /// Insert the new loop to the loop hierarchy and pass manager
512
  /// and update the analysis passes.
513
  void updateAnalysis();
514
515
  /// Create a broadcast instruction. This method generates a broadcast
516
  /// instruction (shuffle) for loop invariant values and for the induction
517
  /// value. If this is the induction variable then we extend it to N, N+1, ...
518
  /// this is needed because each iteration in the loop corresponds to a SIMD
519
  /// element.
520
  virtual Value *getBroadcastInstrs(Value *V);
521
522
  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
523
  /// to each vector element of Val. The sequence starts at StartIndex.
524
  /// \p Opcode is relevant for FP induction variable.
525
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
526
                               Instruction::BinaryOps Opcode =
527
                               Instruction::BinaryOpsEnd);
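  // [Editor's illustrative example, not part of the original file.] For an
  // integer IV with VF = 4, Val = splat(%base), StartIdx = 0 and Step = 1,
  // the result is %base + <0, 1, 2, 3>, i.e. one lane per scalar iteration.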
528
529
  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
530
  /// variable on which to base the steps, \p Step is the size of the step, and
531
  /// \p EntryVal is the value from the original loop that maps to the steps.
532
  /// Note that \p EntryVal doesn't have to be an induction variable - it
533
  /// can also be a truncate instruction.
534
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
535
                        const InductionDescriptor &ID);
536
537
  /// Create a vector induction phi node based on an existing scalar one. \p
538
  /// EntryVal is the value from the original loop that maps to the vector phi
539
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
540
  /// truncate instruction, instead of widening the original IV, we widen a
541
  /// version of the IV truncated to \p EntryVal's type.
542
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
543
                                       Value *Step, Instruction *EntryVal);
544
545
  /// Returns true if an instruction \p I should be scalarized instead of
546
  /// vectorized for the chosen vectorization factor.
547
  bool shouldScalarizeInstruction(Instruction *I) const;
548
549
  /// Returns true if we should generate a scalar version of \p IV.
550
  bool needsScalarInduction(Instruction *IV) const;
551
552
  /// If there is a cast involved in the induction variable \p ID, which should
553
  /// be ignored in the vectorized loop body, this function records the
554
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
555
  /// cast. We had already proved that the casted Phi is equal to the uncasted
556
  /// Phi in the vectorized loop (under a runtime guard), and therefore
557
  /// there is no need to vectorize the cast - the same value can be used in the
558
  /// vector loop for both the Phi and the cast.
559
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
560
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
561
  ///
562
  /// \p EntryVal is the value from the original loop that maps to the vector
563
  /// phi node and is used to distinguish what is the IV currently being
564
  /// processed - original one (if \p EntryVal is a phi corresponding to the
565
  /// original IV) or the "newly-created" one based on the proof mentioned above
566
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
567
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
568
  /// that IV, but it's error-prone to expect callers of this routine to care
569
  /// about that, hence this explicit parameter.
570
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
571
                                             const Instruction *EntryVal,
572
                                             Value *VectorLoopValue,
573
                                             unsigned Part,
574
                                             unsigned Lane = UINT_MAX);
575
576
  /// Generate a shuffle sequence that will reverse the vector Vec.
577
  virtual Value *reverseVector(Value *Vec);
578
579
  /// Returns (and creates if needed) the original loop trip count.
580
  Value *getOrCreateTripCount(Loop *NewLoop);
581
582
  /// Returns (and creates if needed) the trip count of the widened loop.
583
  Value *getOrCreateVectorTripCount(Loop *NewLoop);
584
585
  /// Returns a bitcasted value to the requested vector type.
586
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
587
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
588
                                const DataLayout &DL);
589
590
  /// Emit a bypass check to see if the vector trip count is zero, including if
591
  /// it overflows.
592
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
593
594
  /// Emit a bypass check to see if all of the SCEV assumptions we've
595
  /// had to make are correct.
596
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
597
598
  /// Emit bypass checks to check any memory assumptions we may have made.
599
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
600
601
  /// Compute the transformed value of Index at offset StartValue using step
602
  /// StepValue.
603
  /// For integer induction, returns StartValue + Index * StepValue.
604
  /// For pointer induction, returns StartValue[Index * StepValue].
605
  /// FIXME: The newly created binary instructions should contain nsw/nuw
606
  /// flags, which can be found from the original scalar operations.
607
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
608
                              const DataLayout &DL,
609
                              const InductionDescriptor &ID) const;
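  // [Editor's illustrative example, not part of the original file.] For an
  // integer induction with StartValue = 7 and StepValue = 2, Index = 5 maps to
  // 7 + 5 * 2 = 17; a pointer induction instead yields &StartValue[5 * 2].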
610
611
  /// Add additional metadata to \p To that was not present on \p Orig.
612
  ///
613
  /// Currently this is used to add the noalias annotations based on the
614
  /// inserted memchecks.  Use this for instructions that are *cloned* into the
615
  /// vector loop.
616
  void addNewMetadata(Instruction *To, const Instruction *Orig);
617
618
  /// Add metadata from one instruction to another.
619
  ///
620
  /// This includes both the original MDs from \p From and additional ones (\see
621
  /// addNewMetadata).  Use this for *newly created* instructions in the vector
622
  /// loop.
623
  void addMetadata(Instruction *To, Instruction *From);
624
625
  /// Similar to the previous function but it adds the metadata to a
626
  /// vector of instructions.
627
  void addMetadata(ArrayRef<Value *> To, Instruction *From);
628
629
  /// The original loop.
630
  Loop *OrigLoop;
631
632
  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
633
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
634
  /// more usable form.
635
  PredicatedScalarEvolution &PSE;
636
637
  /// Loop Info.
638
  LoopInfo *LI;
639
640
  /// Dominator Tree.
641
  DominatorTree *DT;
642
643
  /// Alias Analysis.
644
  AliasAnalysis *AA;
645
646
  /// Target Library Info.
647
  const TargetLibraryInfo *TLI;
648
649
  /// Target Transform Info.
650
  const TargetTransformInfo *TTI;
651
652
  /// Assumption Cache.
653
  AssumptionCache *AC;
654
655
  /// Interface to emit optimization remarks.
656
  OptimizationRemarkEmitter *ORE;
657
658
  /// LoopVersioning.  It's only set up (non-null) if memchecks were
659
  /// used.
660
  ///
661
  /// This is currently only used to add no-alias metadata based on the
662
  /// memchecks.  The actual versioning is performed manually.
663
  std::unique_ptr<LoopVersioning> LVer;
664
665
  /// The vectorization SIMD factor to use. Each vector will have this many
666
  /// vector elements.
667
  unsigned VF;
668
669
  /// The vectorization unroll factor to use. Each scalar is vectorized to this
670
  /// many different vector instructions.
671
  unsigned UF;
672
673
  /// The builder that we use
674
  IRBuilder<> Builder;
675
676
  // --- Vectorization state ---
677
678
  /// The vector-loop preheader.
679
  BasicBlock *LoopVectorPreHeader;
680
681
  /// The scalar-loop preheader.
682
  BasicBlock *LoopScalarPreHeader;
683
684
  /// Middle Block between the vector and the scalar.
685
  BasicBlock *LoopMiddleBlock;
686
687
  /// The ExitBlock of the scalar loop.
688
  BasicBlock *LoopExitBlock;
689
690
  /// The vector loop body.
691
  BasicBlock *LoopVectorBody;
692
693
  /// The scalar loop body.
694
  BasicBlock *LoopScalarBody;
695
696
  /// A list of all bypass blocks. The first block is the entry of the loop.
697
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;
698
699
  /// The new Induction variable which was added to the new block.
700
  PHINode *Induction = nullptr;
701
702
  /// The induction variable of the old basic block.
703
  PHINode *OldInduction = nullptr;
704
705
  /// Maps values from the original loop to their corresponding values in the
706
  /// vectorized loop. A key value can map to either vector values, scalar
707
  /// values or both kinds of values, depending on whether the key was
708
  /// vectorized and scalarized.
709
  VectorizerValueMap VectorLoopValueMap;
710
711
  /// Store instructions that were predicated.
712
  SmallVector<Instruction *, 4> PredicatedInstructions;
713
714
  /// Trip count of the original loop.
715
  Value *TripCount = nullptr;
716
717
  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
718
  Value *VectorTripCount = nullptr;
719
720
  /// The legality analysis.
721
  LoopVectorizationLegality *Legal;
722
723
  /// The profitability analysis.
724
  LoopVectorizationCostModel *Cost;
725
726
  // Record whether runtime checks are added.
727
  bool AddedSafetyChecks = false;
728
729
  // Holds the end values for each induction variable. We save the end values
730
  // so we can later fix-up the external users of the induction variables.
731
  DenseMap<PHINode *, Value *> IVEndValues;
732
733
  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
734
  // fixed up at the end of vector code generation.
735
  SmallVector<PHINode *, 8> OrigPHIsToFix;
736
};
737
738
class InnerLoopUnroller : public InnerLoopVectorizer {
739
public:
740
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
741
                    LoopInfo *LI, DominatorTree *DT,
742
                    const TargetLibraryInfo *TLI,
743
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
744
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
745
                    LoopVectorizationLegality *LVL,
746
                    LoopVectorizationCostModel *CM)
747
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
748
1.74k
                            UnrollFactor, LVL, CM) {}
749
750
private:
751
  Value *getBroadcastInstrs(Value *V) override;
752
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
753
                       Instruction::BinaryOps Opcode =
754
                       Instruction::BinaryOpsEnd) override;
755
  Value *reverseVector(Value *Vec) override;
756
};
757
758
} // end namespace llvm
759
760
/// Look for a meaningful debug location on the instruction or its
761
/// operands.
762
34.1k
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
763
34.1k
  if (!I)
764
7.56k
    return I;
765
26.5k
766
26.5k
  DebugLoc Empty;
767
26.5k
  if (I->getDebugLoc() != Empty)
768
4
    return I;
769
26.5k
770
78.0k
  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
771
52.3k
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
772
26.5k
      if (OpInst->getDebugLoc() != Empty)
773
928
        return OpInst;
774
52.3k
  }
775
26.5k
776
26.5k
  return I;
777
26.5k
}
778
779
186k
void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
780
186k
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
781
178k
    const DILocation *DIL = Inst->getDebugLoc();
782
178k
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
783
178k
        !isa<DbgInfoIntrinsic>(Inst)) {
784
34
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
785
34
      if (NewDIL)
786
34
        B.SetCurrentDebugLocation(NewDIL.getValue());
787
34
      else
788
34
        LLVM_DEBUG(dbgs()
789
34
                   << "Failed to create new discriminator: "
790
34
                   << DIL->getFilename() << " Line: " << DIL->getLine());
791
34
    }
792
178k
    else
793
178k
      B.SetCurrentDebugLocation(DIL);
794
178k
  } else
795
8.50k
    B.SetCurrentDebugLocation(DebugLoc());
796
186k
}
797
798
#ifndef NDEBUG
799
/// \return string containing a file name and a line # for the given loop.
800
static std::string getDebugLocString(const Loop *L) {
801
  std::string Result;
802
  if (L) {
803
    raw_string_ostream OS(Result);
804
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
805
      LoopDbgLoc.print(OS);
806
    else
807
      // Just print the module name.
808
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
809
    OS.flush();
810
  }
811
  return Result;
812
}
813
#endif
814
815
void InnerLoopVectorizer::addNewMetadata(Instruction *To,
816
210k
                                         const Instruction *Orig) {
817
210k
  // If the loop was versioned with memchecks, add the corresponding no-alias
818
210k
  // metadata.
819
210k
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
820
11.9k
    LVer->annotateInstWithNoAlias(To, Orig);
821
210k
}
822
823
void InnerLoopVectorizer::addMetadata(Instruction *To,
824
138k
                                      Instruction *From) {
825
138k
  propagateMetadata(To, From);
826
138k
  addNewMetadata(To, From);
827
138k
}
828
829
void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
830
105k
                                      Instruction *From) {
831
105k
  for (Value *V : To) {
832
105k
    if (Instruction *I = dyn_cast<Instruction>(V))
833
105k
      addMetadata(I, From);
834
105k
  }
835
105k
}
836
837
namespace llvm {
838
839
/// LoopVectorizationCostModel - estimates the expected speedups due to
840
/// vectorization.
841
/// In many cases vectorization is not profitable. This can happen because of
842
/// a number of reasons. In this class we mainly attempt to predict the
843
/// expected speedup/slowdowns due to the supported instruction set. We use the
844
/// TargetTransformInfo to query the different backends for the cost of
845
/// different operations.
846
class LoopVectorizationCostModel {
847
public:
848
  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
849
                             LoopInfo *LI, LoopVectorizationLegality *Legal,
850
                             const TargetTransformInfo &TTI,
851
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
852
                             AssumptionCache *AC,
853
                             OptimizationRemarkEmitter *ORE, const Function *F,
854
                             const LoopVectorizeHints *Hints,
855
                             InterleavedAccessInfo &IAI)
856
      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
857
19.9k
    AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
858
859
  /// \return An upper bound for the vectorization factor, or None if
860
  /// vectorization and interleaving should be avoided up front.
861
  Optional<unsigned> computeMaxVF(bool OptForSize);
862
863
  /// \return The most profitable vectorization factor and the cost of that VF.
864
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
865
  /// then this vectorization factor will be selected if vectorization is
866
  /// possible.
867
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
868
869
  /// Setup cost-based decisions for user vectorization factor.
870
649
  void selectUserVectorizationFactor(unsigned UserVF) {
871
649
    collectUniformsAndScalars(UserVF);
872
649
    collectInstsToScalarize(UserVF);
873
649
  }
874
875
  /// \return The size (in bits) of the smallest and widest types in the code
876
  /// that needs to be vectorized. We ignore values that remain scalar such as
877
  /// 64 bit loop indices.
878
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
879
880
  /// \return The desired interleave count.
881
  /// If interleave count has been specified by metadata it will be returned.
882
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
883
  /// are the selected vectorization factor and the cost of the selected VF.
884
  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
885
                                 unsigned LoopCost);
886
887
  /// Memory access instruction may be vectorized in more than one way.
888
  /// Form of instruction after vectorization depends on cost.
889
  /// This function takes cost-based decisions for Load/Store instructions
890
  /// and collects them in a map. This decisions map is used for building
891
  /// the lists of loop-uniform and loop-scalar instructions.
892
  /// The calculated cost is saved with widening decision in order to
893
  /// avoid redundant calculations.
894
  void setCostBasedWideningDecision(unsigned VF);
895
896
  /// A struct that represents some properties of the register usage
897
  /// of a loop.
898
  struct RegisterUsage {
899
    /// Holds the number of loop invariant values that are used in the loop.
900
    unsigned LoopInvariantRegs;
901
902
    /// Holds the maximum number of concurrent live intervals in the loop.
903
    unsigned MaxLocalUsers;
904
  };
905
906
  /// \return Returns information about the register usages of the loop for the
907
  /// given vectorization factors.
908
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
909
910
  /// Collect values we want to ignore in the cost model.
911
  void collectValuesToIgnore();
912
913
  /// \returns The smallest bitwidth each instruction can be represented with.
914
  /// The vector equivalents of these instructions should be truncated to this
915
  /// type.
916
30.6k
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
917
30.6k
    return MinBWs;
918
30.6k
  }
919
920
  /// \returns True if it is more profitable to scalarize instruction \p I for
921
  /// vectorization factor \p VF.
922
492k
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
923
492k
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
924
492k
925
492k
    // Cost model is not run in the VPlan-native path - return conservative
926
492k
    // result until this changes.
927
492k
    if (EnableVPlanNativePath)
928
47
      return false;
929
492k
930
492k
    auto Scalars = InstsToScalarize.find(VF);
931
492k
    assert(Scalars != InstsToScalarize.end() &&
932
492k
           "VF not yet analyzed for scalarization profitability");
933
492k
    return Scalars->second.find(I) != Scalars->second.end();
934
492k
  }
935
936
  /// Returns true if \p I is known to be uniform after vectorization.
937
917k
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
938
917k
    if (VF == 1)
939
404k
      return true;
940
512k
941
512k
    // Cost model is not run in the VPlan-native path - return conservative
942
512k
    // result until this changes.
943
512k
    if (EnableVPlanNativePath)
944
0
      return false;
945
512k
946
512k
    auto UniformsPerVF = Uniforms.find(VF);
947
512k
    assert(UniformsPerVF != Uniforms.end() &&
948
512k
           "VF not yet analyzed for uniformity");
949
512k
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
950
512k
  }
951
952
  /// Returns true if \p I is known to be scalar after vectorization.
953
1.72M
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
954
1.72M
    if (VF == 1)
955
650k
      return true;
956
1.07M
957
1.07M
    // Cost model is not run in the VPlan-native path - return conservative
958
1.07M
    // result until this changes.
959
1.07M
    if (EnableVPlanNativePath)
960
47
      return false;
961
1.07M
962
1.07M
    auto ScalarsPerVF = Scalars.find(VF);
963
1.07M
    assert(ScalarsPerVF != Scalars.end() &&
964
1.07M
           "Scalar values are not calculated for VF");
965
1.07M
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
966
1.07M
  }
967
968
  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
969
  /// for vectorization factor \p VF.
970
757k
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
971
757k
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
972
757k
           !isProfitableToScalarize(I, VF) &&
973
757k
           !isScalarAfterVectorization(I, VF);
974
757k
  }
975
976
  /// Decision that was taken during cost calculation for memory instruction.
977
  enum InstWidening {
978
    CM_Unknown,
979
    CM_Widen,         // For consecutive accesses with stride +1.
980
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
981
    CM_Interleave,
982
    CM_GatherScatter,
983
    CM_Scalarize
984
  };
985
986
  /// Save vectorization decision \p W and \p Cost taken by the cost model for
987
  /// instruction \p I and vector width \p VF.
988
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
989
58.3k
                           unsigned Cost) {
990
58.3k
    assert(VF >= 2 && "Expected VF >=2");
991
58.3k
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
992
58.3k
  }
993
994
  /// Save vectorization decision \p W and \p Cost taken by the cost model for
995
  /// interleaving group \p Grp and vector width \p VF.
996
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
997
2.26k
                           InstWidening W, unsigned Cost) {
998
2.26k
    assert(VF >= 2 && "Expected VF >=2");
999
2.26k
    /// Broadcast this decision to all instructions inside the group.
1000
2.26k
    /// But the cost will be assigned to one instruction only.
1001
9.05k
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1002
6.78k
      if (auto *I = Grp->getMember(i)) {
1003
5.90k
        if (Grp->getInsertPos() == I)
1004
2.26k
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1005
3.64k
        else
1006
3.64k
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1007
5.90k
      }
1008
6.78k
    }
1009
2.26k
  }
1010
1011
  /// Return the cost model decision for the given instruction \p I and vector
1012
  /// width \p VF. Return CM_Unknown if this instruction did not pass
1013
  /// through the cost modeling.
1014
248k
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1015
248k
    assert(VF >= 2 && "Expected VF >=2");
1016
248k
1017
248k
    // Cost model is not run in the VPlan-native path - return conservative
1018
248k
    // result until this changes.
1019
248k
    if (EnableVPlanNativePath)
1020
16
      return CM_GatherScatter;
1021
248k
1022
248k
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1023
248k
    auto Itr = WideningDecisions.find(InstOnVF);
1024
248k
    if (Itr == WideningDecisions.end())
1025
2.26k
      return CM_Unknown;
1026
245k
    return Itr->second.first;
1027
245k
  }
1028
1029
  /// Return the vectorization cost for the given instruction \p I and vector
1030
  /// width \p VF.
1031
64.1k
  unsigned getWideningCost(Instruction *I, unsigned VF) {
1032
64.1k
    assert(VF >= 2 && "Expected VF >=2");
1033
64.1k
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1034
64.1k
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1035
64.1k
           "The cost is not calculated");
1036
64.1k
    return WideningDecisions[InstOnVF].second;
1037
64.1k
  }
1038
1039
  /// Return True if instruction \p I is an optimizable truncate whose operand
1040
  /// is an induction variable. Such a truncate will be removed by adding a new
1041
  /// induction variable with the destination type.
1042
99.7k
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1043
99.7k
    // If the instruction is not a truncate, return false.
1044
99.7k
    auto *Trunc = dyn_cast<TruncInst>(I);
1045
99.7k
    if (!Trunc)
1046
45.2k
      return false;
1047
54.5k
1048
54.5k
    // Get the source and destination types of the truncate.
1049
54.5k
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1050
54.5k
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1051
54.5k
1052
54.5k
    // If the truncate is free for the given types, return false. Replacing a
1053
54.5k
    // free truncate with an induction variable would add an induction variable
1054
54.5k
    // update instruction to each iteration of the loop. We exclude from this
1055
54.5k
    // check the primary induction variable since it will need an update
1056
54.5k
    // instruction regardless.
1057
54.5k
    Value *Op = Trunc->getOperand(0);
1058
54.5k
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1059
16.7k
      return false;
1060
37.8k
1061
37.8k
    // If the truncated value is not an induction variable, return false.
1062
37.8k
    return Legal->isInductionPhi(Op);
1063
37.8k
  }
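  // [Editor's illustrative IR sketch, not part of the original file.]
  //   %iv    = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
  //   %iv.32 = trunc i64 %iv to i32
  // When %iv is an induction variable, the truncate above is optimizable:
  // instead of widening the trunc, a new i32 induction variable is created.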
1064
1065
  /// Collects the instructions to scalarize for each predicated instruction in
1066
  /// the loop.
1067
  void collectInstsToScalarize(unsigned VF);
1068
1069
  /// Collect Uniform and Scalar values for the given \p VF.
1070
  /// The sets depend on CM decision for Load/Store instructions
1071
  /// that may be vectorized as interleave, gather-scatter or scalarized.
1072
197k
  void collectUniformsAndScalars(unsigned VF) {
1073
197k
    // Do the analysis once.
1074
197k
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1075
164k
      return;
1076
32.3k
    setCostBasedWideningDecision(VF);
1077
32.3k
    collectLoopUniforms(VF);
1078
32.3k
    collectLoopScalars(VF);
1079
32.3k
  }
1080
1081
  /// Returns true if the target machine supports masked store operation
1082
  /// for the given \p DataType and kind of access to \p Ptr.
1083
4.52k
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1084
4.52k
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1085
4.52k
  }
1086
1087
  /// Returns true if the target machine supports masked load operation
1088
  /// for the given \p DataType and kind of access to \p Ptr.
1089
3.64k
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1090
3.64k
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1091
3.64k
  }
1092
1093
  /// Returns true if the target machine supports masked scatter operation
1094
  /// for the given \p DataType.
1095
10.6k
  bool isLegalMaskedScatter(Type *DataType) {
1096
10.6k
    return TTI.isLegalMaskedScatter(DataType);
1097
10.6k
  }
1098
1099
  /// Returns true if the target machine supports masked gather operation
1100
  /// for the given \p DataType.
1101
13.7k
  bool isLegalMaskedGather(Type *DataType) {
1102
13.7k
    return TTI.isLegalMaskedGather(DataType);
1103
13.7k
  }
1104
1105
  /// Returns true if the target machine can represent \p V as a masked gather
1106
  /// or scatter operation.
1107
16.8k
  bool isLegalGatherOrScatter(Value *V) {
1108
16.8k
    bool LI = isa<LoadInst>(V);
1109
16.8k
    bool SI = isa<StoreInst>(V);
1110
16.8k
    if (!LI && !SI)
1111
0
      return false;
1112
16.8k
    auto *Ty = getMemInstValueType(V);
1113
16.8k
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1114
16.8k
  }
1115
1116
  /// Returns true if \p I is an instruction that will be scalarized with
1117
  /// predication. Such instructions include conditional stores and
1118
  /// instructions that may divide by zero.
1119
  /// If a non-zero VF has been calculated, we check if I will be scalarized
1120
  /// with predication for that VF.
1121
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1122
1123
  // Returns true if \p I is an instruction that will be predicated either
1124
  // through scalar predication or masked load/store or masked gather/scatter.
1125
  // Superset of instructions that return true for isScalarWithPredication.
1126
16.5k
  bool isPredicatedInst(Instruction *I) {
1127
16.5k
    if (!blockNeedsPredication(I->getParent()))
1128
14.2k
      return false;
1129
2.34k
    // Loads and stores that need some form of masked operation are predicated
1130
2.34k
    // instructions.
1131
2.34k
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
1132
2.34k
      return Legal->isMaskRequired(I);
1133
0
    return isScalarWithPredication(I);
1134
0
  }
1135
1136
  /// Returns true if \p I is a memory instruction with consecutive memory
1137
  /// access that can be widened.
1138
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1139
1140
  /// Returns true if \p I is a memory instruction in an interleaved-group
1141
  /// of memory accesses that can be vectorized with wide vector loads/stores
1142
  /// and shuffles.
1143
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1144
1145
  /// Check if \p Instr belongs to any interleaved access group.
1146
20.4k
  bool isAccessInterleaved(Instruction *Instr) {
1147
20.4k
    return InterleaveInfo.isInterleaved(Instr);
1148
20.4k
  }
1149
1150
  /// Get the interleaved access group that \p Instr belongs to.
1151
  const InterleaveGroup<Instruction> *
1152
790k
  getInterleavedAccessGroup(Instruction *Instr) {
1153
790k
    return InterleaveInfo.getInterleaveGroup(Instr);
1154
790k
  }
1155
1156
  /// Returns true if an interleaved group requires a scalar iteration
1157
  /// to handle accesses with gaps, and there is nothing preventing us from
1158
  /// creating a scalar epilogue.
1159
32.3k
  bool requiresScalarEpilogue() const {
1160
32.3k
    return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
1161
32.3k
  }
1162
1163
  /// Returns true if a scalar epilogue is not allowed due to optsize.
1164
68
  bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
1165
1166
  /// Returns true if all loop blocks should be masked to fold tail loop.
1167
915k
  bool foldTailByMasking() const { return FoldTailByMasking; }
1168
1169
844k
  bool blockNeedsPredication(BasicBlock *BB) {
1170
844k
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1171
844k
  }
1172
1173
  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1174
  /// with factor VF.  Return the cost of the instruction, including
1175
  /// scalarization overhead if it's needed.
1176
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1177
1178
  /// Estimate cost of a call instruction CI if it were vectorized with factor
1179
  /// VF. Return the cost of the instruction, including scalarization overhead
1180
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1181
  /// scalarized -
1182
  /// i.e. either the vector version isn't available, or it is too expensive.
1183
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1184
1185
private:
1186
  unsigned NumPredStores = 0;
1187
1188
  /// \return An upper bound for the vectorization factor, larger than zero.
1189
  /// One is returned if vectorization should best be avoided due to cost.
1190
  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
1191
1192
  /// The vectorization cost is a combination of the cost itself and a boolean
1193
  /// indicating whether any of the contributing operations will actually
1194
  /// operate on
1195
  /// vector values after type legalization in the backend. If this latter value
1196
  /// is
1197
  /// false, then all operations will be scalarized (i.e. no vectorization has
1198
  /// actually taken place).
1199
  using VectorizationCostTy = std::pair<unsigned, bool>;
1200
1201
  /// Returns the expected execution cost. The unit of the cost does
1202
  /// not matter because we use the 'cost' units to compare different
1203
  /// vector widths. The cost that is returned is *not* normalized by
1204
  /// the factor width.
1205
  VectorizationCostTy expectedCost(unsigned VF);
1206
1207
  /// Returns the execution time cost of an instruction for a given vector
1208
  /// width. Vector width of one means scalar.
1209
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1210
1211
  /// The cost-computation logic from getInstructionCost which provides
1212
  /// the vector type as an output parameter.
1213
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1214
1215
  /// Calculate vectorization cost of memory instruction \p I.
1216
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1217
1218
  /// The cost computation for scalarized memory instruction.
1219
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1220
1221
  /// The cost computation for interleaving group of memory instructions.
1222
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1223
1224
  /// The cost computation for Gather/Scatter instruction.
1225
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1226
1227
  /// The cost computation for widening instruction \p I with consecutive
1228
  /// memory access.
1229
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1230
1231
  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1232
  /// Load: scalar load + broadcast.
1233
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1234
  /// element)
1235
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1236
1237
  /// Estimate the overhead of scalarizing an instruction. This is a
1238
  /// convenience wrapper for the type-based getScalarizationOverhead API.
1239
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1240
1241
  /// Returns whether the instruction is a load or store and will be emitted
1242
  /// as a vector operation.
1243
  bool isConsecutiveLoadOrStore(Instruction *I);
1244
1245
  /// Returns true if an artificially high cost for emulated masked memrefs
1246
  /// should be used.
1247
  bool useEmulatedMaskMemRefHack(Instruction *I);
1248
1249
  /// Create an analysis remark that explains why vectorization failed
1250
  ///
1251
  /// \p RemarkName is the identifier for the remark.  \return the remark object
1252
  /// that can be streamed to.
1253
103
  OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
1254
103
    return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1255
103
                                  RemarkName, TheLoop);
1256
103
  }
1257
1258
  /// Map of scalar integer values to the smallest bitwidth they can be legally
1259
  /// represented as. The vector equivalents of these values should be truncated
1260
  /// to this type.
1261
  MapVector<Instruction *, uint64_t> MinBWs;
1262
1263
  /// A type representing the costs for instructions if they were to be
1264
  /// scalarized rather than vectorized. The entries are Instruction-Cost
1265
  /// pairs.
1266
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1267
1268
  /// A set containing all BasicBlocks that are known to be present after
1269
  /// vectorization as a predicated block.
1270
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1271
1272
  /// Records whether it is allowed to have the original scalar loop execute at
1273
  /// least once. This may be needed as a fallback loop in case runtime 
1274
  /// aliasing/dependence checks fail, or to handle the tail/remainder
1275
  /// iterations when the trip count is unknown or doesn't divide by the VF,
1276
  /// or as a peel-loop to handle gaps in interleave-groups.
1277
  /// Under optsize and when the trip count is very small we don't allow any
1278
  /// iterations to execute in the scalar loop.
1279
  bool IsScalarEpilogueAllowed = true;
1280
1281
  /// All blocks of loop are to be masked to fold tail of scalar iterations.
1282
  bool FoldTailByMasking = false;
1283
1284
  /// A map holding scalar costs for different vectorization factors. The
1285
  /// presence of a cost for an instruction in the mapping indicates that the
1286
  /// instruction will be scalarized when vectorizing with the associated
1287
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1288
  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1289
1290
  /// Holds the instructions known to be uniform after vectorization.
1291
  /// The data is collected per VF.
1292
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1293
1294
  /// Holds the instructions known to be scalar after vectorization.
1295
  /// The data is collected per VF.
1296
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1297
1298
  /// Holds the instructions (address computations) that are forced to be
1299
  /// scalarized.
1300
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1301
1302
  /// Returns the expected difference in cost from scalarizing the expression
1303
  /// feeding a predicated instruction \p PredInst. The instructions to
1304
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1305
  /// non-negative return value implies the expression will be scalarized.
1306
  /// Currently, only single-use chains are considered for scalarization.
1307
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1308
                              unsigned VF);
1309
1310
  /// Collect the instructions that are uniform after vectorization. An
1311
  /// instruction is uniform if we represent it with a single scalar value in
1312
  /// the vectorized loop corresponding to each vector iteration. Examples of
1313
  /// uniform instructions include pointer operands of consecutive or
1314
  /// interleaved memory accesses. Note that although uniformity implies an
1315
  /// instruction will be scalar, the reverse is not true. In general, a
1316
  /// scalarized instruction will be represented by VF scalar values in the
1317
  /// vectorized loop, each corresponding to an iteration of the original
1318
  /// scalar loop.
1319
  void collectLoopUniforms(unsigned VF);
1320
1321
  /// Collect the instructions that are scalar after vectorization. An
1322
  /// instruction is scalar if it is known to be uniform or will be scalarized
1323
  /// during vectorization. Non-uniform scalarized instructions will be
1324
  /// represented by VF values in the vectorized loop, each corresponding to an
1325
  /// iteration of the original scalar loop.
1326
  void collectLoopScalars(unsigned VF);
1327
1328
  /// Keeps cost model vectorization decision and cost for instructions.
1329
  /// Right now it is used for memory instructions only.
1330
  using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1331
                                std::pair<InstWidening, unsigned>>;
1332
1333
  DecisionList WideningDecisions;
1334
1335
  /// Returns true if \p V is expected to be vectorized and it needs to be
1336
  /// extracted.
1337
30.3k
  bool needsExtract(Value *V, unsigned VF) const {
1338
30.3k
    Instruction *I = dyn_cast<Instruction>(V);
1339
30.3k
    if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1340
2.51k
      return false;
1341
27.8k
1342
27.8k
    // Assume we can vectorize V (and hence we need extraction) if the
1343
27.8k
    // scalars are not computed yet. This can happen, because it is called
1344
27.8k
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
1345
27.8k
    // the scalars are collected. That should be a safe assumption in most
1346
27.8k
    // cases, because we check if the operands have vectorizable types
1347
27.8k
    // beforehand in LoopVectorizationLegality.
1348
27.8k
    return Scalars.find(VF) == Scalars.end() ||
1349
27.8k
           !isScalarAfterVectorization(I, VF);
1350
27.8k
  };
1351
1352
  /// Returns a range containing only operands needing to be extracted.
1353
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1354
18.1k
                                                   unsigned VF) {
1355
18.1k
    return SmallVector<Value *, 4>(make_filter_range(
1356
29.6k
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1357
18.1k
  }
1358
1359
public:
1360
  /// The loop that we evaluate.
1361
  Loop *TheLoop;
1362
1363
  /// Predicated scalar evolution analysis.
1364
  PredicatedScalarEvolution &PSE;
1365
1366
  /// Loop Info analysis.
1367
  LoopInfo *LI;
1368
1369
  /// Vectorization legality.
1370
  LoopVectorizationLegality *Legal;
1371
1372
  /// Vector target information.
1373
  const TargetTransformInfo &TTI;
1374
1375
  /// Target Library Info.
1376
  const TargetLibraryInfo *TLI;
1377
1378
  /// Demanded bits analysis.
1379
  DemandedBits *DB;
1380
1381
  /// Assumption cache.
1382
  AssumptionCache *AC;
1383
1384
  /// Interface to emit optimization remarks.
1385
  OptimizationRemarkEmitter *ORE;
1386
1387
  const Function *TheFunction;
1388
1389
  /// Loop Vectorize Hint.
1390
  const LoopVectorizeHints *Hints;
1391
1392
  /// The interleave access information contains groups of interleaved accesses
1393
  /// with the same stride and close to each other.
1394
  InterleavedAccessInfo &InterleaveInfo;
1395
1396
  /// Values to ignore in the cost model.
1397
  SmallPtrSet<const Value *, 16> ValuesToIgnore;
1398
1399
  /// Values to ignore in the cost model when VF > 1.
1400
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1401
};
1402
1403
} // end namespace llvm
1404
1405
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
1406
// vectorization. The loop needs to be annotated with #pragma omp simd
1407
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1408
// vector length information is not provided, vectorization is not considered
1409
// explicit. Interleave hints are not allowed either. These limitations will be
1410
// relaxed in the future.
1411
// Please note that we are currently forced to abuse the pragma 'clang
1412
// vectorize' semantics. This pragma provides *auto-vectorization hints*
1413
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1414
// provides *explicit vectorization hints* (LV can bypass legal checks and
1415
// assume that vectorization is legal). However, both hints are implemented
1416
// using the same metadata (llvm.loop.vectorize, processed by
1417
// LoopVectorizeHints). This will be fixed in the future when the native IR
1418
// representation for pragma 'omp simd' is introduced.
1419
static bool isExplicitVecOuterLoop(Loop *OuterLp,
1420
7
                                   OptimizationRemarkEmitter *ORE) {
1421
7
  assert(!OuterLp->empty() && "This is not an outer loop");
1422
7
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1423
7
1424
7
  // Only outer loops with an explicit vectorization hint are supported.
1425
7
  // Unannotated outer loops are ignored.
1426
7
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1427
0
    return false;
1428
7
1429
7
  Function *Fn = OuterLp->getHeader()->getParent();
1430
7
  if (!Hints.allowVectorization(Fn, OuterLp,
1431
7
                                true /*VectorizeOnlyWhenForced*/)) {
1432
0
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1433
0
    return false;
1434
0
  }
1435
7
1436
7
  if (Hints.getInterleave() > 1) {
1437
0
    // TODO: Interleave support is future work.
1438
0
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1439
0
                         "outer loops.\n");
1440
0
    Hints.emitRemarkWithHints();
1441
0
    return false;
1442
0
  }
1443
7
1444
7
  return true;
1445
7
}
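// A hedged illustration (hypothetical function and variable names) of the kind
// of annotation isExplicitVecOuterLoop is looking for: an outer loop carrying
// an explicit vectorization hint with a vector length. Either spelling from
// the comment above works; the OpenMP form needs OpenMP simd support enabled,
// and outer-loop vectorization is only attempted on the VPlan-native path
// (EnableVPlanNativePath).
void scaleRows(float **Rows, int NumRows, int NumCols, float Factor) {
#pragma clang loop vectorize(enable) vectorize_width(4)
  // Alternative spelling: #pragma omp simd simdlen(4)
  for (int I = 0; I < NumRows; ++I)     // outer loop: explicitly annotated
    for (int J = 0; J < NumCols; ++J)   // inner loop
      Rows[I][J] *= Factor;
}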
1446
1447
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1448
                                  OptimizationRemarkEmitter *ORE,
1449
179k
                                  SmallVectorImpl<Loop *> &V) {
1450
179k
  // Collect inner loops and outer loops without irreducible control flow. For
1451
179k
  // now, only collect outer loops that have explicit vectorization hints. If we
1452
179k
  // are stress testing the VPlan H-CFG construction, we collect the outermost
1453
179k
  // loop of every loop nest.
1454
179k
  if (L.empty() || VPlanBuildStressTest ||
1455
179k
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1456
146k
    LoopBlocksRPO RPOT(&L);
1457
146k
    RPOT.perform(LI);
1458
146k
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1459
146k
      V.push_back(&L);
1460
146k
      // TODO: Collect inner loops inside marked outer loops in case
1461
146k
      // vectorization fails for the outer loop. Do not invoke
1462
146k
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1463
146k
      // already known to be reducible. We can use an inherited attribute for
1464
146k
      // that.
1465
146k
      return;
1466
146k
    }
1467
32.7k
  }
1468
32.7k
  for (Loop *InnerL : L)
1469
49.7k
    collectSupportedLoops(*InnerL, LI, ORE, V);
1470
32.7k
}
1471
1472
namespace {
1473
1474
/// The LoopVectorize Pass.
1475
struct LoopVectorize : public FunctionPass {
1476
  /// Pass identification, replacement for typeid
1477
  static char ID;
1478
1479
  LoopVectorizePass Impl;
1480
1481
  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1482
                         bool VectorizeOnlyWhenForced = false)
1483
13.7k
      : FunctionPass(ID) {
1484
13.7k
    Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1485
13.7k
    Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1486
13.7k
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1487
13.7k
  }
1488
1489
279k
  bool runOnFunction(Function &F) override {
1490
279k
    if (skipFunction(F))
1491
44
      return false;
1492
279k
1493
279k
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1494
279k
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1495
279k
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1496
279k
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1497
279k
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1498
279k
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1499
279k
    auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1500
279k
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1501
279k
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1502
279k
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1503
279k
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1504
279k
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1505
279k
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1506
279k
1507
279k
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1508
279k
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1509
279k
1510
279k
    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1511
279k
                        GetLAA, *ORE, PSI);
1512
279k
  }
1513
1514
13.7k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
1515
13.7k
    AU.addRequired<AssumptionCacheTracker>();
1516
13.7k
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
1517
13.7k
    AU.addRequired<DominatorTreeWrapperPass>();
1518
13.7k
    AU.addRequired<LoopInfoWrapperPass>();
1519
13.7k
    AU.addRequired<ScalarEvolutionWrapperPass>();
1520
13.7k
    AU.addRequired<TargetTransformInfoWrapperPass>();
1521
13.7k
    AU.addRequired<AAResultsWrapperPass>();
1522
13.7k
    AU.addRequired<LoopAccessLegacyAnalysis>();
1523
13.7k
    AU.addRequired<DemandedBitsWrapperPass>();
1524
13.7k
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1525
13.7k
1526
13.7k
    // We currently do not preserve loopinfo/dominator analyses with outer loop
1527
13.7k
    // vectorization. Until this is addressed, mark these analyses as preserved
1528
13.7k
    // only for non-VPlan-native path.
1529
13.7k
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1530
13.7k
    if (!EnableVPlanNativePath) {
1531
13.7k
      AU.addPreserved<LoopInfoWrapperPass>();
1532
13.7k
      AU.addPreserved<DominatorTreeWrapperPass>();
1533
13.7k
    }
1534
13.7k
1535
13.7k
    AU.addPreserved<BasicAAWrapperPass>();
1536
13.7k
    AU.addPreserved<GlobalsAAWrapperPass>();
1537
13.7k
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
1538
13.7k
  }
1539
};
1540
1541
} // end anonymous namespace
1542
1543
//===----------------------------------------------------------------------===//
1544
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1545
// LoopVectorizationCostModel and LoopVectorizationPlanner.
1546
//===----------------------------------------------------------------------===//
1547
1548
52.8k
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1549
52.8k
  // We need to place the broadcast of invariant variables outside the loop,
1550
52.8k
  // but only if it's proven safe to do so. Else, broadcast will be inside
1551
52.8k
  // vector loop body.
1552
52.8k
  Instruction *Instr = dyn_cast<Instruction>(V);
1553
52.8k
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1554
52.8k
                     (!Instr ||
1555
52.8k
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1556
52.8k
  // Place the code for broadcasting invariant variables in the new preheader.
1557
52.8k
  IRBuilder<>::InsertPointGuard Guard(Builder);
1558
52.8k
  if (SafeToHoist)
1559
46.6k
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1560
52.8k
1561
52.8k
  // Broadcast the scalar into all locations in the vector.
1562
52.8k
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1563
52.8k
1564
52.8k
  return Shuf;
1565
52.8k
}
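// A minimal scalar sketch (hypothetical values, not the IRBuilder calls above)
// of what the broadcast produces for VF = 4: a "splat" vector whose every lane
// holds the same scalar V.
#include <array>
#include <cstdio>

int main() {
  constexpr unsigned VF = 4;
  const int V = 42;
  std::array<int, VF> Broadcast;
  Broadcast.fill(V);            // <42, 42, 42, 42>
  for (int Lane : Broadcast)
    std::printf("%d ", Lane);
  std::printf("\n");
  return 0;
}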
1566
1567
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1568
11.2k
    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1569
11.2k
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1570
11.2k
         "Expected either an induction phi-node or a truncate of it!");
1571
11.2k
  Value *Start = II.getStartValue();
1572
11.2k
1573
11.2k
  // Construct the initial value of the vector IV in the vector loop preheader
1574
11.2k
  auto CurrIP = Builder.saveIP();
1575
11.2k
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1576
11.2k
  if (isa<TruncInst>(EntryVal)) {
1577
2.39k
    assert(Start->getType()->isIntegerTy() &&
1578
2.39k
           "Truncation requires an integer type");
1579
2.39k
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
1580
2.39k
    Step = Builder.CreateTrunc(Step, TruncType);
1581
2.39k
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1582
2.39k
  }
1583
11.2k
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1584
11.2k
  Value *SteppedStart =
1585
11.2k
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1586
11.2k
1587
11.2k
  // We create vector phi nodes for both integer and floating-point induction
1588
11.2k
  // variables. Here, we determine the kind of arithmetic we will perform.
1589
11.2k
  Instruction::BinaryOps AddOp;
1590
11.2k
  Instruction::BinaryOps MulOp;
1591
11.2k
  if (Step->getType()->isIntegerTy()) {
1592
11.1k
    AddOp = Instruction::Add;
1593
11.1k
    MulOp = Instruction::Mul;
1594
11.1k
  } else {
1595
20
    AddOp = II.getInductionOpcode();
1596
20
    MulOp = Instruction::FMul;
1597
20
  }
1598
11.2k
1599
11.2k
  // Multiply the vectorization factor by the step using integer or
1600
11.2k
  // floating-point arithmetic as appropriate.
1601
11.2k
  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1602
11.2k
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1603
11.2k
1604
11.2k
  // Create a vector splat to use in the induction update.
1605
11.2k
  //
1606
11.2k
  // FIXME: If the step is non-constant, we create the vector splat with
1607
11.2k
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1608
11.2k
  //        handle a constant vector splat.
1609
11.2k
  Value *SplatVF = isa<Constant>(Mul)
1610
11.2k
                       ? 
ConstantVector::getSplat(VF, cast<Constant>(Mul))11.1k
1611
11.2k
                       : 
Builder.CreateVectorSplat(VF, Mul)32
;
1612
11.2k
  Builder.restoreIP(CurrIP);
1613
11.2k
1614
11.2k
  // We may need to add the step a number of times, depending on the unroll
1615
11.2k
  // factor. The last of those goes into the PHI.
1616
11.2k
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1617
11.2k
                                    &*LoopVectorBody->getFirstInsertionPt());
1618
11.2k
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
1619
11.2k
  Instruction *LastInduction = VecInd;
1620
32.9k
  for (unsigned Part = 0; Part < UF; ++Part) {
1621
21.7k
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1622
21.7k
1623
21.7k
    if (isa<TruncInst>(EntryVal))
1624
4.46k
      addMetadata(LastInduction, EntryVal);
1625
21.7k
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1626
21.7k
1627
21.7k
    LastInduction = cast<Instruction>(addFastMathFlag(
1628
21.7k
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1629
21.7k
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1630
21.7k
  }
1631
11.2k
1632
11.2k
  // Move the last step to the end of the latch block. This ensures consistent
1633
11.2k
  // placement of all induction updates.
1634
11.2k
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1635
11.2k
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1636
11.2k
  auto *ICmp = cast<Instruction>(Br->getCondition());
1637
11.2k
  LastInduction->moveBefore(ICmp);
1638
11.2k
  LastInduction->setName("vec.ind.next");
1639
11.2k
1640
11.2k
  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1641
11.2k
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
1642
11.2k
}
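// A minimal scalar sketch (hypothetical values, not LLVM API) of the vector IV
// built by createVectorIntOrFpInductionPHI for an integer induction with
// VF = 4 and UF = 2: "vec.ind" starts as <Start, Start+Step, ..., Start+3*Step>
// and each unroll part adds the splat of VF * Step ("step.add"); the value
// left after the last part is what flows back into the phi as "vec.ind.next".
#include <array>
#include <cstdio>

int main() {
  constexpr unsigned VF = 4, UF = 2;
  const int Start = 0, Step = 3;
  std::array<int, VF> Cur;
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Cur[Lane] = Start + int(Lane) * Step;   // SteppedStart
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (int V : Cur)
      std::printf("%d ", V);                // part 0: 0 3 6 9   part 1: 12 15 18 21
    std::printf("\n");
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Cur[Lane] += int(VF) * Step;          // step.add
  }
  return 0;                                 // Cur now models vec.ind.next
}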
1643
1644
48.8k
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1645
48.8k
  return Cost->isScalarAfterVectorization(I, VF) ||
1646
48.8k
         Cost->isProfitableToScalarize(I, VF);
1647
48.8k
}
1648
1649
17.4k
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1650
17.4k
  if (shouldScalarizeInstruction(IV))
1651
6.20k
    return true;
1652
13.9k
  auto isScalarInst = [&](User *U) -> bool {
1653
13.9k
    auto *I = cast<Instruction>(U);
1654
13.9k
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1655
13.9k
  };
1656
11.2k
  return llvm::any_of(IV->users(), isScalarInst);
1657
11.2k
}
1658
1659
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1660
    const InductionDescriptor &ID, const Instruction *EntryVal,
1661
96.1k
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1662
96.1k
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1663
96.1k
         "Expected either an induction phi-node or a truncate of it!");
1664
96.1k
1665
96.1k
  // This induction variable is not the phi from the original loop but the
1666
96.1k
  // newly-created IV based on the proof that casted Phi is equal to the
1667
96.1k
  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1668
96.1k
  // re-uses the same InductionDescriptor that original IV uses but we don't
1669
96.1k
  // have to do any recording in this case - that is done when original IV is
1670
96.1k
  // processed.
1671
96.1k
  if (isa<TruncInst>(EntryVal))
1672
4.81k
    return;
1673
91.3k
1674
91.3k
  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1675
91.3k
  if (Casts.empty())
1676
91.3k
    return;
1677
24
  // Only the first Cast instruction in the Casts vector is of interest.
1678
24
  // The rest of the Casts (if exist) have no uses outside the
1679
24
  // induction update chain itself.
1680
24
  Instruction *CastInst = *Casts.begin();
1681
24
  if (Lane < UINT_MAX)
1682
24
    
VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal)4
;
1683
20
  else
1684
20
    VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1685
24
}
1686
1687
19.2k
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1688
19.2k
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1689
19.2k
         "Primary induction variable must have an integer type");
1690
19.2k
1691
19.2k
  auto II = Legal->getInductionVars()->find(IV);
1692
19.2k
  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1693
19.2k
1694
19.2k
  auto ID = II->second;
1695
19.2k
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1696
19.2k
1697
19.2k
  // The scalar value to broadcast. This will be derived from the canonical
1698
19.2k
  // induction variable.
1699
19.2k
  Value *ScalarIV = nullptr;
1700
19.2k
1701
19.2k
  // The value from the original loop to which we are mapping the new induction
1702
19.2k
  // variable.
1703
19.2k
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1704
19.2k
1705
19.2k
  // True if we have vectorized the induction variable.
1706
19.2k
  auto VectorizedIV = false;
1707
19.2k
1708
19.2k
  // Determine if we want a scalar version of the induction variable. This is
1709
19.2k
  // true if the induction variable itself is not widened, or if it has at
1710
19.2k
  // least one user in the loop that is not widened.
1711
19.2k
  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1712
19.2k
1713
19.2k
  // Generate code for the induction step. Note that induction steps are
1714
19.2k
  // required to be loop-invariant
1715
19.2k
  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1716
19.2k
         "Induction step should be loop invariant");
1717
19.2k
  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1718
19.2k
  Value *Step = nullptr;
1719
19.2k
  if (PSE.getSE()->isSCEVable(IV->getType())) {
1720
19.2k
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1721
19.2k
    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1722
19.2k
                             LoopVectorPreHeader->getTerminator());
1723
19.2k
  } else {
1724
26
    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1725
26
  }
1726
19.2k
1727
19.2k
  // Try to create a new independent vector induction variable. If we can't
1728
19.2k
  // create the phi node, we will splat the scalar induction variable in each
1729
19.2k
  // loop iteration.
1730
19.2k
  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1731
11.2k
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1732
11.2k
    VectorizedIV = true;
1733
11.2k
  }
1734
19.2k
1735
19.2k
  // If we haven't yet vectorized the induction variable, or if we will create
1736
19.2k
  // a scalar one, we need to define the scalar induction variable and step
1737
19.2k
  // values. If we were given a truncation type, truncate the canonical
1738
19.2k
  // induction variable and step. Otherwise, derive these values from the
1739
19.2k
  // induction descriptor.
1740
19.2k
  if (!VectorizedIV || NeedsScalarIV) {
1741
16.6k
    ScalarIV = Induction;
1742
16.6k
    if (IV != OldInduction) {
1743
3.31k
      ScalarIV = IV->getType()->isIntegerTy()
1744
3.31k
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1745
3.31k
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
1746
9
                                          IV->getType());
1747
3.31k
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1748
3.31k
      ScalarIV->setName("offset.idx");
1749
3.31k
    }
1750
16.6k
    if (Trunc) {
1751
118
      auto *TruncType = cast<IntegerType>(Trunc->getType());
1752
118
      assert(Step->getType()->isIntegerTy() &&
1753
118
             "Truncation requires an integer step");
1754
118
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1755
118
      Step = Builder.CreateTrunc(Step, TruncType);
1756
118
    }
1757
16.6k
  }
1758
19.2k
1759
19.2k
  // If we haven't yet vectorized the induction variable, splat the scalar
1760
19.2k
  // induction variable, and build the necessary step vectors.
1761
19.2k
  // TODO: Don't do it unless the vectorized IV is really required.
1762
19.2k
  if (!VectorizedIV) {
1763
8.01k
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1764
23.1k
    for (unsigned Part = 0; Part < UF; ++Part) {
1765
15.1k
      Value *EntryPart =
1766
15.1k
          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1767
15.1k
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1768
15.1k
      if (Trunc)
1769
199
        addMetadata(EntryPart, Trunc);
1770
15.1k
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1771
15.1k
    }
1772
8.01k
  }
1773
19.2k
1774
19.2k
  // If an induction variable is only used for counting loop iterations or
1775
19.2k
  // calculating addresses, it doesn't need to be widened. Create scalar steps
1776
19.2k
  // that can be used by instructions we will later scalarize. Note that the
1777
19.2k
  // addition of the scalar steps will not increase the number of instructions
1778
19.2k
  // in the loop in the common case prior to InstCombine. We will be trading
1779
19.2k
  // one vector extract for each scalar step.
1780
19.2k
  if (NeedsScalarIV)
1781
14.8k
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1782
19.2k
}
1783
1784
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1785
22.7k
                                          Instruction::BinaryOps BinOp) {
1786
22.7k
  // Create and check the types.
1787
22.7k
  assert(Val->getType()->isVectorTy() && "Must be a vector");
1788
22.7k
  int VLen = Val->getType()->getVectorNumElements();
1789
22.7k
1790
22.7k
  Type *STy = Val->getType()->getScalarType();
1791
22.7k
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1792
22.7k
         "Induction Step must be an integer or FP");
1793
22.7k
  assert(Step->getType() == STy && "Step has wrong type");
1794
22.7k
1795
22.7k
  SmallVector<Constant *, 8> Indices;
1796
22.7k
1797
22.7k
  if (STy->isIntegerTy()) {
1798
22.7k
    // Create a vector of consecutive numbers from zero to VF.
1799
111k
    for (int i = 0; i < VLen; ++i)
1800
89.0k
      Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1801
22.7k
1802
22.7k
    // Add the consecutive indices to the vector value.
1803
22.7k
    Constant *Cv = ConstantVector::get(Indices);
1804
22.7k
    assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1805
22.7k
    Step = Builder.CreateVectorSplat(VLen, Step);
1806
22.7k
    assert(Step->getType() == Val->getType() && "Invalid step vec");
1807
22.7k
    // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1808
22.7k
    // which can be found from the original scalar operations.
1809
22.7k
    Step = Builder.CreateMul(Cv, Step);
1810
22.7k
    return Builder.CreateAdd(Val, Step, "induction");
1811
22.7k
  }
1812
20
1813
20
  // Floating point induction.
1814
20
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1815
20
         "Binary Opcode should be specified for FP induction");
1816
20
  // Create a vector of consecutive numbers from zero to VF.
1817
92
  for (int i = 0; i < VLen; ++i)
1818
72
    Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1819
20
1820
20
  // Add the consecutive indices to the vector value.
1821
20
  Constant *Cv = ConstantVector::get(Indices);
1822
20
1823
20
  Step = Builder.CreateVectorSplat(VLen, Step);
1824
20
1825
20
  // Floating point operations had to be 'fast' to enable the induction.
1826
20
  FastMathFlags Flags;
1827
20
  Flags.setFast();
1828
20
1829
20
  Value *MulOp = Builder.CreateFMul(Cv, Step);
1830
20
  if (isa<Instruction>(MulOp))
1831
6
    // Have to check, MulOp may be a constant
1832
6
    cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1833
20
1834
20
  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1835
20
  if (isa<Instruction>(BOp))
1836
9
    cast<Instruction>(BOp)->setFastMathFlags(Flags);
1837
20
  return BOp;
1838
20
}
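// A minimal scalar sketch (hypothetical values) of the integer path of
// getStepVector: lane i of the result is Val[i] + (StartIdx + i) * Step, i.e.
// the constant vector <StartIdx, StartIdx+1, ...> multiplied by the splatted
// step and added to Val.
#include <array>
#include <cstdio>

int main() {
  constexpr int VLen = 4;
  const int StartIdx = 4, Step = 2;
  std::array<int, VLen> Val{10, 10, 10, 10};  // e.g. a broadcast scalar IV
  std::array<int, VLen> Induction;
  for (int i = 0; i < VLen; ++i)
    Induction[i] = Val[i] + (StartIdx + i) * Step;
  for (int V : Induction)
    std::printf("%d ", V);                    // 18 20 22 24
  std::printf("\n");
  return 0;
}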
1839
1840
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1841
                                           Instruction *EntryVal,
1842
14.8k
                                           const InductionDescriptor &ID) {
1843
14.8k
  // We shouldn't have to build scalar steps if we aren't vectorizing.
1844
14.8k
  assert(VF > 1 && "VF should be greater than one");
1845
14.8k
1846
14.8k
  // Get the value type and ensure it and the step have the same integer type.
1847
14.8k
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1848
14.8k
  assert(ScalarIVTy == Step->getType() &&
1849
14.8k
         "Val and Step should have the same type");
1850
14.8k
1851
14.8k
  // We build scalar steps for both integer and floating-point induction
1852
14.8k
  // variables. Here, we determine the kind of arithmetic we will perform.
1853
14.8k
  Instruction::BinaryOps AddOp;
1854
14.8k
  Instruction::BinaryOps MulOp;
1855
14.8k
  if (ScalarIVTy->isIntegerTy()) {
1856
14.8k
    AddOp = Instruction::Add;
1857
14.8k
    MulOp = Instruction::Mul;
1858
14.8k
  } else {
1859
3
    AddOp = ID.getInductionOpcode();
1860
3
    MulOp = Instruction::FMul;
1861
3
  }
1862
14.8k
1863
14.8k
  // Determine the number of scalars we need to generate for each unroll
1864
14.8k
  // iteration. If EntryVal is uniform, we only need to generate the first
1865
14.8k
  // lane. Otherwise, we generate all VF values.
1866
14.8k
  unsigned Lanes =
1867
14.8k
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1868
14.8k
                                                                         : VF;
1869
14.8k
  // Compute the scalar steps and save the results in VectorLoopValueMap.
1870
43.2k
  for (unsigned Part = 0; Part < UF; ++Part) {
1871
87.7k
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1872
59.2k
      auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1873
59.2k
      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1874
59.2k
      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1875
59.2k
      VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1876
59.2k
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1877
59.2k
    }
1878
28.4k
  }
1879
14.8k
}
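// A minimal scalar sketch (hypothetical values) of what buildScalarSteps
// records for an integer induction: lane Lane of unroll part Part gets
// ScalarIV + (VF * Part + Lane) * Step, and only lane 0 is produced when the
// induction is uniform after vectorization.
#include <cstdio>

int main() {
  constexpr unsigned VF = 4, UF = 2;
  const unsigned Lanes = VF;                 // would be 1 for a uniform IV
  const int ScalarIV = 100, Step = 3;
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < Lanes; ++Lane)
      std::printf("part %u lane %u -> %d\n", Part, Lane,
                  ScalarIV + int(VF * Part + Lane) * Step);
  return 0;
}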
1880
1881
197k
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1882
197k
  assert(V != Induction && "The new induction variable should not be used.");
1883
197k
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1884
197k
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1885
197k
1886
197k
  // If we have a stride that is replaced by one, do it here. Defer this for
1887
197k
  // the VPlan-native path until we start running Legal checks in that path.
1888
197k
  if (!EnableVPlanNativePath && Legal->hasStride(V))
1889
0
    V = ConstantInt::get(V->getType(), 1);
1890
197k
1891
197k
  // If we have a vector mapped to this value, return it.
1892
197k
  if (VectorLoopValueMap.hasVectorValue(V, Part))
1893
149k
    return VectorLoopValueMap.getVectorValue(V, Part);
1894
48.1k
1895
48.1k
  // If the value has not been vectorized, check if it has been scalarized
1896
48.1k
  // instead. If it has been scalarized, and we actually need the value in
1897
48.1k
  // vector form, we will construct the vector values on demand.
1898
48.1k
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1899
1.49k
    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1900
1.49k
1901
1.49k
    // If we've scalarized a value, that value should be an instruction.
1902
1.49k
    auto *I = cast<Instruction>(V);
1903
1.49k
1904
1.49k
    // If we aren't vectorizing, we can just copy the scalar map values over to
1905
1.49k
    // the vector map.
1906
1.49k
    if (VF == 1) {
1907
980
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1908
980
      return ScalarValue;
1909
980
    }
1910
519
1911
519
    // Get the last scalar instruction we generated for V and Part. If the value
1912
519
    // is known to be uniform after vectorization, this corresponds to lane zero
1913
519
    // of the Part unroll iteration. Otherwise, the last instruction is the one
1914
519
    // we created for the last vector lane of the Part unroll iteration.
1915
519
    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1916
519
    auto *LastInst = cast<Instruction>(
1917
519
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1918
519
1919
519
    // Set the insert point after the last scalarized instruction. This ensures
1920
519
    // the insertelement sequence will directly follow the scalar definitions.
1921
519
    auto OldIP = Builder.saveIP();
1922
519
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
1923
519
    Builder.SetInsertPoint(&*NewIP);
1924
519
1925
519
    // However, if we are vectorizing, we need to construct the vector values.
1926
519
    // If the value is known to be uniform after vectorization, we can just
1927
519
    // broadcast the scalar value corresponding to lane zero for each unroll
1928
519
    // iteration. Otherwise, we construct the vector values using insertelement
1929
519
    // instructions. Since the resulting vectors are stored in
1930
519
    // VectorLoopValueMap, we will only generate the insertelements once.
1931
519
    Value *VectorValue = nullptr;
1932
519
    if (Cost->isUniformAfterVectorization(I, VF)) {
1933
0
      VectorValue = getBroadcastInstrs(ScalarValue);
1934
0
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
1935
519
    } else {
1936
519
      // Initialize packing with insertelements to start from undef.
1937
519
      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
1938
519
      VectorLoopValueMap.setVectorValue(V, Part, Undef);
1939
2.27k
      for (unsigned Lane = 0; Lane < VF; ++Lane)
1940
1.75k
        packScalarIntoVectorValue(V, {Part, Lane});
1941
519
      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
1942
519
    }
1943
519
    Builder.restoreIP(OldIP);
1944
519
    return VectorValue;
1945
519
  }
1946
46.6k
1947
46.6k
  // If this scalar is unknown, assume that it is a constant or that it is
1948
46.6k
  // loop invariant. Broadcast V and save the value for future uses.
1949
46.6k
  Value *B = getBroadcastInstrs(V);
1950
46.6k
  VectorLoopValueMap.setVectorValue(V, Part, B);
1951
46.6k
  return B;
1952
46.6k
}
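// A minimal scalar sketch (hypothetical values) of the packing path of
// getOrCreateVectorValue: when a value was scalarized but is needed as a
// vector, the per-lane scalars are inserted one by one into a vector that
// starts out undefined, which is what the insertelement chain above builds.
#include <array>
#include <cstdio>

int main() {
  constexpr unsigned VF = 4;
  std::array<int, VF> ScalarLanes{3, 1, 4, 1}; // per-lane scalarized values
  std::array<int, VF> VectorValue{};           // stands in for the undef start
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    VectorValue[Lane] = ScalarLanes[Lane];     // packScalarIntoVectorValue
  for (int V : VectorValue)
    std::printf("%d ", V);                     // 3 1 4 1
  std::printf("\n");
  return 0;
}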
1953
1954
Value *
1955
InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
1956
189k
                                            const VPIteration &Instance) {
1957
189k
  // If the value is not an instruction contained in the loop, it should
1958
189k
  // already be scalar.
1959
189k
  if (OrigLoop->isLoopInvariant(V))
1960
91.7k
    return V;
1961
98.1k
1962
98.1k
  assert(Instance.Lane > 0
1963
98.1k
             ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
1964
98.1k
             : true && "Uniform values only have lane zero");
1965
98.1k
1966
98.1k
  // If the value from the original loop has not been vectorized, it is
1967
98.1k
  // represented by UF x VF scalar values in the new loop. Return the requested
1968
98.1k
  // scalar value.
1969
98.1k
  if (VectorLoopValueMap.hasScalarValue(V, Instance))
1970
92.2k
    return VectorLoopValueMap.getScalarValue(V, Instance);
1971
5.90k
1972
5.90k
  // If the value has not been scalarized, get its entry in VectorLoopValueMap
1973
5.90k
  // for the given unroll part. If this entry is not a vector type (i.e., the
1974
5.90k
  // vectorization factor is one), there is no need to generate an
1975
5.90k
  // extractelement instruction.
1976
5.90k
  auto *U = getOrCreateVectorValue(V, Instance.Part);
1977
5.90k
  if (!U->getType()->isVectorTy()) {
1978
4.27k
    assert(VF == 1 && "Value not scalarized has non-vector type");
1979
4.27k
    return U;
1980
4.27k
  }
1981
1.63k
1982
1.63k
  // Otherwise, the value from the original loop has been vectorized and is
1983
1.63k
  // represented by UF vector values. Extract and return the requested scalar
1984
1.63k
  // value from the appropriate vector lane.
1985
1.63k
  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
1986
1.63k
}
1987
1988
void InnerLoopVectorizer::packScalarIntoVectorValue(
1989
1.93k
    Value *V, const VPIteration &Instance) {
1990
1.93k
  assert(V != Induction && "The new induction variable should not be used.");
1991
1.93k
  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
1992
1.93k
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1993
1.93k
1994
1.93k
  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
1995
1.93k
  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
1996
1.93k
  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
1997
1.93k
                                            Builder.getInt32(Instance.Lane));
1998
1.93k
  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
1999
1.93k
}
2000
2001
1.24k
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2002
1.24k
  assert(Vec->getType()->isVectorTy() && "Invalid type");
2003
1.24k
  SmallVector<Constant *, 8> ShuffleMask;
2004
14.3k
  for (unsigned i = 0; i < VF; ++i)
2005
13.1k
    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2006
1.24k
2007
1.24k
  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2008
1.24k
                                     ConstantVector::get(ShuffleMask),
2009
1.24k
                                     "reverse");
2010
1.24k
}
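// A minimal scalar sketch (hypothetical values) of reverseVector: the shuffle
// mask <VF-1, ..., 1, 0> built above simply reverses the lane order.
#include <array>
#include <cstdio>

int main() {
  constexpr unsigned VF = 4;
  std::array<int, VF> Vec{10, 20, 30, 40};
  std::array<int, VF> Reversed;
  for (unsigned I = 0; I < VF; ++I)
    Reversed[I] = Vec[VF - I - 1];   // lane I reads mask element VF - I - 1
  for (int V : Reversed)
    std::printf("%d ", V);           // 40 30 20 10
  std::printf("\n");
  return 0;
}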
2011
2012
// Return whether we allow using masked interleave-groups (for dealing with
2013
// strided loads/stores that reside in predicated blocks, or for dealing
2014
// with gaps).
2015
19.6k
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2016
19.6k
  // If an override option has been passed in for interleaved accesses, use it.
2017
19.6k
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2018
24
    return EnableMaskedInterleavedMemAccesses;
2019
19.5k
2020
19.5k
  return TTI.enableMaskedInterleavedAccessVectorization();
2021
19.5k
}
2022
2023
// Try to vectorize the interleave group that \p Instr belongs to.
2024
//
2025
// E.g. Translate following interleaved load group (factor = 3):
2026
//   for (i = 0; i < N; i+=3) {
2027
//     R = Pic[i];             // Member of index 0
2028
//     G = Pic[i+1];           // Member of index 1
2029
//     B = Pic[i+2];           // Member of index 2
2030
//     ... // do something to R, G, B
2031
//   }
2032
// To:
2033
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2034
//   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2035
//   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2036
//   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2037
//
2038
// Or translate following interleaved store group (factor = 3):
2039
//   for (i = 0; i < N; i+=3) {
2040
//     ... do something to R, G, B
2041
//     Pic[i]   = R;           // Member of index 0
2042
//     Pic[i+1] = G;           // Member of index 1
2043
//     Pic[i+2] = B;           // Member of index 2
2044
//   }
2045
// To:
2046
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2047
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2048
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2049
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2050
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2051
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2052
669
                                                   VectorParts *BlockInMask) {
2053
669
  const InterleaveGroup<Instruction> *Group =
2054
669
      Cost->getInterleavedAccessGroup(Instr);
2055
669
  assert(Group && "Fail to get an interleaved access group.");
2056
669
2057
669
  // Skip if current instruction is not the insert position.
2058
669
  if (Instr != Group->getInsertPos())
2059
0
    return;
2060
669
2061
669
  const DataLayout &DL = Instr->getModule()->getDataLayout();
2062
669
  Value *Ptr = getLoadStorePointerOperand(Instr);
2063
669
2064
669
  // Prepare for the vector type of the interleaved load/store.
2065
669
  Type *ScalarTy = getMemInstValueType(Instr);
2066
669
  unsigned InterleaveFactor = Group->getFactor();
2067
669
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2068
669
  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2069
669
2070
669
  // Prepare for the new pointers.
2071
669
  setDebugLocFromInst(Builder, Ptr);
2072
669
  SmallVector<Value *, 2> NewPtrs;
2073
669
  unsigned Index = Group->getIndex(Instr);
2074
669
2075
669
  VectorParts Mask;
2076
669
  bool IsMaskForCondRequired = BlockInMask;
2077
669
  if (IsMaskForCondRequired) {
2078
11
    Mask = *BlockInMask;
2079
11
    // TODO: extend the masked interleaved-group support to reversed access.
2080
11
    assert(!Group->isReverse() && "Reversed masked interleave-group "
2081
11
                                  "not supported.");
2082
11
  }
2083
669
2084
669
  // If the group is reverse, adjust the index to refer to the last vector lane
2085
669
  // instead of the first. We adjust the index from the first vector lane,
2086
669
  // rather than directly getting the pointer for lane VF - 1, because the
2087
669
  // pointer operand of the interleaved access is supposed to be uniform. For
2088
669
  // uniform instructions, we're only required to generate a value for the
2089
669
  // first vector lane in each unroll iteration.
2090
669
  if (Group->isReverse())
2091
4
    Index += (VF - 1) * Group->getFactor();
2092
669
2093
669
  bool InBounds = false;
2094
669
  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2095
567
    InBounds = gep->isInBounds();
2096
669
2097
1.76k
  for (unsigned Part = 0; Part < UF; Part++) {
2098
1.09k
    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2099
1.09k
2100
1.09k
    // Note that the current instruction could be at any index. We need to adjust the address
2101
1.09k
    // to the member of index 0.
2102
1.09k
    //
2103
1.09k
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2104
1.09k
    //       b = A[i];       // Member of index 0
2105
1.09k
    // Current pointer is pointed to A[i+1], adjust it to A[i].
2106
1.09k
    //
2107
1.09k
    // E.g.  A[i+1] = a;     // Member of index 1
2108
1.09k
    //       A[i]   = b;     // Member of index 0
2109
1.09k
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
2110
1.09k
    // Current pointer is pointed to A[i+2], adjust it to A[i].
2111
1.09k
    NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2112
1.09k
    if (InBounds)
2113
860
      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2114
1.09k
2115
1.09k
    // Cast to the vector pointer type.
2116
1.09k
    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2117
1.09k
  }
2118
669
2119
669
  setDebugLocFromInst(Builder, Instr);
2120
669
  Value *UndefVec = UndefValue::get(VecTy);
2121
669
2122
669
  Value *MaskForGaps = nullptr;
2123
669
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2124
5
    MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2125
5
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
2126
5
  }
2127
669
2128
669
  // Vectorize the interleaved load group.
2129
669
  if (isa<LoadInst>(Instr)) {
2130
324
    // For each unroll part, create a wide load for the group.
2131
324
    SmallVector<Value *, 2> NewLoads;
2132
841
    for (unsigned Part = 0; Part < UF; Part++) {
2133
517
      Instruction *NewLoad;
2134
517
      if (IsMaskForCondRequired || MaskForGaps) {
2135
9
        assert(useMaskedInterleavedAccesses(*TTI) &&
2136
9
               "masked interleaved groups are not allowed.");
2137
9
        Value *GroupMask = MaskForGaps;
2138
9
        if (IsMaskForCondRequired) {
2139
8
          auto *Undefs = UndefValue::get(Mask[Part]->getType());
2140
8
          auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2141
8
          Value *ShuffledMask = Builder.CreateShuffleVector(
2142
8
              Mask[Part], Undefs, RepMask, "interleaved.mask");
2143
8
          GroupMask = MaskForGaps
2144
8
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2145
4
                                                MaskForGaps)
2146
8
                          : ShuffledMask;
2147
8
        }
2148
9
        NewLoad =
2149
9
            Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2150
9
                                     GroupMask, UndefVec, "wide.masked.vec");
2151
9
      }
2152
508
      else
2153
508
        NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2154
508
                                            Group->getAlignment(), "wide.vec");
2155
517
      Group->addMetadata(NewLoad);
2156
517
      NewLoads.push_back(NewLoad);
2157
517
    }
2158
324
2159
324
    // For each member in the group, shuffle out the appropriate data from the
2160
324
    // wide loads.
2161
1.17k
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
2162
849
      Instruction *Member = Group->getMember(I);
2163
849
2164
849
      // Skip the gaps in the group.
2165
849
      if (!Member)
2166
137
        continue;
2167
712
2168
712
      Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2169
1.89k
      for (unsigned Part = 0; Part < UF; Part++) {
2170
1.17k
        Value *StridedVec = Builder.CreateShuffleVector(
2171
1.17k
            NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2172
1.17k
2173
1.17k
        // If this member has different type, cast the result type.
2174
1.17k
        if (Member->getType() != ScalarTy) {
2175
7
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2176
7
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2177
7
        }
2178
1.17k
2179
1.17k
        if (Group->isReverse())
2180
6
          StridedVec = reverseVector(StridedVec);
2181
1.17k
2182
1.17k
        VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2183
1.17k
      }
2184
712
    }
2185
324
    return;
2186
324
  }
2187
345
2188
345
  // The sub vector type for current instruction.
2189
345
  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2190
345
2191
345
  // Vectorize the interleaved store group.
2192
922
  for (unsigned Part = 0; Part < UF; Part++) {
2193
577
    // Collect the stored vector from each member.
2194
577
    SmallVector<Value *, 4> StoredVecs;
2195
1.90k
    for (unsigned i = 0; i < InterleaveFactor; i++) {
2196
1.33k
      // Interleaved store group doesn't allow a gap, so each index has a member
2197
1.33k
      Instruction *Member = Group->getMember(i);
2198
1.33k
      assert(Member && "Fail to get a member from an interleaved store group");
2199
1.33k
2200
1.33k
      Value *StoredVec = getOrCreateVectorValue(
2201
1.33k
          cast<StoreInst>(Member)->getValueOperand(), Part);
2202
1.33k
      if (Group->isReverse())
2203
2
        StoredVec = reverseVector(StoredVec);
2204
1.33k
2205
1.33k
      // If this member has different type, cast it to a unified type.
2206
1.33k
2207
1.33k
      if (StoredVec->getType() != SubVT)
2208
7
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2209
1.33k
2210
1.33k
      StoredVecs.push_back(StoredVec);
2211
1.33k
    }
2212
577
2213
577
    // Concatenate all vectors into a wide vector.
2214
577
    Value *WideVec = concatenateVectors(Builder, StoredVecs);
2215
577
2216
577
    // Interleave the elements in the wide vector.
2217
577
    Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2218
577
    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2219
577
                                              "interleaved.vec");
2220
577
2221
577
    Instruction *NewStoreInstr;
2222
577
    if (IsMaskForCondRequired) {
2223
3
      auto *Undefs = UndefValue::get(Mask[Part]->getType());
2224
3
      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2225
3
      Value *ShuffledMask = Builder.CreateShuffleVector(
2226
3
          Mask[Part], Undefs, RepMask, "interleaved.mask");
2227
3
      NewStoreInstr = Builder.CreateMaskedStore(
2228
3
          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2229
3
    }
2230
574
    else
2231
574
      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], 
2232
574
        Group->getAlignment());
2233
577
2234
577
    Group->addMetadata(NewStoreInstr);
2235
577
  }
2236
345
}
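// A minimal scalar sketch (hypothetical values) of the de-interleaving
// shuffles used for a load group with Factor = 3 and VF = 4: member I is
// pulled out of the 12-element wide load with the stride mask
// <I, I+3, I+6, I+9>, matching the R/G/B example in the comment above.
#include <array>
#include <cstdio>

int main() {
  constexpr unsigned Factor = 3, VF = 4;
  std::array<int, Factor * VF> WideVec;       // models %wide.vec
  for (unsigned I = 0; I < Factor * VF; ++I)
    WideVec[I] = int(I);                      // R0 G0 B0 R1 G1 B1 ...
  for (unsigned Member = 0; Member < Factor; ++Member) {
    std::array<int, VF> StridedVec;           // models %R.vec, %G.vec, %B.vec
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      StridedVec[Lane] = WideVec[Lane * Factor + Member];
    for (int V : StridedVec)
      std::printf("%d ", V);                  // 0 3 6 9 / 1 4 7 10 / 2 5 8 11
    std::printf("\n");
  }
  return 0;
}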
2237
2238
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2239
21.6k
                                                     VectorParts *BlockInMask) {
2240
21.6k
  // Attempt to issue a wide load.
2241
21.6k
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2242
21.6k
  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2243
21.6k
2244
21.6k
  assert((LI || SI) && "Invalid Load/Store instruction");
2245
21.6k
2246
21.6k
  LoopVectorizationCostModel::InstWidening Decision =
2247
21.6k
      Cost->getWideningDecision(Instr, VF);
2248
21.6k
  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2249
21.6k
         "CM decision should be taken at this point");
2250
21.6k
  if (Decision == LoopVectorizationCostModel::CM_Interleave)
2251
0
    return vectorizeInterleaveGroup(Instr);
2252
21.6k
2253
21.6k
  Type *ScalarDataTy = getMemInstValueType(Instr);
2254
21.6k
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2255
21.6k
  Value *Ptr = getLoadStorePointerOperand(Instr);
2256
21.6k
  unsigned Alignment = getLoadStoreAlignment(Instr);
2257
21.6k
  // An alignment of 0 means target abi alignment. We need to use the scalar's
2258
21.6k
  // target abi alignment in such a case.
2259
21.6k
  const DataLayout &DL = Instr->getModule()->getDataLayout();
2260
21.6k
  if (!Alignment)
2261
79
    Alignment = DL.getABITypeAlignment(ScalarDataTy);
2262
21.6k
  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2263
21.6k
2264
21.6k
  // Determine if the pointer operand of the access is either consecutive or
2265
21.6k
  // reverse consecutive.
2266
21.6k
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2267
21.6k
  bool ConsecutiveStride =
2268
21.6k
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2269
21.6k
  bool CreateGatherScatter =
2270
21.6k
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2271
21.6k
2272
21.6k
  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2273
21.6k
  // gather/scatter. Otherwise Decision should have been to Scalarize.
2274
21.6k
  assert((ConsecutiveStride || CreateGatherScatter) &&
2275
21.6k
         "The instruction should be scalarized");
2276
21.6k
2277
21.6k
  // Handle consecutive loads/stores.
2278
21.6k
  if (ConsecutiveStride)
2279
21.5k
    Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2280
21.6k
2281
21.6k
  VectorParts Mask;
2282
21.6k
  bool isMaskRequired = BlockInMask;
2283
21.6k
  if (isMaskRequired)
2284
95
    Mask = *BlockInMask;
2285
21.6k
2286
21.6k
  bool InBounds = false;
2287
21.6k
  if (auto *gep = dyn_cast<GetElementPtrInst>(
2288
19.2k
          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2289
19.2k
    InBounds = gep->isInBounds();
2290
21.6k
2291
40.2k
  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2292
40.2k
    // Calculate the pointer for the specific unroll-part.
2293
40.2k
    GetElementPtrInst *PartPtr = nullptr;
2294
40.2k
2295
40.2k
    if (Reverse) {
2296
1.21k
      // If the address is consecutive but reversed, then the
2297
1.21k
      // wide store needs to start at the last vector element.
2298
1.21k
      PartPtr = cast<GetElementPtrInst>(
2299
1.21k
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2300
1.21k
      PartPtr->setIsInBounds(InBounds);
2301
1.21k
      PartPtr = cast<GetElementPtrInst>(
2302
1.21k
          Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2303
1.21k
      PartPtr->setIsInBounds(InBounds);
2304
1.21k
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2305
16
        Mask[Part] = reverseVector(Mask[Part]);
2306
39.0k
    } else {
2307
39.0k
      PartPtr = cast<GetElementPtrInst>(
2308
39.0k
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2309
39.0k
      PartPtr->setIsInBounds(InBounds);
2310
39.0k
    }
2311
40.2k
2312
40.2k
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2313
40.2k
  };
2314
21.6k
2315
21.6k
  // Handle Stores:
2316
21.6k
  if (SI) {
2317
14.4k
    setDebugLocFromInst(Builder, SI);
2318
14.4k
2319
42.2k
    for (unsigned Part = 0; Part < UF; ++Part) {
2320
27.7k
      Instruction *NewSI = nullptr;
2321
27.7k
      Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2322
27.7k
      if (CreateGatherScatter) {
2323
29
        Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2324
29
        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2325
29
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2326
29
                                            MaskPart);
2327
27.7k
      } else {
2328
27.7k
        if (Reverse) {
2329
574
          // If we store to reverse consecutive memory locations, then we need
2330
574
          // to reverse the order of elements in the stored value.
2331
574
          StoredVal = reverseVector(StoredVal);
2332
574
          // We don't want to update the value in the map as it might be used in
2333
574
          // another expression. So don't call resetVectorValue(StoredVal).
2334
574
        }
2335
27.7k
        auto *VecPtr = CreateVecPtr(Part, Ptr);
2336
27.7k
        if (isMaskRequired)
2337
100
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2338
100
                                            Mask[Part]);
2339
27.6k
        else
2340
27.6k
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2341
27.7k
      }
2342
27.7k
      addMetadata(NewSI, SI);
2343
27.7k
    }
2344
14.4k
    return;
2345
14.4k
  }
2346
7.20k
2347
7.20k
  // Handle loads.
2348
7.20k
  assert(LI && "Must have a load instruction");
2349
7.20k
  setDebugLocFromInst(Builder, LI);
2350
19.7k
  for (unsigned Part = 0; Part < UF; ++Part) {
2351
12.5k
    Value *NewLI;
2352
12.5k
    if (CreateGatherScatter) {
2353
58
      Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2354
58
      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2355
58
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2356
58
                                         nullptr, "wide.masked.gather");
2357
58
      addMetadata(NewLI, LI);
2358
12.4k
    } else {
2359
12.4k
      auto *VecPtr = CreateVecPtr(Part, Ptr);
2360
12.4k
      if (isMaskRequired)
2361
80
        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2362
80
                                         UndefValue::get(DataTy),
2363
80
                                         "wide.masked.load");
2364
12.4k
      else
2365
12.4k
        NewLI =
2366
12.4k
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2367
12.4k
2368
12.4k
      // Add metadata to the load, but setVectorValue to the reverse shuffle.
2369
12.4k
      addMetadata(NewLI, LI);
2370
12.4k
      if (Reverse)
2371
643
        NewLI = reverseVector(NewLI);
2372
12.4k
    }
2373
12.5k
    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2374
12.5k
  }
2375
7.20k
}
2376
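Note on the pointer math in CreateVecPtr above: for a consecutive access, unroll part `Part` starts Part * VF elements past the scalar pointer; for a reverse-consecutive access it starts at -Part * VF + (1 - VF), i.e. at the last element of the reversed group. A minimal standalone C++ sketch, not part of the vectorizer (partStartOffset is a hypothetical helper introduced only for illustration):

#include <cassert>

// Offset, in elements, of the first element touched by unroll part `Part`,
// relative to the scalar pointer of the current wide iteration. Mirrors the
// GEP math in CreateVecPtr: Part * VF for a forward access, and
// -Part * VF + (1 - VF) for a reverse-consecutive one.
int partStartOffset(unsigned Part, unsigned VF, bool Reverse) {
  int P = static_cast<int>(Part), W = static_cast<int>(VF);
  return Reverse ? -P * W + (1 - W) : P * W;
}

int main() {
  assert(partStartOffset(0, 4, false) == 0);   // forward, part 0
  assert(partStartOffset(1, 4, false) == 4);   // forward, part 1
  assert(partStartOffset(0, 4, true) == -3);   // reverse, part 0
  assert(partStartOffset(1, 4, true) == -7);   // reverse, part 1
  return 0;
}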
2377
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2378
                                               const VPIteration &Instance,
2379
71.6k
                                               bool IfPredicateInstr) {
2380
71.6k
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2381
71.6k
2382
71.6k
  setDebugLocFromInst(Builder, Instr);
2383
71.6k
2384
71.6k
  // Does this instruction return a value ?
2385
71.6k
  bool IsVoidRetTy = Instr->getType()->isVoidTy();
2386
71.6k
2387
71.6k
  Instruction *Cloned = Instr->clone();
2388
71.6k
  if (!IsVoidRetTy)
2389
68.4k
    Cloned->setName(Instr->getName() + ".cloned");
2390
71.6k
2391
71.6k
  // Replace the operands of the cloned instructions with their scalar
2392
71.6k
  // equivalents in the new loop.
2393
238k
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2394
166k
    auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2395
166k
    Cloned->setOperand(op, NewOp);
2396
166k
  }
2397
71.6k
  addNewMetadata(Cloned, Instr);
2398
71.6k
2399
71.6k
  // Place the cloned scalar in the new loop.
2400
71.6k
  Builder.Insert(Cloned);
2401
71.6k
2402
71.6k
  // Add the cloned scalar to the scalar map entry.
2403
71.6k
  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2404
71.6k
2405
71.6k
  // If we just cloned a new assumption, add it the assumption cache.
2406
71.6k
  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2407
64
    if (II->getIntrinsicID() == Intrinsic::assume)
2408
24
      AC->registerAssumption(II);
2409
71.6k
2410
71.6k
  // End if-block.
2411
71.6k
  if (IfPredicateInstr)
2412
948
    PredicatedInstructions.push_back(Cloned);
2413
71.6k
}
2414
2415
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2416
                                                      Value *End, Value *Step,
2417
17.0k
                                                      Instruction *DL) {
2418
17.0k
  BasicBlock *Header = L->getHeader();
2419
17.0k
  BasicBlock *Latch = L->getLoopLatch();
2420
17.0k
  // As we're just creating this loop, it's possible no latch exists
2421
17.0k
  // yet. If so, use the header as this will be a single block loop.
2422
17.0k
  if (!Latch)
2423
17.0k
    Latch = Header;
2424
17.0k
2425
17.0k
  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2426
17.0k
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2427
17.0k
  setDebugLocFromInst(Builder, OldInst);
2428
17.0k
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2429
17.0k
2430
17.0k
  Builder.SetInsertPoint(Latch->getTerminator());
2431
17.0k
  setDebugLocFromInst(Builder, OldInst);
2432
17.0k
2433
17.0k
  // Create i+1 and fill the PHINode.
2434
17.0k
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2435
17.0k
  Induction->addIncoming(Start, L->getLoopPreheader());
2436
17.0k
  Induction->addIncoming(Next, Latch);
2437
17.0k
  // Create the compare.
2438
17.0k
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
2439
17.0k
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2440
17.0k
2441
17.0k
  // Now we have two terminators. Remove the old one from the block.
2442
17.0k
  Latch->getTerminator()->eraseFromParent();
2443
17.0k
2444
17.0k
  return Induction;
2445
17.0k
}
2446
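The skeleton built by createInductionVariable above behaves, roughly, like the following scalar loop, assuming Start = 0 and that End - Start is a non-zero multiple of Step (which the minimum-iteration-count check guarantees). This is an illustrative sketch only; canonicalLoop and body are hypothetical names:

#include <cstdint>

// Conceptual shape only: body() stands in for the widened vector body. The
// real code builds an "index" PHI, an add, an icmp eq against End and a
// conditional branch to the exit block.
void canonicalLoop(uint64_t Start, uint64_t End, uint64_t Step,
                   void (*body)(uint64_t)) {
  uint64_t Index = Start;            // the "index" PHI
  do {
    body(Index);                     // vector.body
    Index += Step;                   // index.next = index + Step
  } while (Index != End);            // icmp eq index.next, End -> exit
}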
2447
68.2k
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2448
68.2k
  if (TripCount)
2449
51.1k
    return TripCount;
2450
17.0k
2451
17.0k
  assert(L && "Create Trip Count for null loop.");
2452
17.0k
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2453
17.0k
  // Find the loop boundaries.
2454
17.0k
  ScalarEvolution *SE = PSE.getSE();
2455
17.0k
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2456
17.0k
  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2457
17.0k
         "Invalid loop count");
2458
17.0k
2459
17.0k
  Type *IdxTy = Legal->getWidestInductionType();
2460
17.0k
  assert(IdxTy && "No type for induction");
2461
17.0k
2462
17.0k
  // The exit count might have the type of i64 while the phi is i32. This can
2463
17.0k
  // happen if we have an induction variable that is sign extended before the
2464
17.0k
  // compare. The only way that we get a backedge taken count is that the
2465
17.0k
  // induction variable was signed and as such will not overflow. In such a case
2466
17.0k
  // truncation is legal.
2467
17.0k
  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2468
17.0k
      IdxTy->getPrimitiveSizeInBits())
2469
2
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2470
17.0k
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2471
17.0k
2472
17.0k
  // Get the total trip count from the count by adding 1.
2473
17.0k
  const SCEV *ExitCount = SE->getAddExpr(
2474
17.0k
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2475
17.0k
2476
17.0k
  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2477
17.0k
2478
17.0k
  // Expand the trip count and place the new instructions in the preheader.
2479
17.0k
  // Notice that the pre-header does not change, only the loop body.
2480
17.0k
  SCEVExpander Exp(*SE, DL, "induction");
2481
17.0k
2482
17.0k
  // Count holds the overall loop count (N).
2483
17.0k
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2484
17.0k
                                L->getLoopPreheader()->getTerminator());
2485
17.0k
2486
17.0k
  if (TripCount->getType()->isPointerTy())
2487
159
    TripCount =
2488
159
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2489
159
                                    L->getLoopPreheader()->getTerminator());
2490
17.0k
2491
17.0k
  return TripCount;
2492
17.0k
}
2493
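Worked sketch of the trip-count computation above: the backedge-taken count is first brought to the width of the widest induction type, then incremented by one; the +1 may wrap to zero, which the separate minimum-iteration-count check accounts for. Illustration only, assuming a hypothetical i32 induction type:

#include <cassert>
#include <cstdint>

// Trip count = backedge-taken count + 1, computed in the width of IdxTy.
uint32_t tripCountForI32IV(uint64_t BackedgeTakenCount) {
  uint32_t BTC = static_cast<uint32_t>(BackedgeTakenCount); // to IdxTy (i32)
  return BTC + 1;                     // wraps to 0 when BTC == UINT32_MAX
}

int main() {
  assert(tripCountForI32IV(99) == 100);         // the loop runs 100 times
  assert(tripCountForI32IV(0xFFFFFFFFu) == 0);  // overflow corner case
  return 0;
}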
2494
37.3k
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2495
37.3k
  if (VectorTripCount)
2496
20.3k
    return VectorTripCount;
2497
17.0k
2498
17.0k
  Value *TC = getOrCreateTripCount(L);
2499
17.0k
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2500
17.0k
2501
17.0k
  Type *Ty = TC->getType();
2502
17.0k
  Constant *Step = ConstantInt::get(Ty, VF * UF);
2503
17.0k
2504
17.0k
  // If the tail is to be folded by masking, round the number of iterations N
2505
17.0k
  // up to a multiple of Step instead of rounding down. This is done by first
2506
17.0k
  // adding Step-1 and then rounding down. Note that it's ok if this addition
2507
17.0k
  // overflows: the vector induction variable will eventually wrap to zero given
2508
17.0k
  // that it starts at zero and its Step is a power of two; the loop will then
2509
17.0k
  // exit, with the last early-exit vector comparison also producing all-true.
2510
17.0k
  if (Cost->foldTailByMasking()) {
2511
18
    assert(isPowerOf2_32(VF * UF) &&
2512
18
           "VF*UF must be a power of 2 when folding tail by masking");
2513
18
    TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2514
18
  }
2515
17.0k
2516
17.0k
  // Now we need to generate the expression for the part of the loop that the
2517
17.0k
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
2518
17.0k
  // iterations are not required for correctness, or N - Step, otherwise. Step
2519
17.0k
  // is equal to the vectorization factor (number of SIMD elements) times the
2520
17.0k
  // unroll factor (number of SIMD instructions).
2521
17.0k
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2522
17.0k
2523
17.0k
  // If there is a non-reversed interleaved group that may speculatively access
2524
17.0k
  // memory out-of-bounds, we need to ensure that there will be at least one
2525
17.0k
  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2526
17.0k
  // the trip count, we set the remainder to be equal to the step. If the step
2527
17.0k
  // does not evenly divide the trip count, no adjustment is necessary since
2528
17.0k
  // there will already be scalar iterations. Note that the minimum iterations
2529
17.0k
  // check ensures that N >= Step.
2530
17.0k
  if (VF > 1 && Cost->requiresScalarEpilogue()) {
2531
56
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2532
56
    R = Builder.CreateSelect(IsZero, Step, R);
2533
56
  }
2534
17.0k
2535
17.0k
  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2536
17.0k
2537
17.0k
  return VectorTripCount;
2538
17.0k
}
2539
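The arithmetic above, restated as a small standalone example (vectorTripCount is a hypothetical name; Step stands for VF * UF, and in practice tail folding and a required scalar epilogue do not occur together):

#include <cassert>

// Tail folding rounds N up to a multiple of Step; a required scalar
// epilogue forces at least one remainder iteration.
unsigned vectorTripCount(unsigned N, unsigned Step, bool FoldTail,
                         bool RequiresScalarEpilogue) {
  if (FoldTail)
    N += Step - 1;                   // n.rnd.up
  unsigned R = N % Step;             // n.mod.vf
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                        // keep a scalar epilogue iteration
  return N - R;                      // n.vec
}

int main() {
  assert(vectorTripCount(100, 16, false, false) == 96);  // 100 - 100 % 16
  assert(vectorTripCount(100, 16, true, false) == 112);  // rounded up
  assert(vectorTripCount(96, 16, false, true) == 80);    // forced remainder
  return 0;
}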
2540
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2541
14
                                                   const DataLayout &DL) {
2542
14
  // Verify that V is a vector type with same number of elements as DstVTy.
2543
14
  unsigned VF = DstVTy->getNumElements();
2544
14
  VectorType *SrcVecTy = cast<VectorType>(V->getType());
2545
14
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2546
14
  Type *SrcElemTy = SrcVecTy->getElementType();
2547
14
  Type *DstElemTy = DstVTy->getElementType();
2548
14
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2549
14
         "Vector elements must have same size");
2550
14
2551
14
  // Do a direct cast if element types are castable.
2552
14
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2553
2
    return Builder.CreateBitOrPointerCast(V, DstVTy);
2554
2
  }
2555
12
  // V cannot be directly casted to desired vector type.
2556
12
  // May happen when V is a floating point vector but DstVTy is a vector of
2557
12
  // pointers or vice-versa. Handle this using a two-step bitcast using an
2558
12
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2559
12
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2560
12
         "Only one type should be a pointer type");
2561
12
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2562
12
         "Only one type should be a floating point type");
2563
12
  Type *IntTy =
2564
12
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2565
12
  VectorType *VecIntTy = VectorType::get(IntTy, VF);
2566
12
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2567
12
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2568
12
}
2569
2570
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2571
17.0k
                                                         BasicBlock *Bypass) {
2572
17.0k
  Value *Count = getOrCreateTripCount(L);
2573
17.0k
  BasicBlock *BB = L->getLoopPreheader();
2574
17.0k
  IRBuilder<> Builder(BB->getTerminator());
2575
17.0k
2576
17.0k
  // Generate code to check if the loop's trip count is less than VF * UF, or
2577
17.0k
  // equal to it in case a scalar epilogue is required; this implies that the
2578
17.0k
  // vector trip count is zero. This check also covers the case where adding one
2579
17.0k
  // to the backedge-taken count overflowed leading to an incorrect trip count
2580
17.0k
  // of zero. In this case we will also jump to the scalar loop.
2581
17.0k
  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2582
17.0k
                                          : ICmpInst::ICMP_ULT;
2583
17.0k
2584
17.0k
  // If tail is to be folded, vector loop takes care of all iterations.
2585
17.0k
  Value *CheckMinIters = Builder.getFalse();
2586
17.0k
  if (!Cost->foldTailByMasking())
2587
17.0k
    CheckMinIters = Builder.CreateICmp(
2588
17.0k
        P, Count, ConstantInt::get(Count->getType(), VF * UF),
2589
17.0k
        "min.iters.check");
2590
17.0k
2591
17.0k
  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2592
17.0k
  // Update dominator tree immediately if the generated block is a
2593
17.0k
  // LoopBypassBlock because SCEV expansions to generate loop bypass
2594
17.0k
  // checks may query it before the current function is finished.
2595
17.0k
  DT->addNewBlock(NewBB, BB);
2596
17.0k
  if (L->getParentLoop())
2597
4.84k
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2598
17.0k
  ReplaceInstWithInst(BB->getTerminator(),
2599
17.0k
                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
2600
17.0k
  LoopBypassBlocks.push_back(BB);
2601
17.0k
}
2602
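A sketch of the bypass condition built above (skipVectorLoop is a hypothetical name): when a scalar epilogue is required the comparison is unsigned <=, so the vector loop is bypassed even when the trip count equals VF * UF; when the tail is folded by masking no bypass is emitted at all.

#include <cassert>

bool skipVectorLoop(unsigned Count, unsigned VF, unsigned UF,
                    bool RequiresScalarEpilogue, bool FoldTail) {
  if (FoldTail)
    return false;                        // vector loop covers everything
  unsigned Threshold = VF * UF;
  return RequiresScalarEpilogue ? Count <= Threshold   // ICMP_ULE
                                : Count < Threshold;   // ICMP_ULT
}

int main() {
  assert(!skipVectorLoop(16, 4, 4, false, false)); // 16 iterations suffice
  assert(skipVectorLoop(16, 4, 4, true, false));   // must keep a remainder
  assert(!skipVectorLoop(3, 4, 4, false, true));   // folded tail: no bypass
  return 0;
}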
2603
17.0k
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2604
17.0k
  BasicBlock *BB = L->getLoopPreheader();
2605
17.0k
2606
17.0k
  // Generate the code to check the SCEV assumptions that we made.
2607
17.0k
  // We want the new basic block to start at the first instruction in a
2608
17.0k
  // sequence of instructions that form a check.
2609
17.0k
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2610
17.0k
                   "scev.check");
2611
17.0k
  Value *SCEVCheck =
2612
17.0k
      Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2613
17.0k
2614
17.0k
  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2615
16.7k
    if (C->isZero())
2616
16.7k
      return;
2617
332
2618
332
  assert(!Cost->foldTailByMasking() &&
2619
332
         "Cannot SCEV check stride or overflow when folding tail");
2620
332
  // Create a new block containing the stride check.
2621
332
  BB->setName("vector.scevcheck");
2622
332
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2623
332
  // Update dominator tree immediately if the generated block is a
2624
332
  // LoopBypassBlock because SCEV expansions to generate loop bypass
2625
332
  // checks may query it before the current function is finished.
2626
332
  DT->addNewBlock(NewBB, BB);
2627
332
  if (L->getParentLoop())
2628
159
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2629
332
  ReplaceInstWithInst(BB->getTerminator(),
2630
332
                      BranchInst::Create(Bypass, NewBB, SCEVCheck));
2631
332
  LoopBypassBlocks.push_back(BB);
2632
332
  AddedSafetyChecks = true;
2633
332
}
2634
2635
17.0k
void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2636
17.0k
  // VPlan-native path does not do any analysis for runtime checks currently.
2637
17.0k
  if (EnableVPlanNativePath)
2638
7
    return;
2639
17.0k
2640
17.0k
  BasicBlock *BB = L->getLoopPreheader();
2641
17.0k
2642
17.0k
  // Generate the code that checks in runtime if arrays overlap. We put the
2643
17.0k
  // checks into a separate block to make the more common case of few elements
2644
17.0k
  // faster.
2645
17.0k
  Instruction *FirstCheckInst;
2646
17.0k
  Instruction *MemRuntimeCheck;
2647
17.0k
  std::tie(FirstCheckInst, MemRuntimeCheck) =
2648
17.0k
      Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2649
17.0k
  if (!MemRuntimeCheck)
2650
14.5k
    return;
2651
2.48k
2652
2.48k
  assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
2653
2.48k
  // Create a new block containing the memory check.
2654
2.48k
  BB->setName("vector.memcheck");
2655
2.48k
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2656
2.48k
  // Update dominator tree immediately if the generated block is a
2657
2.48k
  // LoopBypassBlock because SCEV expansions to generate loop bypass
2658
2.48k
  // checks may query it before the current function is finished.
2659
2.48k
  DT->addNewBlock(NewBB, BB);
2660
2.48k
  if (L->getParentLoop())
2661
887
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2662
2.48k
  ReplaceInstWithInst(BB->getTerminator(),
2663
2.48k
                      BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2664
2.48k
  LoopBypassBlocks.push_back(BB);
2665
2.48k
  AddedSafetyChecks = true;
2666
2.48k
2667
2.48k
  // We currently don't use LoopVersioning for the actual loop cloning but we
2668
2.48k
  // still use it to add the noalias metadata.
2669
2.48k
  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2670
2.48k
                                           PSE.getSE());
2671
2.48k
  LVer->prepareNoAliasMetadata();
2672
2.48k
}
2673
2674
Value *InnerLoopVectorizer::emitTransformedIndex(
2675
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2676
17.7k
    const InductionDescriptor &ID) const {
2677
17.7k
2678
17.7k
  SCEVExpander Exp(*SE, DL, "induction");
2679
17.7k
  auto Step = ID.getStep();
2680
17.7k
  auto StartValue = ID.getStartValue();
2681
17.7k
  assert(Index->getType() == Step->getType() &&
2682
17.7k
         "Index type does not match StepValue type");
2683
17.7k
2684
17.7k
  // Note: the IR at this point is broken. We cannot use SE to create any new
2685
17.7k
  // SCEV and then expand it, hoping that SCEV's simplification will give us
2686
17.7k
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
2687
17.7k
  // lead to various SCEV crashes. So all we can do is to use builder and rely
2688
17.7k
  // on InstCombine for future simplifications. Here we handle some trivial
2689
17.7k
  // cases only.
2690
17.7k
  auto CreateAdd = [&B](Value *X, Value *Y) {
2691
5.07k
    assert(X->getType() == Y->getType() && "Types don't match!");
2692
5.07k
    if (auto *CX = dyn_cast<ConstantInt>(X))
2693
3.56k
      if (CX->isZero())
2694
2.74k
        return Y;
2695
2.32k
    if (auto *CY = dyn_cast<ConstantInt>(Y))
2696
150
      if (CY->isZero())
2697
9
        return X;
2698
2.31k
    return B.CreateAdd(X, Y);
2699
2.31k
  };
2700
17.7k
2701
17.7k
  auto CreateMul = [&B](Value *X, Value *Y) {
2702
16.0k
    assert(X->getType() == Y->getType() && "Types don't match!");
2703
16.0k
    if (auto *CX = dyn_cast<ConstantInt>(X))
2704
2.70k
      if (CX->isOne())
2705
0
        return Y;
2706
16.0k
    if (auto *CY = dyn_cast<ConstantInt>(Y))
2707
15.6k
      if (CY->isOne())
2708
8.13k
        return X;
2709
7.93k
    return B.CreateMul(X, Y);
2710
7.93k
  };
2711
17.7k
2712
17.7k
  switch (ID.getKind()) {
2713
17.7k
  case InductionDescriptor::IK_IntInduction: {
2714
6.75k
    assert(Index->getType() == StartValue->getType() &&
2715
6.75k
           "Index type does not match StartValue type");
2716
6.75k
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2717
1.68k
      return B.CreateSub(StartValue, Index);
2718
5.07k
    auto *Offset = CreateMul(
2719
5.07k
        Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2720
5.07k
    return CreateAdd(StartValue, Offset);
2721
5.07k
  }
2722
10.9k
  case InductionDescriptor::IK_PtrInduction: {
2723
10.9k
    assert(isa<SCEVConstant>(Step) &&
2724
10.9k
           "Expected constant step for pointer induction");
2725
10.9k
    return B.CreateGEP(
2726
10.9k
        StartValue->getType()->getPointerElementType(), StartValue,
2727
10.9k
        CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2728
10.9k
                                           &*B.GetInsertPoint())));
2729
5.07k
  }
2730
5.07k
  case InductionDescriptor::IK_FpInduction: {
2731
36
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2732
36
    auto InductionBinOp = ID.getInductionBinOp();
2733
36
    assert(InductionBinOp &&
2734
36
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
2735
36
            InductionBinOp->getOpcode() == Instruction::FSub) &&
2736
36
           "Original bin op should be defined for FP induction");
2737
36
2738
36
    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2739
36
2740
36
    // Floating point operations had to be 'fast' to enable the induction.
2741
36
    FastMathFlags Flags;
2742
36
    Flags.setFast();
2743
36
2744
36
    Value *MulExp = B.CreateFMul(StepValue, Index);
2745
36
    if (isa<Instruction>(MulExp))
2746
36
      // We have to check, the MulExp may be a constant.
2747
36
      cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2748
36
2749
36
    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2750
36
                               "induction");
2751
36
    if (isa<Instruction>(BOp))
2752
36
      cast<Instruction>(BOp)->setFastMathFlags(Flags);
2753
36
2754
36
    return BOp;
2755
5.07k
  }
2756
5.07k
  case InductionDescriptor::IK_NoInduction:
2757
0
    return nullptr;
2758
0
  }
2759
0
  llvm_unreachable("invalid enum");
2760
0
}
2761
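For the integer-induction case above, the transformed index is simply StartValue + Index * Step, with a unit negative step folded into a subtraction; pointer inductions express the same product as a GEP offset and FP inductions as an fadd/fsub. A scalar sketch (transformedIntIndex is a hypothetical name):

#include <cassert>
#include <cstdint>

// Value of the IV after `Index` iterations: Start + Index * Step.
int64_t transformedIntIndex(int64_t Start, int64_t Step, int64_t Index) {
  if (Step == -1)
    return Start - Index;          // the CreateSub special case above
  return Start + Index * Step;     // CreateAdd(Start, CreateMul(Index, Step))
}

int main() {
  assert(transformedIntIndex(/*Start=*/5, /*Step=*/3, /*Index=*/8) == 29);
  assert(transformedIntIndex(/*Start=*/10, /*Step=*/-1, /*Index=*/4) == 6);
  return 0;
}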
2762
17.0k
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2763
17.0k
  /*
2764
17.0k
   In this function we generate a new loop. The new loop will contain
2765
17.0k
   the vectorized instructions while the old loop will continue to run the
2766
17.0k
   scalar remainder.
2767
17.0k
2768
17.0k
       [ ] <-- loop iteration number check.
2769
17.0k
    /   |
2770
17.0k
   /    v
2771
17.0k
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2772
17.0k
  |  /  |
2773
17.0k
  | /   v
2774
17.0k
  ||   [ ]     <-- vector pre header.
2775
17.0k
  |/    |
2776
17.0k
  |     v
2777
17.0k
  |    [  ] \
2778
17.0k
  |    [  ]_|   <-- vector loop.
2779
17.0k
  |     |
2780
17.0k
  |     v
2781
17.0k
  |   -[ ]   <--- middle-block.
2782
17.0k
  |  /  |
2783
17.0k
  | /   v
2784
17.0k
  -|- >[ ]     <--- new preheader.
2785
17.0k
   |    |
2786
17.0k
   |    v
2787
17.0k
   |   [ ] \
2788
17.0k
   |   [ ]_|   <-- old scalar loop to handle remainder.
2789
17.0k
    \   |
2790
17.0k
     \  v
2791
17.0k
      >[ ]     <-- exit block.
2792
17.0k
   ...
2793
17.0k
   */
2794
17.0k
2795
17.0k
  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2796
17.0k
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2797
17.0k
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2798
17.0k
  MDNode *OrigLoopID = OrigLoop->getLoopID();
2799
17.0k
  assert(VectorPH && "Invalid loop structure");
2800
17.0k
  assert(ExitBlock && "Must have an exit block");
2801
17.0k
2802
17.0k
  // Some loops have a single integer induction variable, while other loops
2803
17.0k
  // don't. One example is c++ iterators that often have multiple pointer
2804
17.0k
  // induction variables. In the code below we also support a case where we
2805
17.0k
  // don't have a single induction variable.
2806
17.0k
  //
2807
17.0k
  // We try to obtain an induction variable from the original loop as hard
2808
17.0k
  // as possible. However if we don't find one that:
2809
17.0k
  //   - is an integer
2810
17.0k
  //   - counts from zero, stepping by one
2811
17.0k
  //   - is the size of the widest induction variable type
2812
17.0k
  // then we create a new one.
2813
17.0k
  OldInduction = Legal->getPrimaryInduction();
2814
17.0k
  Type *IdxTy = Legal->getWidestInductionType();
2815
17.0k
2816
17.0k
  // Split the single block loop into the two loop structure described above.
2817
17.0k
  BasicBlock *VecBody =
2818
17.0k
      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2819
17.0k
  BasicBlock *MiddleBlock =
2820
17.0k
      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2821
17.0k
  BasicBlock *ScalarPH =
2822
17.0k
      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2823
17.0k
2824
17.0k
  // Create and register the new vector loop.
2825
17.0k
  Loop *Lp = LI->AllocateLoop();
2826
17.0k
  Loop *ParentLoop = OrigLoop->getParentLoop();
2827
17.0k
2828
17.0k
  // Insert the new loop into the loop nest and register the new basic blocks
2829
17.0k
  // before calling any utilities such as SCEV that require valid LoopInfo.
2830
17.0k
  if (ParentLoop) {
2831
4.84k
    ParentLoop->addChildLoop(Lp);
2832
4.84k
    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2833
4.84k
    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2834
12.2k
  } else {
2835
12.2k
    LI->addTopLevelLoop(Lp);
2836
12.2k
  }
2837
17.0k
  Lp->addBasicBlockToLoop(VecBody, *LI);
2838
17.0k
2839
17.0k
  // Find the loop boundaries.
2840
17.0k
  Value *Count = getOrCreateTripCount(Lp);
2841
17.0k
2842
17.0k
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
2843
17.0k
2844
17.0k
  // Now, compare the new count to zero. If it is zero skip the vector loop and
2845
17.0k
  // jump to the scalar loop. This check also covers the case where the
2846
17.0k
  // backedge-taken count is uint##_max: adding one to it will overflow leading
2847
17.0k
  // to an incorrect trip count of zero. In this (rare) case we will also jump
2848
17.0k
  // to the scalar loop.
2849
17.0k
  emitMinimumIterationCountCheck(Lp, ScalarPH);
2850
17.0k
2851
17.0k
  // Generate the code to check any assumptions that we've made for SCEV
2852
17.0k
  // expressions.
2853
17.0k
  emitSCEVChecks(Lp, ScalarPH);
2854
17.0k
2855
17.0k
  // Generate the code that checks in runtime if arrays overlap. We put the
2856
17.0k
  // checks into a separate block to make the more common case of few elements
2857
17.0k
  // faster.
2858
17.0k
  emitMemRuntimeChecks(Lp, ScalarPH);
2859
17.0k
2860
17.0k
  // Generate the induction variable.
2861
17.0k
  // The loop step is equal to the vectorization factor (num of SIMD elements)
2862
17.0k
  // times the unroll factor (num of SIMD instructions).
2863
17.0k
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2864
17.0k
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2865
17.0k
  Induction =
2866
17.0k
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2867
17.0k
                              getDebugLocFromInstOrOperands(OldInduction));
2868
17.0k
2869
17.0k
  // We are going to resume the execution of the scalar loop.
2870
17.0k
  // Go over all of the induction variables that we found and fix the
2871
17.0k
  // PHIs that are left in the scalar version of the loop.
2872
17.0k
  // The starting values of PHI nodes depend on the counter of the last
2873
17.0k
  // iteration in the vectorized loop.
2874
17.0k
  // If we come from a bypass edge then we need to start from the original
2875
17.0k
  // start value.
2876
17.0k
2877
17.0k
  // This variable saves the new starting index for the scalar loop. It is used
2878
17.0k
  // to test if there are any tail iterations left once the vector loop has
2879
17.0k
  // completed.
2880
17.0k
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2881
20.3k
  for (auto &InductionEntry : *List) {
2882
20.3k
    PHINode *OrigPhi = InductionEntry.first;
2883
20.3k
    InductionDescriptor II = InductionEntry.second;
2884
20.3k
2885
20.3k
    // Create phi nodes to merge from the  backedge-taken check block.
2886
20.3k
    PHINode *BCResumeVal = PHINode::Create(
2887
20.3k
        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2888
20.3k
    // Copy original phi DL over to the new one.
2889
20.3k
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2890
20.3k
    Value *&EndValue = IVEndValues[OrigPhi];
2891
20.3k
    if (OrigPhi == OldInduction) {
2892
13.2k
      // We know what the end value is.
2893
13.2k
      EndValue = CountRoundDown;
2894
13.2k
    } else {
2895
7.04k
      IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2896
7.04k
      Type *StepType = II.getStep()->getType();
2897
7.04k
      Instruction::CastOps CastOp =
2898
7.04k
        CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2899
7.04k
      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2900
7.04k
      const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2901
7.04k
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2902
7.04k
      EndValue->setName("ind.end");
2903
7.04k
    }
2904
20.3k
2905
20.3k
    // The new PHI merges the original incoming value, in case of a bypass,
2906
20.3k
    // or the value at the end of the vectorized loop.
2907
20.3k
    BCResumeVal->addIncoming(EndValue, MiddleBlock);
2908
20.3k
2909
20.3k
    // Fix the scalar body counter (PHI node).
2910
20.3k
    // The old induction's phi node in the scalar body needs the truncated
2911
20.3k
    // value.
2912
20.3k
    for (BasicBlock *BB : LoopBypassBlocks)
2913
24.3k
      BCResumeVal->addIncoming(II.getStartValue(), BB);
2914
20.3k
    OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
2915
20.3k
  }
2916
17.0k
2917
17.0k
  // We need the OrigLoop (scalar loop part) latch terminator to help
2918
17.0k
  // produce correct debug info for the middle block BB instructions.
2919
17.0k
  // The legality check stage guarantees that the loop will have a single
2920
17.0k
  // latch.
2921
17.0k
  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
2922
17.0k
         "Scalar loop latch terminator isn't a branch");
2923
17.0k
  BranchInst *ScalarLatchBr =
2924
17.0k
      cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
2925
17.0k
2926
17.0k
  // Add a check in the middle block to see if we have completed
2927
17.0k
  // all of the iterations in the first vector loop.
2928
17.0k
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
2929
17.0k
  // If tail is to be folded, we know we don't need to run the remainder.
2930
17.0k
  Value *CmpN = Builder.getTrue();
2931
17.0k
  if (!Cost->foldTailByMasking()) {
2932
17.0k
    CmpN =
2933
17.0k
        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
2934
17.0k
                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
2935
17.0k
2936
17.0k
    // Here we use the same DebugLoc as the scalar loop latch branch instead
2937
17.0k
    // of the corresponding compare because they may have ended up with
2938
17.0k
    // different line numbers and we want to avoid awkward line stepping while
2939
17.0k
    // debugging. Eg. if the compare has got a line number inside the loop.
2940
17.0k
    cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
2941
17.0k
  }
2942
17.0k
2943
17.0k
  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
2944
17.0k
  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
2945
17.0k
  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
2946
17.0k
2947
17.0k
  // Get ready to start creating new instructions into the vectorized body.
2948
17.0k
  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
2949
17.0k
2950
17.0k
  // Save the state.
2951
17.0k
  LoopVectorPreHeader = Lp->getLoopPreheader();
2952
17.0k
  LoopScalarPreHeader = ScalarPH;
2953
17.0k
  LoopMiddleBlock = MiddleBlock;
2954
17.0k
  LoopExitBlock = ExitBlock;
2955
17.0k
  LoopVectorBody = VecBody;
2956
17.0k
  LoopScalarBody = OldBasicBlock;
2957
17.0k
2958
17.0k
  Optional<MDNode *> VectorizedLoopID =
2959
17.0k
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
2960
17.0k
                                      LLVMLoopVectorizeFollowupVectorized});
2961
17.0k
  if (VectorizedLoopID.hasValue()) {
2962
1
    Lp->setLoopID(VectorizedLoopID.getValue());
2963
1
2964
1
    // Do not setAlreadyVectorized if loop attributes have been defined
2965
1
    // explicitly.
2966
1
    return LoopVectorPreHeader;
2967
1
  }
2968
17.0k
2969
17.0k
  // Keep all loop hints from the original loop on the vector loop (we'll
2970
17.0k
  // replace the vectorizer-specific hints below).
2971
17.0k
  if (MDNode *LID = OrigLoop->getLoopID())
2972
1.06k
    Lp->setLoopID(LID);
2973
17.0k
2974
17.0k
  LoopVectorizeHints Hints(Lp, true, *ORE);
2975
17.0k
  Hints.setAlreadyVectorized();
2976
17.0k
2977
17.0k
  return LoopVectorPreHeader;
2978
17.0k
}
2979
2980
// Fix up external users of the induction variable. At this point, we are
2981
// in LCSSA form, with all external PHIs that use the IV having one input value,
2982
// coming from the remainder loop. We need those PHIs to also have a correct
2983
// value for the IV when arriving directly from the middle block.
2984
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2985
                                       const InductionDescriptor &II,
2986
                                       Value *CountRoundDown, Value *EndValue,
2987
20.3k
                                       BasicBlock *MiddleBlock) {
2988
20.3k
  // There are two kinds of external IV usages - those that use the value
2989
20.3k
  // computed in the last iteration (the PHI) and those that use the penultimate
2990
20.3k
  // value (the value that feeds into the phi from the loop latch).
2991
20.3k
  // We allow both, but they, obviously, have different values.
2992
20.3k
2993
20.3k
  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
2994
20.3k
2995
20.3k
  DenseMap<Value *, Value *> MissingVals;
2996
20.3k
2997
20.3k
  // An external user of the last iteration's value should see the value that
2998
20.3k
  // the remainder loop uses to initialize its own IV.
2999
20.3k
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3000
45.4k
  for (User *U : PostInc->users()) {
3001
45.4k
    Instruction *UI = cast<Instruction>(U);
3002
45.4k
    if (!OrigLoop->contains(UI)) {
3003
1.16k
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
3004
1.16k
      MissingVals[UI] = EndValue;
3005
1.16k
    }
3006
45.4k
  }
3007
20.3k
3008
20.3k
  // An external user of the penultimate value needs to see EndValue - Step.
3009
20.3k
  // The simplest way to get this is to recompute it from the constituent SCEVs,
3010
20.3k
  // that is Start + (Step * (CRD - 1)).
3011
46.5k
  for (User *U : OrigPhi->users()) {
3012
46.5k
    auto *UI = cast<Instruction>(U);
3013
46.5k
    if (!OrigLoop->contains(UI)) {
3014
37
      const DataLayout &DL =
3015
37
          OrigLoop->getHeader()->getModule()->getDataLayout();
3016
37
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
3017
37
3018
37
      IRBuilder<> B(MiddleBlock->getTerminator());
3019
37
      Value *CountMinusOne = B.CreateSub(
3020
37
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3021
37
      Value *CMO =
3022
37
          !II.getStep()->getType()->isIntegerTy()
3023
37
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3024
1
                             II.getStep()->getType())
3025
37
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3026
37
      CMO->setName("cast.cmo");
3027
37
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3028
37
      Escape->setName("ind.escape");
3029
37
      MissingVals[UI] = Escape;
3030
37
    }
3031
46.5k
  }
3032
20.3k
3033
20.3k
  for (auto &I : MissingVals) {
3034
1.19k
    PHINode *PHI = cast<PHINode>(I.first);
3035
1.19k
    // One corner case we have to handle is two IVs "chasing" each-other,
3036
1.19k
    // that is %IV2 = phi [...], [ %IV1, %latch ]
3037
1.19k
    // In this case, if IV1 has an external use, we need to avoid adding both
3038
1.19k
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
3039
1.19k
    // don't already have an incoming value for the middle block.
3040
1.19k
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3041
1.19k
      PHI->addIncoming(I.second, MiddleBlock);
3042
1.19k
  }
3043
20.3k
}
3044
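Worked example of the two escape values computed above, assuming an integer IV with Start = 0, Step = 2 and a vector trip count (CRD) of 96: external users of the post-increment value see Start + Step * CRD, while users of the phi itself see the penultimate value Start + Step * (CRD - 1). Illustration only:

#include <cassert>

int main() {
  int Start = 0, Step = 2, CRD = 96;
  int EndValue = Start + Step * CRD;        // users of the post-increment
                                            // value: 192
  int Escape = Start + Step * (CRD - 1);    // "ind.escape", users of the
                                            // phi itself: 190
  assert(EndValue == 192 && Escape == 190);
  return 0;
}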
3045
namespace {
3046
3047
struct CSEDenseMapInfo {
3048
509k
  static bool canHandle(const Instruction *I) {
3049
509k
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3050
509k
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3051
509k
  }
3052
3053
3.40M
  static inline Instruction *getEmptyKey() {
3054
3.40M
    return DenseMapInfo<Instruction *>::getEmptyKey();
3055
3.40M
  }
3056
3057
907k
  static inline Instruction *getTombstoneKey() {
3058
907k
    return DenseMapInfo<Instruction *>::getTombstoneKey();
3059
907k
  }
3060
3061
275k
  static unsigned getHashValue(const Instruction *I) {
3062
275k
    assert(canHandle(I) && "Unknown instruction!");
3063
275k
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3064
275k
                                                           I->value_op_end()));
3065
275k
  }
3066
3067
2.13M
  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3068
2.13M
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3069
2.13M
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
3070
2.07M
      return LHS == RHS;
3071
57.2k
    return LHS->isIdenticalTo(RHS);
3072
57.2k
  }
3073
};
3074
3075
} // end anonymous namespace
3076
3077
/// Perform cse of induction variable instructions.
3078
17.0k
static void cse(BasicBlock *BB) {
3079
17.0k
  // Perform simple cse.
3080
17.0k
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3081
526k
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3082
509k
    Instruction *In = &*I++;
3083
509k
3084
509k
    if (!CSEDenseMapInfo::canHandle(In))
3085
395k
      continue;
3086
113k
3087
113k
    // Check if we can replace this instruction with any of the
3088
113k
    // visited instructions.
3089
113k
    if (Instruction *V = CSEMap.lookup(In)) {
3090
1.49k
      In->replaceAllUsesWith(V);
3091
1.49k
      In->eraseFromParent();
3092
1.49k
      continue;
3093
1.49k
    }
3094
112k
3095
112k
    CSEMap[In] = In;
3096
112k
  }
3097
17.0k
}
3098
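The cse helper above is a single-pass, single-block value numbering over the handled instruction kinds. A rough STL-based sketch of the same idea (cseBlock is a hypothetical name; a string key stands in for CSEDenseMapInfo's (opcode, operands) hash and Instruction::isIdenticalTo):

#include <string>
#include <unordered_set>
#include <vector>

// Later duplicates are dropped in favour of the first occurrence, just as
// the loop above replaces their uses and erases them.
std::vector<std::string> cseBlock(const std::vector<std::string> &Insts) {
  std::unordered_set<std::string> Seen;
  std::vector<std::string> Out;
  for (const std::string &I : Insts) {
    if (!Seen.insert(I).second)
      continue;                      // duplicate: reuse the earlier value
    Out.push_back(I);
  }
  return Out;
}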
3099
unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3100
                                                       unsigned VF,
3101
1.93k
                                                       bool &NeedToScalarize) {
3102
1.93k
  Function *F = CI->getCalledFunction();
3103
1.93k
  StringRef FnName = CI->getCalledFunction()->getName();
3104
1.93k
  Type *ScalarRetTy = CI->getType();
3105
1.93k
  SmallVector<Type *, 4> Tys, ScalarTys;
3106
1.93k
  for (auto &ArgOp : CI->arg_operands())
3107
2.17k
    ScalarTys.push_back(ArgOp->getType());
3108
1.93k
3109
1.93k
  // Estimate cost of scalarized vector call. The source operands are assumed
3110
1.93k
  // to be vectors, so we need to extract individual elements from there,
3111
1.93k
  // execute VF scalar calls, and then gather the result into the vector return
3112
1.93k
  // value.
3113
1.93k
  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3114
1.93k
  if (VF == 1)
3115
363
    return ScalarCallCost;
3116
1.57k
3117
1.57k
  // Compute corresponding vector type for return value and arguments.
3118
1.57k
  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3119
1.57k
  for (Type *ScalarTy : ScalarTys)
3120
1.77k
    Tys.push_back(ToVectorTy(ScalarTy, VF));
3121
1.57k
3122
1.57k
  // Compute costs of unpacking argument values for the scalar calls and
3123
1.57k
  // packing the return values to a vector.
3124
1.57k
  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3125
1.57k
3126
1.57k
  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3127
1.57k
3128
1.57k
  // If we can't emit a vector call for this function, then the currently found
3129
1.57k
  // cost is the cost we need to return.
3130
1.57k
  NeedToScalarize = true;
3131
1.57k
  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3132
1.20k
    return Cost;
3133
376
3134
376
  // If the corresponding vector cost is cheaper, return its cost.
3135
376
  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3136
376
  if (VectorCallCost < Cost) {
3137
376
    NeedToScalarize = false;
3138
376
    return VectorCallCost;
3139
376
  }
3140
0
  return Cost;
3141
0
}
3142
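The decision above compares a scalarized cost, ScalarCallCost * VF plus the extract/insert overhead, against the cost of a vector library call when one exists. A sketch with made-up cost numbers (vectorCallCost here is a hypothetical standalone function, not the member above):

#include <cassert>

// NeedToScalarize mirrors the output parameter: true unless a cheaper
// vector variant exists for this VF.
unsigned vectorCallCost(unsigned ScalarCallCost, unsigned VF,
                        unsigned ScalarizationCost, bool HasVectorVariant,
                        unsigned VectorVariantCost, bool &NeedToScalarize) {
  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
  NeedToScalarize = true;
  if (HasVectorVariant && VectorVariantCost < Cost) {
    NeedToScalarize = false;
    return VectorVariantCost;
  }
  return Cost;
}

int main() {
  bool NeedToScalarize = true;
  // 4 scalar calls of cost 10 plus overhead 12 = 52; a vector call of
  // cost 20 wins, so the call is not scalarized.
  assert(vectorCallCost(10, 4, 12, true, 20, NeedToScalarize) == 20);
  assert(!NeedToScalarize);
  return 0;
}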
3143
unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3144
1.57k
                                                            unsigned VF) {
3145
1.57k
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3146
1.57k
  assert(ID && "Expected intrinsic call!");
3147
1.57k
3148
1.57k
  FastMathFlags FMF;
3149
1.57k
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3150
1.30k
    FMF = FPMO->getFastMathFlags();
3151
1.57k
3152
1.57k
  SmallVector<Value *, 4> Operands(CI->arg_operands());
3153
1.57k
  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3154
1.57k
}
3155
3156
334
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3157
334
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3158
334
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3159
334
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3160
334
}
3161
275
static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3162
275
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3163
275
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3164
275
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3165
275
}
3166
3167
15.3k
void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3168
15.3k
  // For every instruction `I` in MinBWs, truncate the operands, create a
3169
15.3k
  // truncated version of `I` and reextend its result. InstCombine runs
3170
15.3k
  // later and will remove any ext/trunc pairs.
3171
15.3k
  SmallPtrSet<Value *, 4> Erased;
3172
15.3k
  for (const auto &KV : Cost->getMinimalBitwidths()) {
3173
186
    // If the value wasn't vectorized, we must maintain the original scalar
3174
186
    // type. The absence of the value from VectorLoopValueMap indicates that it
3175
186
    // wasn't vectorized.
3176
186
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3177
2
      continue;
3178
420
    for (unsigned Part = 0; Part < UF; ++Part) {
3179
236
      Value *I = getOrCreateVectorValue(KV.first, Part);
3180
236
      if (Erased.find(I) != Erased.end() || I->use_empty() ||
3181
236
          !isa<Instruction>(I))
3182
6
        continue;
3183
230
      Type *OriginalTy = I->getType();
3184
230
      Type *ScalarTruncatedTy =
3185
230
          IntegerType::get(OriginalTy->getContext(), KV.second);
3186
230
      Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3187
230
                                          OriginalTy->getVectorNumElements());
3188
230
      if (TruncatedTy == OriginalTy)
3189
17
        continue;
3190
213
3191
213
      IRBuilder<> B(cast<Instruction>(I));
3192
240
      auto ShrinkOperand = [&](Value *V) -> Value * {
3193
240
        if (auto *ZI = dyn_cast<ZExtInst>(V))
3194
63
          if (ZI->getSrcTy() == TruncatedTy)
3195
5
            return ZI->getOperand(0);
3196
235
        return B.CreateZExtOrTrunc(V, TruncatedTy);
3197
235
      };
3198
213
3199
213
      // The actual instruction modification depends on the instruction type,
3200
213
      // unfortunately.
3201
213
      Value *NewI = nullptr;
3202
213
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3203
112
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3204
112
                             ShrinkOperand(BO->getOperand(1)));
3205
112
3206
112
        // Any wrapping introduced by shrinking this operation shouldn't be
3207
112
        // considered undefined behavior. So, we can't unconditionally copy
3208
112
        // arithmetic wrapping flags to NewI.
3209
112
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3210
112
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3211
0
        NewI =
3212
0
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3213
0
                         ShrinkOperand(CI->getOperand(1)));
3214
101
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3215
0
        NewI = B.CreateSelect(SI->getCondition(),
3216
0
                              ShrinkOperand(SI->getTrueValue()),
3217
0
                              ShrinkOperand(SI->getFalseValue()));
3218
101
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
3219
75
        switch (CI->getOpcode()) {
3220
75
        default:
3221
0
          llvm_unreachable("Unhandled cast!");
3222
75
        case Instruction::Trunc:
3223
16
          NewI = ShrinkOperand(CI->getOperand(0));
3224
16
          break;
3225
75
        case Instruction::SExt:
3226
1
          NewI = B.CreateSExtOrTrunc(
3227
1
              CI->getOperand(0),
3228
1
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
3229
1
          break;
3230
75
        case Instruction::ZExt:
3231
58
          NewI = B.CreateZExtOrTrunc(
3232
58
              CI->getOperand(0),
3233
58
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
3234
58
          break;
3235
26
        }
3236
26
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3237
24
        auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3238
24
        auto *O0 = B.CreateZExtOrTrunc(
3239
24
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3240
24
        auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3241
24
        auto *O1 = B.CreateZExtOrTrunc(
3242
24
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3243
24
3244
24
        NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3245
24
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3246
1
        // Don't do anything with the operands, just extend the result.
3247
1
        continue;
3248
1
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3249
1
        auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3250
1
        auto *O0 = B.CreateZExtOrTrunc(
3251
1
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3252
1
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3253
1
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3254
1
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3255
0
        auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3256
0
        auto *O0 = B.CreateZExtOrTrunc(
3257
0
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3258
0
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3259
0
      } else {
3260
0
        // If we don't know what to do, be conservative and don't do anything.
3261
0
        continue;
3262
0
      }
3263
212
3264
212
      // Lastly, extend the result.
3265
212
      NewI->takeName(cast<Instruction>(I));
3266
212
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3267
212
      I->replaceAllUsesWith(Res);
3268
212
      cast<Instruction>(I)->eraseFromParent();
3269
212
      Erased.insert(I);
3270
212
      VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3271
212
    }
3272
184
  }
3273
15.3k
3274
15.3k
  // We'll have created a bunch of ZExts that are now parentless. Clean up.
3275
15.3k
  for (const auto &KV : Cost->getMinimalBitwidths()) {
3276
186
    // If the value wasn't vectorized, we must maintain the original scalar
3277
186
    // type. The absence of the value from VectorLoopValueMap indicates that it
3278
186
    // wasn't vectorized.
3279
186
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3280
2
      continue;
3281
420
    for (unsigned Part = 0; Part < UF; ++Part) {
3282
236
      Value *I = getOrCreateVectorValue(KV.first, Part);
3283
236
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3284
236
      if (Inst && Inst->use_empty()) {
3285
3
        Value *NewI = Inst->getOperand(0);
3286
3
        Inst->eraseFromParent();
3287
3
        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3288
3
      }
3289
236
    }
3290
184
  }
3291
15.3k
}
3292
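truncateToMinimalBitwidths above rewrites a wide vector operation as truncate, narrow op, zero-extend, leaving InstCombine to remove redundant ext/trunc pairs. A scalar analogue, assuming an i32 add whose inputs and result are known to fit in 8 bits (addInMinimalBitwidth is a hypothetical name):

#include <cassert>
#include <cstdint>

// The i32 add is performed in 8 bits and zero-extended back to i32. Wrap
// flags are deliberately not carried over, matching the comment above about
// not copying arithmetic wrapping flags.
uint32_t addInMinimalBitwidth(uint32_t A, uint32_t B) {
  uint8_t NarrowA = static_cast<uint8_t>(A);   // ZExtOrTrunc of operand 0
  uint8_t NarrowB = static_cast<uint8_t>(B);   // ZExtOrTrunc of operand 1
  uint8_t NarrowSum = static_cast<uint8_t>(NarrowA + NarrowB); // narrow op
  return static_cast<uint32_t>(NarrowSum);     // zext the result back
}

int main() {
  assert(addInMinimalBitwidth(100, 27) == 127);
  return 0;
}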
3293
17.0k
void InnerLoopVectorizer::fixVectorizedLoop() {
3294
17.0k
  // Insert truncates and extends for any truncated instructions as hints to
3295
17.0k
  // InstCombine.
3296
17.0k
  if (VF > 1)
3297
15.3k
    truncateToMinimalBitwidths();
3298
17.0k
3299
17.0k
  // Fix widened non-induction PHIs by setting up the PHI operands.
3300
17.0k
  if (OrigPHIsToFix.size()) {
3301
7
    assert(EnableVPlanNativePath &&
3302
7
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3303
7
    fixNonInductionPHIs();
3304
7
  }
3305
17.0k
3306
17.0k
  // At this point every instruction in the original loop is widened to a
3307
17.0k
  // vector form. Now we need to fix the recurrences in the loop. These PHI
3308
17.0k
  // nodes are currently empty because we did not want to introduce cycles.
3309
17.0k
  // This is the second stage of vectorizing recurrences.
3310
17.0k
  fixCrossIterationPHIs();
3311
17.0k
3312
17.0k
  // Update the dominator tree.
3313
17.0k
  //
3314
17.0k
  // FIXME: After creating the structure of the new loop, the dominator tree is
3315
17.0k
  //        no longer up-to-date, and it remains that way until we update it
3316
17.0k
  //        here. An out-of-date dominator tree is problematic for SCEV,
3317
17.0k
  //        because SCEVExpander uses it to guide code generation. The
3318
17.0k
  //        vectorizer use SCEVExpanders in several places. Instead, we should
3319
17.0k
  //        keep the dominator tree up-to-date as we go.
3320
17.0k
  updateAnalysis();
3321
17.0k
3322
17.0k
  // Fix-up external users of the induction variables.
3323
17.0k
  for (auto &Entry : *Legal->getInductionVars())
3324
20.3k
    fixupIVUsers(Entry.first, Entry.second,
3325
20.3k
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3326
20.3k
                 IVEndValues[Entry.first], LoopMiddleBlock);
3327
17.0k
3328
17.0k
  fixLCSSAPHIs();
3329
17.0k
  for (Instruction *PI : PredicatedInstructions)
3330
948
    sinkScalarOperands(&*PI);
3331
17.0k
3332
17.0k
  // Remove redundant induction instructions.
3333
17.0k
  cse(LoopVectorBody);
3334
17.0k
}
3335
3336
17.0k
void InnerLoopVectorizer::fixCrossIterationPHIs() {
3337
17.0k
  // In order to support recurrences we need to be able to vectorize Phi nodes.
3338
17.0k
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3339
17.0k
  // stage #2: We now need to fix the recurrences by adding incoming edges to
3340
17.0k
  // the currently empty PHI nodes. At this point every instruction in the
3341
17.0k
  // original loop is widened to a vector form so we can use them to construct
3342
17.0k
  // the incoming edges.
3343
21.7k
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3344
21.7k
    // Handle first-order recurrences and reductions that need to be fixed.
3345
21.7k
    if (Legal->isFirstOrderRecurrence(&Phi))
3346
90
      fixFirstOrderRecurrence(&Phi);
3347
21.6k
    else if (Legal->isReductionVariable(&Phi))
3348
1.36k
      fixReduction(&Phi);
3349
21.7k
  }
3350
17.0k
}
3351
3352
90
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3353
90
  // This is the second phase of vectorizing first-order recurrences. An
3354
90
  // overview of the transformation is described below. Suppose we have the
3355
90
  // following loop.
3356
90
  //
3357
90
  //   for (int i = 0; i < n; ++i)
3358
90
  //     b[i] = a[i] - a[i - 1];
3359
90
  //
3360
90
  // There is a first-order recurrence on "a". For this loop, the shorthand
3361
90
  // scalar IR looks like:
3362
90
  //
3363
90
  //   scalar.ph:
3364
90
  //     s_init = a[-1]
3365
90
  //     br scalar.body
3366
90
  //
3367
90
  //   scalar.body:
3368
90
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
3369
90
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3370
90
  //     s2 = a[i]
3371
90
  //     b[i] = s2 - s1
3372
90
  //     br cond, scalar.body, ...
3373
90
  //
3374
90
  // In this example, s1 is a recurrence because its value depends on the
3375
90
  // previous iteration. In the first phase of vectorization, we created a
3376
90
  // temporary value for s1. We now complete the vectorization and produce the
3377
90
  // shorthand vector IR shown below (for VF = 4, UF = 1).
3378
90
  //
3379
90
  //   vector.ph:
3380
90
  //     v_init = vector(..., ..., ..., a[-1])
3381
90
  //     br vector.body
3382
90
  //
3383
90
  //   vector.body
3384
90
  //     i = phi [0, vector.ph], [i+4, vector.body]
3385
90
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3386
90
  //     v2 = a[i, i+1, i+2, i+3];
3387
90
  //     v3 = vector(v1(3), v2(0, 1, 2))
3388
90
  //     b[i, i+1, i+2, i+3] = v2 - v3
3389
90
  //     br cond, vector.body, middle.block
3390
90
  //
3391
90
  //   middle.block:
3392
90
  //     x = v2(3)
3393
90
  //     br scalar.ph
3394
90
  //
3395
90
  //   scalar.ph:
3396
90
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
3397
90
  //     br scalar.body
3398
90
  //
3399
90
  // After execution completes the vector loop, we extract the next value of
3400
90
  // the recurrence (x) to use as the initial value in the scalar loop.
3401
90
3402
90
  // Get the original loop preheader and single loop latch.
3403
90
  auto *Preheader = OrigLoop->getLoopPreheader();
3404
90
  auto *Latch = OrigLoop->getLoopLatch();
3405
90
3406
90
  // Get the initial and previous values of the scalar recurrence.
3407
90
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3408
90
  auto *Previous = Phi->getIncomingValueForBlock(Latch);
3409
90
3410
90
  // Create a vector from the initial value.
3411
90
  auto *VectorInit = ScalarInit;
3412
90
  if (VF > 1) {
3413
79
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3414
79
    VectorInit = Builder.CreateInsertElement(
3415
79
        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3416
79
        Builder.getInt32(VF - 1), "vector.recur.init");
3417
79
  }
3418
90
3419
90
  // We constructed a temporary phi node in the first phase of vectorization.
3420
90
  // This phi node will eventually be deleted.
3421
90
  Builder.SetInsertPoint(
3422
90
      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3423
90
3424
90
  // Create a phi node for the new recurrence. The current value will either be
3425
90
  // the initial value inserted into a vector or loop-varying vector value.
3426
90
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3427
90
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3428
90
3429
90
  // Get the vectorized previous value of the last part UF - 1. It appears last
3430
90
  // among all unrolled iterations, due to the order of their construction.
3431
90
  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3432
90
3433
90
  // Set the insertion point after the previous value if it is an instruction.
3434
90
  // Note that the previous value may have been constant-folded so it is not
3435
90
  // guaranteed to be an instruction in the vector loop. Also, if the previous
3436
90
  // value is a phi node, we should insert after all the phi nodes to avoid
3437
90
  // breaking basic block verification.
3438
90
  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3439
90
      isa<PHINode>(PreviousLastPart))
3440
7
    Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3441
83
  else
3442
83
    Builder.SetInsertPoint(
3443
83
        &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3444
90
3445
90
  // We will construct a vector for the recurrence by combining the values for
3446
90
  // the current and previous iterations. This is the required shuffle mask.
3447
90
  SmallVector<Constant *, 8> ShuffleMask(VF);
3448
90
  ShuffleMask[0] = Builder.getInt32(VF - 1);
3449
323
  for (unsigned I = 1; I < VF; ++I)
3450
233
    ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3451
90
3452
90
  // The vector from which to take the initial value for the current iteration
3453
90
  // (actual or unrolled). Initially, this is the vector phi node.
3454
90
  Value *Incoming = VecPhi;
3455
90
3456
90
  // Shuffle the current and previous vector and update the vector parts.
3457
221
  for (unsigned Part = 0; Part < UF; ++Part) {
3458
131
    Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3459
131
    Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3460
131
    auto *Shuffle =
3461
131
        VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3462
109
                                             ConstantVector::get(ShuffleMask))
3463
131
               : Incoming;
3464
131
    PhiPart->replaceAllUsesWith(Shuffle);
3465
131
    cast<Instruction>(PhiPart)->eraseFromParent();
3466
131
    VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3467
131
    Incoming = PreviousPart;
3468
131
  }
3469
90
3470
90
  // Fix the latch value of the new recurrence in the vector loop.
3471
90
  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3472
90
3473
90
  // Extract the last vector element in the middle block. This will be the
3474
90
  // initial value for the recurrence when jumping to the scalar loop.
3475
90
  auto *ExtractForScalar = Incoming;
3476
90
  if (VF > 1) {
3477
79
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3478
79
    ExtractForScalar = Builder.CreateExtractElement(
3479
79
        ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3480
79
  }
3481
90
  // Extract the second last element in the middle block if the
3482
90
  // Phi is used outside the loop. We need to extract the phi itself
3483
90
  // and not the last element (the phi update in the current iteration). This
3484
90
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
3485
90
  // when the scalar loop is not run at all.
3486
90
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
3487
90
  if (VF > 1)
3488
79
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3489
79
        Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3490
11
  // When loop is unrolled without vectorizing, initialize
3491
11
  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
3492
11
  // `Incoming`. This is analogous to the vectorized case above: extracting the
3493
11
  // second last element when VF > 1.
3494
11
  else if (UF > 1)
3495
11
    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3496
90
3497
90
  // Fix the initial value of the original recurrence in the scalar loop.
3498
90
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3499
90
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3500
205
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
3501
205
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3502
205
    Start->addIncoming(Incoming, BB);
3503
205
  }
3504
90
3505
90
  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3506
90
  Phi->setName("scalar.recur");
3507
90
3508
90
  // Finally, fix users of the recurrence outside the loop. The users will need
3509
90
  // either the last value of the scalar recurrence or the last value of the
3510
90
  // vector recurrence we extracted in the middle block. Since the loop is in
3511
90
  // LCSSA form, we just need to find all the phi nodes for the original scalar
3512
90
  // recurrence in the exit block, and then add an edge for the middle block.
3513
90
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3514
49
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
3515
9
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3516
9
    }
3517
49
  }
3518
90
}
3519
3520
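Editor's note: the long comment inside fixFirstOrderRecurrence() describes the transformation abstractly; the sketch below hand-vectorizes the same b[i] = a[i] - a[i-1] loop for VF = 4, UF = 1 using a plain std::array in place of a SIMD register. It is illustrative only (diffVectorized and its parameters are hypothetical), not generated code, and it assumes b is at least as long as a.

#include <array>
#include <vector>

// 'recur' plays the role of the vector.recur PHI: it always holds the last
// four 'a' values seen, so lane 3 of the previous iteration is a[i - 1].
void diffVectorized(const std::vector<int> &a, std::vector<int> &b, int aMinus1) {
  std::array<int, 4> recur = {0, 0, 0, aMinus1}; // vector.recur.init
  size_t i = 0;
  for (; i + 4 <= a.size(); i += 4) {
    std::array<int, 4> cur = {a[i], a[i + 1], a[i + 2], a[i + 3]}; // v2
    // v3 = shuffle(recur, cur, <3, 4, 5, 6>): the previous element per lane.
    std::array<int, 4> prev = {recur[3], cur[0], cur[1], cur[2]};
    for (int l = 0; l < 4; ++l)
      b[i + l] = cur[l] - prev[l];
    recur = cur;                                 // latch value of the PHI
  }
  int scalarInit = (i == 0) ? aMinus1 : recur[3]; // vector.recur.extract
  for (; i < a.size(); ++i) {                     // scalar remainder loop
    b[i] = a[i] - scalarInit;
    scalarInit = a[i];
  }
}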
1.36k
void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3521
1.36k
  Constant *Zero = Builder.getInt32(0);
3522
1.36k
3523
1.36k
  // Get its reduction variable descriptor.
3524
1.36k
  assert(Legal->isReductionVariable(Phi) &&
3525
1.36k
         "Unable to find the reduction variable");
3526
1.36k
  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3527
1.36k
3528
1.36k
  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3529
1.36k
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3530
1.36k
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3531
1.36k
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3532
1.36k
    RdxDesc.getMinMaxRecurrenceKind();
3533
1.36k
  setDebugLocFromInst(Builder, ReductionStartValue);
3534
1.36k
3535
1.36k
  // We need to generate a reduction vector from the incoming scalar.
3536
1.36k
  // To do so, we need to generate the 'identity' vector and override
3537
1.36k
  // one of the elements with the incoming scalar reduction. We need
3538
1.36k
  // to do it in the vector-loop preheader.
3539
1.36k
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3540
1.36k
3541
1.36k
  // This is the vector-clone of the value that leaves the loop.
3542
1.36k
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3543
1.36k
3544
1.36k
  // Find the reduction identity variable. Zero for addition, or, xor,
3545
1.36k
  // one for multiplication, -1 for And.
3546
1.36k
  Value *Identity;
3547
1.36k
  Value *VectorStart;
3548
1.36k
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3549
1.36k
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
3550
107
    // MinMax reductions have the start value as their identity.
3551
107
    if (VF == 1) {
3552
44
      VectorStart = Identity = ReductionStartValue;
3553
63
    } else {
3554
63
      VectorStart = Identity =
3555
63
        Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3556
63
    }
3557
1.25k
  } else {
3558
1.25k
    // Handle other reduction kinds:
3559
1.25k
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3560
1.25k
        RK, VecTy->getScalarType());
3561
1.25k
    if (VF == 1) {
3562
261
      Identity = Iden;
3563
261
      // This vector is the Identity vector where the first element is the
3564
261
      // incoming scalar reduction.
3565
261
      VectorStart = ReductionStartValue;
3566
994
    } else {
3567
994
      Identity = ConstantVector::getSplat(VF, Iden);
3568
994
3569
994
      // This vector is the Identity vector where the first element is the
3570
994
      // incoming scalar reduction.
3571
994
      VectorStart =
3572
994
        Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3573
994
    }
3574
1.25k
  }
3575
1.36k
3576
1.36k
  // Fix the vector-loop phi.
3577
1.36k
3578
1.36k
  // Reductions do not have to start at zero. They can start with
3579
1.36k
  // any loop invariant values.
3580
1.36k
  BasicBlock *Latch = OrigLoop->getLoopLatch();
3581
1.36k
  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3582
3.98k
  for (unsigned Part = 0; Part < UF; ++Part) {
3583
2.62k
    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3584
2.62k
    Value *Val = getOrCreateVectorValue(LoopVal, Part);
3585
2.62k
    // Make sure to add the reduction start value only to the
3586
2.62k
    // first unroll part.
3587
2.62k
    Value *StartVal = (Part == 0) ? VectorStart : Identity;
3588
2.62k
    cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3589
2.62k
    cast<PHINode>(VecRdxPhi)
3590
2.62k
      ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3591
2.62k
  }
3592
1.36k
3593
1.36k
  // Before each round, move the insertion point right between
3594
1.36k
  // the PHIs and the values we are going to write.
3595
1.36k
  // This allows us to write both PHINodes and the extractelement
3596
1.36k
  // instructions.
3597
1.36k
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3598
1.36k
3599
1.36k
  setDebugLocFromInst(Builder, LoopExitInst);
3600
1.36k
3601
1.36k
  // If the vector reduction can be performed in a smaller type, we truncate
3602
1.36k
  // then extend the loop exit value to enable InstCombine to evaluate the
3603
1.36k
  // entire expression in the smaller type.
3604
1.36k
  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3605
6
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3606
6
    Builder.SetInsertPoint(
3607
6
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3608
6
    VectorParts RdxParts(UF);
3609
13
    for (unsigned Part = 0; Part < UF; ++Part) {
3610
7
      RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3611
7
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3612
7
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3613
7
                                        : Builder.CreateZExt(Trunc, VecTy);
3614
7
      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3615
21
           UI != RdxParts[Part]->user_end();)
3616
14
        if (*UI != Trunc) {
3617
7
          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3618
7
          RdxParts[Part] = Extnd;
3619
7
        } else {
3620
7
          ++UI;
3621
7
        }
3622
7
    }
3623
6
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3624
13
    for (unsigned Part = 0; Part < UF; ++Part) {
3625
7
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3626
7
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3627
7
    }
3628
6
  }
3629
1.36k
3630
1.36k
  // Reduce all of the unrolled parts into a single vector.
3631
1.36k
  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3632
1.36k
  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3633
1.36k
3634
1.36k
  // The middle block terminator has already been assigned a DebugLoc here (the
3635
1.36k
  // OrigLoop's single latch terminator). We want the whole middle block to
3636
1.36k
  // appear to execute on this line because: (a) it is all compiler generated,
3637
1.36k
  // (b) these instructions are always executed after evaluating the latch
3638
1.36k
  // conditional branch, and (c) other passes may add new predecessors which
3639
1.36k
  // terminate on this line. This is the easiest way to ensure we don't
3640
1.36k
  // accidentally cause an extra step back into the loop while debugging.
3641
1.36k
  setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3642
2.62k
  for (unsigned Part = 1; Part < UF; ++Part) {
3643
1.26k
    Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3644
1.26k
    if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3645
1.19k
      // Floating point operations had to be 'fast' to enable the reduction.
3646
1.19k
      ReducedPartRdx = addFastMathFlag(
3647
1.19k
          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3648
1.19k
                              ReducedPartRdx, "bin.rdx"),
3649
1.19k
          RdxDesc.getFastMathFlags());
3650
73
    else
3651
73
      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3652
73
                                      RdxPart);
3653
1.26k
  }
3654
1.36k
3655
1.36k
  if (VF > 1) {
3656
1.05k
    bool NoNaN = Legal->hasFunNoNaNAttr();
3657
1.05k
    ReducedPartRdx =
3658
1.05k
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3659
1.05k
    // If the reduction can be performed in a smaller type, we need to extend
3660
1.05k
    // the reduction to the wider type before we branch to the original loop.
3661
1.05k
    if (Phi->getType() != RdxDesc.getRecurrenceType())
3662
6
      ReducedPartRdx =
3663
6
        RdxDesc.isSigned()
3664
6
        ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3665
6
        : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3666
1.05k
  }
3667
1.36k
3668
1.36k
  // Create a phi node that merges control-flow from the backedge-taken check
3669
1.36k
  // block and the middle block.
3670
1.36k
  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3671
1.36k
                                        LoopScalarPreHeader->getTerminator());
3672
2.76k
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3673
1.40k
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3674
1.36k
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3675
1.36k
3676
1.36k
  // Now, we need to fix the users of the reduction variable
3677
1.36k
  // inside and outside of the scalar remainder loop.
3678
1.36k
  // We know that the loop is in LCSSA form. We need to update the
3679
1.36k
  // PHI nodes in the exit blocks.
3680
1.70k
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3681
1.70k
    // All PHINodes need to have a single entry edge, or two if
3682
1.70k
    // we already fixed them.
3683
1.70k
    assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3684
1.70k
3685
1.70k
    // We found a reduction value exit-PHI. Update it with the
3686
1.70k
    // incoming bypass edge.
3687
1.70k
    if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3688
1.36k
      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3689
1.70k
  } // end of the LCSSA phi scan.
3690
1.36k
3691
1.36k
    // Fix the scalar loop reduction variable with the incoming reduction sum
3692
1.36k
    // from the vector body and from the backedge value.
3693
1.36k
  int IncomingEdgeBlockIdx =
3694
1.36k
    Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3695
1.36k
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3696
1.36k
  // Pick the other block.
3697
1.36k
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3698
1.36k
  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3699
1.36k
  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3700
1.36k
}
3701
3702
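Editor's note: to make the identity/start-value bookkeeping in fixReduction() concrete, here is a hand-unrolled sketch of an integer add reduction for VF = 4, UF = 2. It only mirrors the shape the vectorizer produces (start value inserted into lane 0 of part 0, pure identity for later parts, parts merged in the middle block, then a lane-wise fold); sumReduction and its parameters are hypothetical names, not LLVM API.

#include <array>
#include <vector>

int sumReduction(const std::vector<int> &a, int startValue) {
  std::array<int, 4> part0 = {startValue, 0, 0, 0}; // VectorStart (part 0)
  std::array<int, 4> part1 = {0, 0, 0, 0};          // Identity (part 1)
  size_t i = 0;
  for (; i + 8 <= a.size(); i += 8) {
    for (int l = 0; l < 4; ++l) part0[l] += a[i + l];
    for (int l = 0; l < 4; ++l) part1[l] += a[i + 4 + l];
  }
  // Middle block: combine the unrolled parts, then reduce across lanes.
  std::array<int, 4> rdx;
  for (int l = 0; l < 4; ++l) rdx[l] = part0[l] + part1[l]; // bin.rdx
  int reduced = rdx[0] + rdx[1] + rdx[2] + rdx[3];          // horizontal reduce
  for (; i < a.size(); ++i)                                 // scalar remainder
    reduced += a[i];
  return reduced;
}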
17.0k
void InnerLoopVectorizer::fixLCSSAPHIs() {
3703
17.0k
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3704
2.58k
    if (LCSSAPhi.getNumIncomingValues() == 1) {
3705
15
      auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3706
15
      // Non-instruction incoming values will have only one value.
3707
15
      unsigned LastLane = 0;
3708
15
      if (isa<Instruction>(IncomingValue)) 
3709
13
          LastLane = Cost->isUniformAfterVectorization(
3710
13
                         cast<Instruction>(IncomingValue), VF)
3711
13
                         ? 0
3712
13
                         : VF - 1;
3713
15
      // Can be a loop invariant incoming value or the last scalar value to be
3714
15
      // extracted from the vectorized loop.
3715
15
      Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3716
15
      Value *lastIncomingValue =
3717
15
          getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3718
15
      LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3719
15
    }
3720
2.58k
  }
3721
17.0k
}
3722
3723
948
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3724
948
  // The basic block and loop containing the predicated instruction.
3725
948
  auto *PredBB = PredInst->getParent();
3726
948
  auto *VectorLoop = LI->getLoopFor(PredBB);
3727
948
3728
948
  // Initialize a worklist with the operands of the predicated instruction.
3729
948
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3730
948
3731
948
  // Holds instructions that we need to analyze again. An instruction may be
3732
948
  // reanalyzed if we don't yet know if we can sink it or not.
3733
948
  SmallVector<Instruction *, 8> InstsToReanalyze;
3734
948
3735
948
  // Returns true if a given use occurs in the predicated block. Phi nodes use
3736
948
  // their operands in their corresponding predecessor blocks.
3737
3.39k
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3738
3.39k
    auto *I = cast<Instruction>(U.getUser());
3739
3.39k
    BasicBlock *BB = I->getParent();
3740
3.39k
    if (auto *Phi = dyn_cast<PHINode>(I))
3741
0
      BB = Phi->getIncomingBlock(
3742
0
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3743
3.39k
    return BB == PredBB;
3744
3.39k
  };
3745
948
3746
948
  // Iteratively sink the scalarized operands of the predicated instruction
3747
948
  // into the block we created for it. When an instruction is sunk, its
3748
948
  // operands are then added to the worklist. The algorithm ends after one pass
3749
948
  // through the worklist doesn't sink a single instruction.
3750
948
  bool Changed;
3751
1.66k
  do {
3752
1.66k
    // Add the instructions that need to be reanalyzed to the worklist, and
3753
1.66k
    // reset the changed indicator.
3754
1.66k
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3755
1.66k
    InstsToReanalyze.clear();
3756
1.66k
    Changed = false;
3757
1.66k
3758
6.86k
    while (!Worklist.empty()) {
3759
5.19k
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3760
5.19k
3761
5.19k
      // We can't sink an instruction if it is a phi node, is already in the
3762
5.19k
      // predicated block, is not in the loop, or may have side effects.
3763
5.19k
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3764
5.19k
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
3765
2.42k
        continue;
3766
2.76k
3767
2.76k
      // It's legal to sink the instruction if all its uses occur in the
3768
2.76k
      // predicated block. Otherwise, there's nothing to do yet, and we may
3769
2.76k
      // need to reanalyze the instruction.
3770
2.76k
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3771
1.41k
        InstsToReanalyze.push_back(I);
3772
1.41k
        continue;
3773
1.41k
      }
3774
1.34k
3775
1.34k
      // Move the instruction to the beginning of the predicated block, and add
3776
1.34k
      // its operands to the worklist.
3777
1.34k
      I->moveBefore(&*PredBB->getFirstInsertionPt());
3778
1.34k
      Worklist.insert(I->op_begin(), I->op_end());
3779
1.34k
3780
1.34k
      // The sinking may have enabled other instructions to be sunk, so we will
3781
1.34k
      // need to iterate.
3782
1.34k
      Changed = true;
3783
1.34k
    }
3784
1.66k
  } while (Changed);
3785
948
}
3786
3787
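Editor's note: sinkScalarOperands() is a fixed-point worklist algorithm. The sketch below shows just that control structure in isolation; trySink is a hypothetical callback standing in for the legality check plus the move, and (unlike the real pass) the operands of sunk instructions are not re-queued here.

#include <functional>
#include <set>
#include <vector>

void sinkToFixedPoint(std::set<int> worklist,
                      const std::function<bool(int)> &trySink) {
  std::vector<int> reanalyze;
  bool changed;
  do {
    // Re-queue everything we could not decide last pass and reset the flag.
    worklist.insert(reanalyze.begin(), reanalyze.end());
    reanalyze.clear();
    changed = false;
    while (!worklist.empty()) {
      int candidate = *worklist.begin();
      worklist.erase(worklist.begin());
      if (trySink(candidate))
        changed = true;                 // sinking may enable further sinking
      else
        reanalyze.push_back(candidate); // not provably sinkable yet
    }
  } while (changed);                    // stop after a pass with no change
}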
7
void InnerLoopVectorizer::fixNonInductionPHIs() {
3788
9
  for (PHINode *OrigPhi : OrigPHIsToFix) {
3789
9
    PHINode *NewPhi =
3790
9
        cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3791
9
    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3792
9
3793
9
    SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3794
9
        predecessors(OrigPhi->getParent()));
3795
9
    SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3796
9
        predecessors(NewPhi->getParent()));
3797
9
    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3798
9
           "Scalar and Vector BB should have the same number of predecessors");
3799
9
3800
9
    // The insertion point in Builder may be invalidated by the time we get
3801
9
    // here. Force the Builder insertion point to something valid so that we do
3802
9
    // not run into issues during insertion point restore in
3803
9
    // getOrCreateVectorValue calls below.
3804
9
    Builder.SetInsertPoint(NewPhi);
3805
9
3806
9
    // The predecessor order is preserved and we can rely on mapping between
3807
9
    // scalar and vector block predecessors.
3808
26
    for (unsigned i = 0; i < NumIncomingValues; ++i) {
3809
17
      BasicBlock *NewPredBB = VectorBBPredecessors[i];
3810
17
3811
17
      // When looking up the new scalar/vector values to fix up, use incoming
3812
17
      // values from original phi.
3813
17
      Value *ScIncV =
3814
17
          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3815
17
3816
17
      // Scalar incoming value may need a broadcast
3817
17
      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3818
17
      NewPhi->addIncoming(NewIncV, NewPredBB);
3819
17
    }
3820
9
  }
3821
7
}
3822
3823
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3824
5.05k
                                              unsigned VF) {
3825
5.05k
  PHINode *P = cast<PHINode>(PN);
3826
5.05k
  if (EnableVPlanNativePath) {
3827
9
    // Currently we enter here in the VPlan-native path for non-induction
3828
9
    // PHIs where all control flow is uniform. We simply widen these PHIs.
3829
9
    // Create a vector phi with no operands - the vector phi operands will be
3830
9
    // set at the end of vector code generation.
3831
9
    Type *VecTy =
3832
9
        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3833
9
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3834
9
    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3835
9
    OrigPHIsToFix.push_back(P);
3836
9
3837
9
    return;
3838
9
  }
3839
5.04k
3840
5.04k
  assert(PN->getParent() == OrigLoop->getHeader() &&
3841
5.04k
         "Non-header phis should have been handled elsewhere");
3842
5.04k
3843
5.04k
  // In order to support recurrences we need to be able to vectorize Phi nodes.
3844
5.04k
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3845
5.04k
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3846
5.04k
  // this value when we vectorize all of the instructions that use the PHI.
3847
5.04k
  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3848
4.20k
    for (unsigned Part = 0; Part < UF; ++Part) {
3849
2.75k
      // This is phase one of vectorizing PHIs.
3850
2.75k
      Type *VecTy =
3851
2.75k
          (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3852
2.75k
      Value *EntryPart = PHINode::Create(
3853
2.75k
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3854
2.75k
      VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3855
2.75k
    }
3856
1.45k
    return;
3857
1.45k
  }
3858
3.59k
3859
3.59k
  setDebugLocFromInst(Builder, P);
3860
3.59k
3861
3.59k
  // This PHINode must be an induction variable.
3862
3.59k
  // Make sure that we know about it.
3863
3.59k
  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3864
3.59k
3865
3.59k
  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3866
3.59k
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3867
3.59k
3868
3.59k
  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3869
3.59k
  // which can be found from the original scalar operations.
3870
3.59k
  switch (II.getKind()) {
3871
3.59k
  case InductionDescriptor::IK_NoInduction:
3872
0
    llvm_unreachable("Unknown induction");
3873
3.59k
  case InductionDescriptor::IK_IntInduction:
3874
0
  case InductionDescriptor::IK_FpInduction:
3875
0
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
3876
3.59k
  case InductionDescriptor::IK_PtrInduction: {
3877
3.59k
    // Handle the pointer induction variable case.
3878
3.59k
    assert(P->getType()->isPointerTy() && "Unexpected type.");
3879
3.59k
    // This is the normalized GEP that starts counting at zero.
3880
3.59k
    Value *PtrInd = Induction;
3881
3.59k
    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3882
3.59k
    // Determine the number of scalars we need to generate for each unroll
3883
3.59k
    // iteration. If the instruction is uniform, we only need to generate the
3884
3.59k
    // first lane. Otherwise, we generate all VF values.
3885
3.59k
    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3886
3.59k
    // These are the scalar results. Notice that we don't generate vector GEPs
3887
3.59k
    // because scalar GEPs result in better code.
3888
10.9k
    for (unsigned Part = 0; Part < UF; ++Part) {
3889
14.7k
      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3890
7.38k
        Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3891
7.38k
        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3892
7.38k
        Value *SclrGep =
3893
7.38k
            emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3894
7.38k
        SclrGep->setName("next.gep");
3895
7.38k
        VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3896
7.38k
      }
3897
7.33k
    }
3898
3.59k
    return;
3899
0
  }
3900
3.59k
  }
3901
3.59k
}
3902
3903
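Editor's note: for the IK_PtrInduction case above, each (Part, Lane) pair gets its own scalar address offset by Lane + Part * VF from the normalized induction value. A small sketch of that address arithmetic (pointerInductionLanes and its parameters are hypothetical; this is not IR generation):

#include <cstddef>
#include <vector>

std::vector<const int *> pointerInductionLanes(const int *Base, size_t Index,
                                               unsigned UF, unsigned VF,
                                               bool UniformAfterVec) {
  unsigned Lanes = UniformAfterVec ? 1 : VF; // only lane 0 when uniform
  std::vector<const int *> Gep;
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < Lanes; ++Lane)
      Gep.push_back(Base + Index + Lane + Part * VF); // "next.gep" per lane
  return Gep;
}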
/// A helper function for checking whether an integer division-related
3904
/// instruction may divide by zero (in which case it must be predicated if
3905
/// executed conditionally in the scalar code).
3906
/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3907
/// Non-zero divisors that are non compile-time constants will not be
3908
/// converted into multiplication, so we will still end up scalarizing
3909
/// the division, but can do so w/o predication.
3910
575
static bool mayDivideByZero(Instruction &I) {
3911
575
  assert((I.getOpcode() == Instruction::UDiv ||
3912
575
          I.getOpcode() == Instruction::SDiv ||
3913
575
          I.getOpcode() == Instruction::URem ||
3914
575
          I.getOpcode() == Instruction::SRem) &&
3915
575
         "Unexpected instruction");
3916
575
  Value *Divisor = I.getOperand(1);
3917
575
  auto *CInt = dyn_cast<ConstantInt>(Divisor);
3918
575
  return !CInt || CInt->isZero();
3919
575
}
3920
3921
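Editor's note: the point of mayDivideByZero() is that a divide which is only reached under a guard in the scalar loop cannot simply be executed for every lane of a vector. A minimal illustration of the safe, predicated form (guardedDivide is a hypothetical name, not vectorizer output):

#include <vector>

void guardedDivide(const std::vector<int> &A, const std::vector<int> &B,
                   const std::vector<bool> &Mask, std::vector<int> &Out) {
  for (size_t i = 0; i < A.size(); ++i) {
    // Executing A[i] / B[i] for masked-off lanes could trap if B[i] == 0,
    // so each lane's division stays behind its own predicate.
    if (Mask[i])
      Out[i] = A[i] / B[i];
  }
}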
50.4k
void InnerLoopVectorizer::widenInstruction(Instruction &I) {
3922
50.4k
  switch (I.getOpcode()) {
3923
50.4k
  case Instruction::Br:
3924
0
  case Instruction::PHI:
3925
0
    llvm_unreachable("This instruction is handled by a different recipe.");
3926
93
  case Instruction::GetElementPtr: {
3927
93
    // Construct a vector GEP by widening the operands of the scalar GEP as
3928
93
    // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3929
93
    // results in a vector of pointers when at least one operand of the GEP
3930
93
    // is vector-typed. Thus, to keep the representation compact, we only use
3931
93
    // vector-typed operands for loop-varying values.
3932
93
    auto *GEP = cast<GetElementPtrInst>(&I);
3933
93
3934
93
    if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
3935
1
      // If we are vectorizing, but the GEP has only loop-invariant operands,
3936
1
      // the GEP we build (by only using vector-typed operands for
3937
1
      // loop-varying values) would be a scalar pointer. Thus, to ensure we
3938
1
      // produce a vector of pointers, we need to either arbitrarily pick an
3939
1
      // operand to broadcast, or broadcast a clone of the original GEP.
3940
1
      // Here, we broadcast a clone of the original.
3941
1
      //
3942
1
      // TODO: If at some point we decide to scalarize instructions having
3943
1
      //       loop-invariant operands, this special case will no longer be
3944
1
      //       required. We would add the scalarization decision to
3945
1
      //       collectLoopScalars() and teach getVectorValue() to broadcast
3946
1
      //       the lane-zero scalar value.
3947
1
      auto *Clone = Builder.Insert(GEP->clone());
3948
2
      for (unsigned Part = 0; Part < UF; ++Part) {
3949
1
        Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
3950
1
        VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
3951
1
        addMetadata(EntryPart, GEP);
3952
1
      }
3953
92
    } else {
3954
92
      // If the GEP has at least one loop-varying operand, we are sure to
3955
92
      // produce a vector of pointers. But if we are only unrolling, we want
3956
92
      // to produce a scalar GEP for each unroll part. Thus, the GEP we
3957
92
      // produce with the code below will be scalar (if VF == 1) or vector
3958
92
      // (otherwise). Note that for the unroll-only case, we still maintain
3959
92
      // values in the vector mapping with initVector, as we do for other
3960
92
      // instructions.
3961
248
      for (unsigned Part = 0; Part < UF; ++Part) {
3962
156
        // The pointer operand of the new GEP. If it's loop-invariant, we
3963
156
        // won't broadcast it.
3964
156
        auto *Ptr =
3965
156
            OrigLoop->isLoopInvariant(GEP->getPointerOperand())
3966
156
                ? GEP->getPointerOperand()
3967
156
                : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
3968
156
3969
156
        // Collect all the indices for the new GEP. If any index is
3970
156
        // loop-invariant, we won't broadcast it.
3971
156
        SmallVector<Value *, 4> Indices;
3972
215
        for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
3973
215
          if (OrigLoop->isLoopInvariant(U.get()))
3974
63
            Indices.push_back(U.get());
3975
152
          else
3976
152
            Indices.push_back(getOrCreateVectorValue(U.get(), Part));
3977
215
        }
3978
156
3979
156
        // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
3980
156
        // but it should be a vector, otherwise.
3981
156
        auto *NewGEP =
3982
156
            GEP->isInBounds()
3983
156
                ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
3984
154
                                            Indices)
3985
156
                : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
3986
156
        assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
3987
156
               "NewGEP is not a pointer vector");
3988
156
        VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
3989
156
        addMetadata(NewGEP, GEP);
3990
156
      }
3991
92
    }
3992
93
3993
93
    break;
3994
0
  }
3995
30.6k
  case Instruction::UDiv:
3996
30.6k
  case Instruction::SDiv:
3997
30.6k
  case Instruction::SRem:
3998
30.6k
  case Instruction::URem:
3999
30.6k
  case Instruction::Add:
4000
30.6k
  case Instruction::FAdd:
4001
30.6k
  case Instruction::Sub:
4002
30.6k
  case Instruction::FSub:
4003
30.6k
  case Instruction::FNeg:
4004
30.6k
  case Instruction::Mul:
4005
30.6k
  case Instruction::FMul:
4006
30.6k
  case Instruction::FDiv:
4007
30.6k
  case Instruction::FRem:
4008
30.6k
  case Instruction::Shl:
4009
30.6k
  case Instruction::LShr:
4010
30.6k
  case Instruction::AShr:
4011
30.6k
  case Instruction::And:
4012
30.6k
  case Instruction::Or:
4013
30.6k
  case Instruction::Xor: {
4014
30.6k
    // Just widen unops and binops.
4015
30.6k
    setDebugLocFromInst(Builder, &I);
4016
30.6k
4017
87.2k
    for (unsigned Part = 0; Part < UF; ++Part) {
4018
56.5k
      SmallVector<Value *, 2> Ops;
4019
56.5k
      for (Value *Op : I.operands())
4020
113k
        Ops.push_back(getOrCreateVectorValue(Op, Part));
4021
56.5k
4022
56.5k
      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4023
56.5k
4024
56.5k
      if (auto *VecOp = dyn_cast<Instruction>(V))
4025
56.5k
        VecOp->copyIRFlags(&I);
4026
56.5k
4027
56.5k
      // Use this vector value for all users of the original instruction.
4028
56.5k
      VectorLoopValueMap.setVectorValue(&I, Part, V);
4029
56.5k
      addMetadata(V, &I);
4030
56.5k
    }
4031
30.6k
4032
30.6k
    break;
4033
30.6k
  }
4034
30.6k
  case Instruction::Select: {
4035
596
    // Widen selects.
4036
596
    // If the selector is loop invariant we can create a select
4037
596
    // instruction with a scalar condition. Otherwise, use vector-select.
4038
596
    auto *SE = PSE.getSE();
4039
596
    bool InvariantCond =
4040
596
        SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4041
596
    setDebugLocFromInst(Builder, &I);
4042
596
4043
596
    // The condition can be loop invariant  but still defined inside the
4044
596
    // loop. This means that we can't just use the original 'cond' value.
4045
596
    // We have to take the 'vectorized' value and pick the first lane.
4046
596
    // Instcombine will make this a no-op.
4047
596
4048
596
    auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4049
596
4050
1.49k
    for (unsigned Part = 0; Part < UF; ++Part) {
4051
902
      Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4052
902
      Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4053
902
      Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4054
902
      Value *Sel =
4055
902
          Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4056
902
      VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4057
902
      addMetadata(Sel, &I);
4058
902
    }
4059
596
4060
596
    break;
4061
30.6k
  }
4062
30.6k
4063
30.6k
  case Instruction::ICmp:
4064
907
  case Instruction::FCmp: {
4065
907
    // Widen compares. Generate vector compares.
4066
907
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
4067
907
    auto *Cmp = dyn_cast<CmpInst>(&I);
4068
907
    setDebugLocFromInst(Builder, Cmp);
4069
2.31k
    for (unsigned Part = 0; Part < UF; ++Part) {
4070
1.40k
      Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4071
1.40k
      Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4072
1.40k
      Value *C = nullptr;
4073
1.40k
      if (FCmp) {
4074
181
        // Propagate fast math flags.
4075
181
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4076
181
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
4077
181
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4078
1.22k
      } else {
4079
1.22k
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4080
1.22k
      }
4081
1.40k
      VectorLoopValueMap.setVectorValue(&I, Part, C);
4082
1.40k
      addMetadata(C, &I);
4083
1.40k
    }
4084
907
4085
907
    break;
4086
907
  }
4087
907
4088
17.8k
  case Instruction::ZExt:
4089
17.8k
  case Instruction::SExt:
4090
17.8k
  case Instruction::FPToUI:
4091
17.8k
  case Instruction::FPToSI:
4092
17.8k
  case Instruction::FPExt:
4093
17.8k
  case Instruction::PtrToInt:
4094
17.8k
  case Instruction::IntToPtr:
4095
17.8k
  case Instruction::SIToFP:
4096
17.8k
  case Instruction::UIToFP:
4097
17.8k
  case Instruction::Trunc:
4098
17.8k
  case Instruction::FPTrunc:
4099
17.8k
  case Instruction::BitCast: {
4100
17.8k
    auto *CI = dyn_cast<CastInst>(&I);
4101
17.8k
    setDebugLocFromInst(Builder, CI);
4102
17.8k
4103
17.8k
    /// Vectorize casts.
4104
17.8k
    Type *DestTy =
4105
17.8k
        (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4106
17.8k
4107
52.0k
    for (unsigned Part = 0; Part < UF; ++Part) {
4108
34.1k
      Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4109
34.1k
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4110
34.1k
      VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4111
34.1k
      addMetadata(Cast, &I);
4112
34.1k
    }
4113
17.8k
    break;
4114
17.8k
  }
4115
17.8k
4116
17.8k
  case Instruction::Call: {
4117
297
    // Ignore dbg intrinsics.
4118
297
    if (isa<DbgInfoIntrinsic>(I))
4119
0
      break;
4120
297
    setDebugLocFromInst(Builder, &I);
4121
297
4122
297
    Module *M = I.getParent()->getParent()->getParent();
4123
297
    auto *CI = cast<CallInst>(&I);
4124
297
4125
297
    StringRef FnName = CI->getCalledFunction()->getName();
4126
297
    Function *F = CI->getCalledFunction();
4127
297
    Type *RetTy = ToVectorTy(CI->getType(), VF);
4128
297
    SmallVector<Type *, 4> Tys;
4129
297
    for (Value *ArgOperand : CI->arg_operands())
4130
342
      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4131
297
4132
297
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4133
297
4134
297
    // The flag shows whether we use Intrinsic or a usual Call for vectorized
4135
297
    // version of the instruction.
4136
297
    // Is it beneficial to perform intrinsic call compared to lib call?
4137
297
    bool NeedToScalarize;
4138
297
    unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4139
297
    bool UseVectorIntrinsic =
4140
297
        ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4141
297
    assert((UseVectorIntrinsic || !NeedToScalarize) &&
4142
297
           "Instruction should be scalarized elsewhere.");
4143
297
4144
652
    for (unsigned Part = 0; Part < UF; ++Part) {
4145
355
      SmallVector<Value *, 4> Args;
4146
761
      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4147
406
        Value *Arg = CI->getArgOperand(i);
4148
406
        // Some intrinsics have a scalar argument - don't replace it with a
4149
406
        // vector.
4150
406
        if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4151
403
          Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4152
406
        Args.push_back(Arg);
4153
406
      }
4154
355
4155
355
      Function *VectorF;
4156
355
      if (UseVectorIntrinsic) {
4157
221
        // Use vector version of the intrinsic.
4158
221
        Type *TysForDecl[] = {CI->getType()};
4159
221
        if (VF > 1)
4160
221
          TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4161
221
        VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4162
221
      } else {
4163
134
        // Use vector version of the library call.
4164
134
        StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4165
134
        assert(!VFnName.empty() && "Vector function name is empty.");
4166
134
        VectorF = M->getFunction(VFnName);
4167
134
        if (!VectorF) {
4168
83
          // Generate a declaration
4169
83
          FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4170
83
          VectorF =
4171
83
              Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4172
83
          VectorF->copyAttributesFrom(F);
4173
83
        }
4174
134
      }
4175
355
      assert(VectorF && "Can't create vector function.");
4176
355
4177
355
      SmallVector<OperandBundleDef, 1> OpBundles;
4178
355
      CI->getOperandBundlesAsDefs(OpBundles);
4179
355
      CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4180
355
4181
355
      if (isa<FPMathOperator>(V))
4182
335
        V->copyFastMathFlags(CI);
4183
355
4184
355
      VectorLoopValueMap.setVectorValue(&I, Part, V);
4185
355
      addMetadata(V, &I);
4186
355
    }
4187
297
4188
297
    break;
4189
297
  }
4190
297
4191
297
  default:
4192
0
    // This instruction is not vectorized by simple widening.
4193
0
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4194
0
    llvm_unreachable("Unhandled instruction!");
4195
50.4k
  } // end of switch.
4196
50.4k
}
4197
4198
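Editor's note: widenInstruction() handles one scalar instruction at a time; for a binary operator the effect is that each original operation becomes UF vector operations of VF lanes each. The sketch below shows that emitted shape for an add with VF = 4, UF = 2 (widenedAdd is a hypothetical name; this illustrates the data layout, not the IRBuilder calls):

#include <vector>

void widenedAdd(const std::vector<int> &A, const std::vector<int> &B,
                std::vector<int> &C) {
  const unsigned VF = 4, UF = 2;
  size_t i = 0;
  for (; i + VF * UF <= A.size(); i += VF * UF)
    for (unsigned Part = 0; Part < UF; ++Part)   // one widened op per part
      for (unsigned Lane = 0; Lane < VF; ++Lane) // VF lanes per widened op
        C[i + Part * VF + Lane] =
            A[i + Part * VF + Lane] + B[i + Part * VF + Lane];
  for (; i < A.size(); ++i)                      // scalar remainder
    C[i] = A[i] + B[i];
}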
17.0k
void InnerLoopVectorizer::updateAnalysis() {
4199
17.0k
  // Forget the original basic block.
4200
17.0k
  PSE.getSE()->forgetLoop(OrigLoop);
4201
17.0k
4202
17.0k
  // DT is not kept up-to-date for outer loop vectorization
4203
17.0k
  if (EnableVPlanNativePath)
4204
7
    return;
4205
17.0k
4206
17.0k
  // Update the dominator tree information.
4207
17.0k
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4208
17.0k
         "Entry does not dominate exit.");
4209
17.0k
4210
17.0k
  DT->addNewBlock(LoopMiddleBlock,
4211
17.0k
                  LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4212
17.0k
  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4213
17.0k
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4214
17.0k
  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4215
17.0k
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4216
17.0k
}
4217
4218
32.3k
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4219
32.3k
  // We should not collect Scalars more than once per VF. Right now, this
4220
32.3k
  // function is called from collectUniformsAndScalars(), which already does
4221
32.3k
  // this check. Collecting Scalars for VF=1 does not make any sense.
4222
32.3k
  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4223
32.3k
         "This function should not be visited twice for the same VF");
4224
32.3k
4225
32.3k
  SmallSetVector<Instruction *, 8> Worklist;
4226
32.3k
4227
32.3k
  // These sets are used to seed the analysis with pointers used by memory
4228
32.3k
  // accesses that will remain scalar.
4229
32.3k
  SmallSetVector<Instruction *, 8> ScalarPtrs;
4230
32.3k
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4231
32.3k
4232
32.3k
  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4233
32.3k
  // The pointer operands of loads and stores will be scalar as long as the
4234
32.3k
  // memory access is not a gather or scatter operation. The value operand of a
4235
32.3k
  // store will remain scalar if the store is scalarized.
4236
32.3k
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4237
16.3k
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4238
16.3k
    assert(WideningDecision != CM_Unknown &&
4239
16.3k
           "Widening decision should be ready at this moment");
4240
16.3k
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4241
6.44k
      if (Ptr == Store->getValueOperand())
4242
82
        return WideningDecision == CM_Scalarize;
4243
16.2k
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4244
16.2k
           "Ptr is neither a value or pointer operand");
4245
16.2k
    return WideningDecision != CM_GatherScatter;
4246
16.2k
  };
4247
32.3k
4248
32.3k
  // A helper that returns true if the given value is a bitcast or
4249
32.3k
  // getelementptr instruction contained in the loop.
4250
240k
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4251
240k
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4252
240k
            isa<GetElementPtrInst>(V)) &&
4253
240k
           !TheLoop->isLoopInvariant(V);
4254
240k
  };
4255
32.3k
4256
32.3k
  // A helper that evaluates a memory access's use of a pointer. If the use
4257
32.3k
  // will be a scalar use, and the pointer is only used by memory accesses, we
4258
32.3k
  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4259
32.3k
  // PossibleNonScalarPtrs.
4260
96.9k
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4261
96.9k
    // We only care about bitcast and getelementptr instructions contained in
4262
96.9k
    // the loop.
4263
96.9k
    if (!isLoopVaryingBitCastOrGEP(Ptr))
4264
36.5k
      return;
4265
60.3k
4266
60.3k
    // If the pointer has already been identified as scalar (e.g., if it was
4267
60.3k
    // also identified as uniform), there's nothing to do.
4268
60.3k
    auto *I = cast<Instruction>(Ptr);
4269
60.3k
    if (Worklist.count(I))
4270
46.0k
      return;
4271
14.3k
4272
14.3k
    // If the use of the pointer will be a scalar use, and all users of the
4273
14.3k
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4274
14.3k
    // place the pointer in PossibleNonScalarPtrs.
4275
16.5k
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4276
16.5k
          return isa<LoadInst>(U) || isa<StoreInst>(U);
4277
16.5k
        }))
4278
14.1k
      ScalarPtrs.insert(I);
4279
196
    else
4280
196
      PossibleNonScalarPtrs.insert(I);
4281
14.3k
  };
4282
32.3k
4283
32.3k
  // We seed the scalars analysis with three classes of instructions: (1)
4284
32.3k
  // instructions marked uniform-after-vectorization, (2) bitcast and
4285
32.3k
  // getelementptr instructions used by memory accesses requiring a scalar use,
4286
32.3k
  // and (3) pointer induction variables and their update instructions (we
4287
32.3k
  // currently only scalarize these).
4288
32.3k
  //
4289
32.3k
  // (1) Add to the worklist all instructions that have been identified as
4290
32.3k
  // uniform-after-vectorization.
4291
32.3k
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4292
32.3k
4293
32.3k
  // (2) Add to the worklist all bitcast and getelementptr instructions used by
4294
32.3k
  // memory accesses requiring a scalar use. The pointer operands of loads and
4295
32.3k
  // stores will be scalar as long as the memory access is not a gather or
4296
32.3k
  // scatter operation. The value operand of a store will remain scalar if the
4297
32.3k
  // store is scalarized.
4298
32.3k
  for (auto *BB : TheLoop->blocks())
4299
389k
    for (auto &I : *BB) {
4300
389k
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
4301
31.6k
        evaluatePtrUse(Load, Load->getPointerOperand());
4302
357k
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4303
32.6k
        evaluatePtrUse(Store, Store->getPointerOperand());
4304
32.6k
        evaluatePtrUse(Store, Store->getValueOperand());
4305
32.6k
      }
4306
389k
    }
4307
32.3k
  for (auto *I : ScalarPtrs)
4308
13.0k
    if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4309
13.0k
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4310
13.0k
      Worklist.insert(I);
4311
13.0k
    }
4312
32.3k
4313
32.3k
  // (3) Add to the worklist all pointer induction variables and their update
4314
32.3k
  // instructions.
4315
32.3k
  //
4316
32.3k
  // TODO: Once we are able to vectorize pointer induction variables we should
4317
32.3k
  //       no longer insert them into the worklist here.
4318
32.3k
  auto *Latch = TheLoop->getLoopLatch();
4319
39.0k
  for (auto &Induction : *Legal->getInductionVars()) {
4320
39.0k
    auto *Ind = Induction.first;
4321
39.0k
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4322
39.0k
    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4323
31.8k
      continue;
4324
7.21k
    Worklist.insert(Ind);
4325
7.21k
    Worklist.insert(IndUpdate);
4326
7.21k
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4327
7.21k
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4328
7.21k
                      << "\n");
4329
7.21k
  }
4330
32.3k
4331
32.3k
  // Insert the forced scalars.
4332
32.3k
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
4333
32.3k
  // induction variable when the PHI user is scalarized.
4334
32.3k
  auto ForcedScalar = ForcedScalars.find(VF);
4335
32.3k
  if (ForcedScalar != ForcedScalars.end())
4336
13
    for (auto *I : ForcedScalar->second)
4337
60
      Worklist.insert(I);
4338
32.3k
4339
32.3k
  // Expand the worklist by looking through any bitcasts and getelementptr
4340
32.3k
  // instructions we've already identified as scalar. This is similar to the
4341
32.3k
  // expansion step in collectLoopUniforms(); however, here we're only
4342
32.3k
  // expanding to include additional bitcasts and getelementptr instructions.
4343
32.3k
  unsigned Idx = 0;
4344
175k
  while (Idx != Worklist.size()) {
4345
143k
    Instruction *Dst = Worklist[Idx++];
4346
143k
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4347
132k
      continue;
4348
10.9k
    auto *Src = cast<Instruction>(Dst->getOperand(0));
4349
30.6k
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4350
30.6k
          auto *J = cast<Instruction>(U);
4351
30.6k
          return !TheLoop->contains(J) || Worklist.count(J) ||
4352
30.6k
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4353
1.99k
                  isScalarUse(J, Src));
4354
30.6k
        })) {
4355
10.9k
      Worklist.insert(Src);
4356
10.9k
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4357
10.9k
    }
4358
10.9k
  }
4359
32.3k
4360
32.3k
  // An induction variable will remain scalar if all users of the induction
4361
32.3k
  // variable and induction variable update remain scalar.
4362
39.0k
  for (auto &Induction : *Legal->getInductionVars()) {
4363
39.0k
    auto *Ind = Induction.first;
4364
39.0k
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4365
39.0k
4366
39.0k
    // We already considered pointer induction variables, so there's no reason
4367
39.0k
    // to look at their users again.
4368
39.0k
    //
4369
39.0k
    // TODO: Once we are able to vectorize pointer induction variables we
4370
39.0k
    //       should no longer skip over them here.
4371
39.0k
    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4372
7.21k
      continue;
4373
31.8k
4374
31.8k
    // Determine if all users of the induction variable are scalar after
4375
31.8k
    // vectorization.
4376
73.1k
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4377
73.1k
      auto *I = cast<Instruction>(U);
4378
73.1k
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4379
73.1k
    });
4380
31.8k
    if (!ScalarInd)
4381
4.16k
      continue;
4382
27.6k
4383
27.6k
    // Determine if all users of the induction variable update instruction are
4384
27.6k
    // scalar after vectorization.
4385
27.6k
    auto ScalarIndUpdate =
4386
65.9k
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4387
65.9k
          auto *I = cast<Instruction>(U);
4388
65.9k
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4389
65.9k
        });
4390
27.6k
    if (!ScalarIndUpdate)
4391
9.78k
      continue;
4392
17.8k
4393
17.8k
    // The induction variable and its update instruction will remain scalar.
4394
17.8k
    Worklist.insert(Ind);
4395
17.8k
    Worklist.insert(IndUpdate);
4396
17.8k
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4397
17.8k
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4398
17.8k
                      << "\n");
4399
17.8k
  }
4400
32.3k
4401
32.3k
  Scalars[VF].insert(Worklist.begin(), Worklist.end());
4402
32.3k
}
4403
4404
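Editor's note: the last step of collectLoopScalars() keeps an induction PHI and its update scalar only when all of their in-loop users (other than each other) are already in the scalar worklist. A toy use-graph version of that check, under the assumption that out-of-loop users have already been filtered out (inductionStaysScalar and the containers are hypothetical):

#include <algorithm>
#include <map>
#include <set>
#include <vector>

bool inductionStaysScalar(int Ind, int IndUpdate,
                          const std::map<int, std::vector<int>> &Users,
                          const std::set<int> &ScalarWorklist) {
  // Every user of V must be Other (the paired instruction) or already scalar.
  auto allScalar = [&](int V, int Other) {
    const auto &Us = Users.at(V);
    return std::all_of(Us.begin(), Us.end(), [&](int U) {
      return U == Other || ScalarWorklist.count(U) != 0;
    });
  };
  return allScalar(Ind, IndUpdate) && allScalar(IndUpdate, Ind);
}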
749k
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4405
749k
  if (!blockNeedsPredication(I->getParent()))
4406
708k
    return false;
4407
41.0k
  switch(I->getOpcode()) {
4408
41.0k
  default:
4409
27.6k
    break;
4410
41.0k
  case Instruction::Load:
4411
12.7k
  case Instruction::Store: {
4412
12.7k
    if (!Legal->isMaskRequired(I))
4413
86
      return false;
4414
12.7k
    auto *Ptr = getLoadStorePointerOperand(I);
4415
12.7k
    auto *Ty = getMemInstValueType(I);
4416
12.7k
    // We have already decided how to vectorize this instruction, get that
4417
12.7k
    // result.
4418
12.7k
    if (VF > 1) {
4419
4.54k
      InstWidening WideningDecision = getWideningDecision(I, VF);
4420
4.54k
      assert(WideningDecision != CM_Unknown &&
4421
4.54k
             "Widening decision should be ready at this moment");
4422
4.54k
      return WideningDecision == CM_Scalarize;
4423
4.54k
    }
4424
8.16k
    return isa<LoadInst>(I) ?
4425
3.64k
        !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4426
8.16k
      : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4427
8.16k
  }
4428
8.16k
  case Instruction::UDiv:
4429
575
  case Instruction::SDiv:
4430
575
  case Instruction::SRem:
4431
575
  case Instruction::URem:
4432
575
    return mayDivideByZero(*I);
4433
27.6k
  }
4434
27.6k
  return false;
4435
27.6k
}
4436
4437
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4438
2.26k
                                                               unsigned VF) {
4439
2.26k
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4440
2.26k
  assert(getWideningDecision(I, VF) == CM_Unknown &&
4441
2.26k
         "Decision should not be set yet.");
4442
2.26k
  auto *Group = getInterleavedAccessGroup(I);
4443
2.26k
  assert(Group && "Must have a group.");
4444
2.26k
4445
2.26k
  // If the instruction's allocated size doesn't equal it's type size, it
4446
2.26k
  // requires padding and will be scalarized.
4447
2.26k
  auto &DL = I->getModule()->getDataLayout();
4448
2.26k
  auto *ScalarTy = getMemInstValueType(I);
4449
2.26k
  if (hasIrregularType(ScalarTy, DL, VF))
4450
1
    return false;
4451
2.26k
4452
2.26k
  // Check if masking is required.
4453
2.26k
  // A Group may need masking for one of two reasons: it resides in a block that
4454
2.26k
  // needs predication, or it was decided to use masking to deal with gaps.
4455
2.26k
  bool PredicatedAccessRequiresMasking = 
4456
2.26k
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4457
2.26k
  bool AccessWithGapsRequiresMasking = 
4458
2.26k
      Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
4459
2.26k
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4460
2.25k
    return true;
4461
10
4462
10
  // If masked interleaving is required, we expect that the user/target had
4463
10
  // enabled it, because otherwise it either wouldn't have been created or
4464
10
  // it should have been invalidated by the CostModel.
4465
10
  assert(useMaskedInterleavedAccesses(TTI) &&
4466
10
         "Masked interleave-groups for predicated accesses are not enabled.");
4467
10
4468
10
  auto *Ty = getMemInstValueType(I);
4469
10
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4470
10
                          : TTI.isLegalMaskedStore(Ty);
4471
10
}
4472
4473
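Editor's note: interleavedAccessCanBeWidened() above decides whether an interleave group can be covered by wide loads/stores plus shuffles. For a group of factor 2 (e.g. consecutive re/im pairs), the data movement the vectorizer relies on looks like the sketch below for VF = 4 (deinterleave2 is a hypothetical name; it shows only the lane arithmetic, not masking or cost modeling):

#include <array>
#include <vector>

void deinterleave2(const std::vector<int> &AB, size_t i,
                   std::array<int, 4> &A, std::array<int, 4> &B) {
  std::array<int, 8> Wide;
  for (int l = 0; l < 8; ++l)
    Wide[l] = AB[2 * i + l];    // one wide load of the whole group
  for (int l = 0; l < 4; ++l) {
    A[l] = Wide[2 * l];         // shuffle <0,2,4,6>: first member
    B[l] = Wide[2 * l + 1];     // shuffle <1,3,5,7>: second member
  }
}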
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4474
63.6k
                                                               unsigned VF) {
4475
63.6k
  // Get and ensure we have a valid memory instruction.
4476
63.6k
  LoadInst *LI = dyn_cast<LoadInst>(I);
4477
63.6k
  StoreInst *SI = dyn_cast<StoreInst>(I);
4478
63.6k
  assert((LI || SI) && "Invalid memory instruction");
4479
63.6k
4480
63.6k
  auto *Ptr = getLoadStorePointerOperand(I);
4481
63.6k
4482
63.6k
  // In order to be widened, the pointer should be consecutive, first of all.
4483
63.6k
  if (!Legal->isConsecutivePtr(Ptr))
4484
19.1k
    return false;
4485
44.5k
4486
44.5k
  // If the instruction is a store located in a predicated block, it will be
4487
44.5k
  // scalarized.
4488
44.5k
  if (isScalarWithPredication(I))
4489
1.06k
    return false;
4490
43.4k
4491
43.4k
  // If the instruction's allocated size doesn't equal its type size, it
4492
43.4k
  // requires padding and will be scalarized.
4493
43.4k
  auto &DL = I->getModule()->getDataLayout();
4494
43.4k
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4495
43.4k
  if (hasIrregularType(ScalarTy, DL, VF))
4496
1
    return false;
4497
43.4k
4498
43.4k
  return true;
4499
43.4k
}
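Editorial sketch (illustrative only): the widening gate above as a sequence of early-outs. The AccessInfo fields are hypothetical placeholders for the isConsecutivePtr, isScalarWithPredication and hasIrregularType queries in the real cost model.

struct AccessInfo {
  bool ConsecutivePtr;        // pointer advances with stride +1/-1
  bool ScalarWithPredication; // predicated access that must be scalarized
  bool IrregularType;         // allocated size != store size, needs padding
};

static bool canWidenMemoryInstruction(const AccessInfo &A) {
  if (!A.ConsecutivePtr)
    return false; // non-consecutive accesses become gather/scatter or scalar
  if (A.ScalarWithPredication)
    return false; // predicated accesses without masked ops are scalarized
  if (A.IrregularType)
    return false; // padded types cannot be packed densely into a vector
  return true;
}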
4500
4501
32.3k
void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4502
32.3k
  // We should not collect Uniforms more than once per VF. Right now,
4503
32.3k
  // this function is called from collectUniformsAndScalars(), which
4504
32.3k
  // already does this check. Collecting Uniforms for VF=1 does not make any
4505
32.3k
  // sense.
4506
32.3k
4507
32.3k
  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4508
32.3k
         "This function should not be visited twice for the same VF");
4509
32.3k
4510
32.3k
  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
4511
32.3k
  // not analyze again.  Uniforms.count(VF) will return 1.
4512
32.3k
  Uniforms[VF].clear();
4513
32.3k
4514
32.3k
  // We now know that the loop is vectorizable!
4515
32.3k
  // Collect instructions inside the loop that will remain uniform after
4516
32.3k
  // vectorization.
4517
32.3k
4518
32.3k
  // Global values, params and instructions outside of current loop are out of
4519
32.3k
  // scope.
4520
197k
  auto isOutOfScope = [&](Value *V) -> bool {
4521
197k
    Instruction *I = dyn_cast<Instruction>(V);
4522
197k
    return (!I || !TheLoop->contains(I));
4523
197k
  };
4524
32.3k
4525
32.3k
  SetVector<Instruction *> Worklist;
4526
32.3k
  BasicBlock *Latch = TheLoop->getLoopLatch();
4527
32.3k
4528
32.3k
  // Start with the conditional branch. If the branch condition is an
4529
32.3k
  // instruction contained in the loop that is only used by the branch, it is
4530
32.3k
  // uniform.
4531
32.3k
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4532
32.3k
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4533
32.3k
    Worklist.insert(Cmp);
4534
32.3k
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4535
32.3k
  }
4536
32.3k
4537
32.3k
  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4538
32.3k
  // are pointers that are treated like consecutive pointers during
4539
32.3k
  // vectorization. The pointer operands of interleaved accesses are an
4540
32.3k
  // example.
4541
32.3k
  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4542
32.3k
4543
32.3k
  // Holds pointer operands of instructions that are possibly non-uniform.
4544
32.3k
  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4545
32.3k
4546
64.0k
  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4547
64.0k
    InstWidening WideningDecision = getWideningDecision(I, VF);
4548
64.0k
    assert(WideningDecision != CM_Unknown &&
4549
64.0k
           "Widening decision should be ready at this moment");
4550
64.0k
4551
64.0k
    return (WideningDecision == CM_Widen ||
4552
64.0k
            WideningDecision == CM_Widen_Reverse ||
4553
64.0k
            WideningDecision == CM_Interleave);
4554
64.0k
  };
4555
32.3k
  // Iterate over the instructions in the loop, and collect all
4556
32.3k
  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4557
32.3k
  // that a consecutive-like pointer operand will be scalarized, we collect it
4558
32.3k
  // in PossibleNonUniformPtrs instead. We use two sets here because a single
4559
32.3k
  // getelementptr instruction can be used by both vectorized and scalarized
4560
32.3k
  // memory instructions. For example, if a loop loads and stores from the same
4561
32.3k
  // location, but the store is conditional, the store will be scalarized, and
4562
32.3k
  // the getelementptr won't remain uniform.
4563
32.3k
  for (auto *BB : TheLoop->blocks())
4564
389k
    for (auto &I : *BB) {
4565
389k
      // If there's no pointer operand, there's nothing to do.
4566
389k
      auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4567
389k
      if (!Ptr)
4568
324k
        continue;
4569
64.1k
4570
64.1k
      // True if all users of Ptr are memory accesses that have Ptr as their
4571
64.1k
      // pointer operand.
4572
64.1k
      auto UsersAreMemAccesses =
4573
73.7k
          llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4574
73.7k
            return getLoadStorePointerOperand(U) == Ptr;
4575
73.7k
          });
4576
64.1k
4577
64.1k
      // Ensure the memory instruction will not be scalarized or used by
4578
64.1k
      // gather/scatter, making its pointer operand non-uniform. If the pointer
4579
64.1k
      // operand is used by any instruction other than a memory access, we
4580
64.1k
      // conservatively assume the pointer operand may be non-uniform.
4581
64.1k
      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4582
18.8k
        PossibleNonUniformPtrs.insert(Ptr);
4583
45.2k
4584
45.2k
      // If the memory instruction will be vectorized and its pointer operand
4585
45.2k
      // is consecutive-like, or interleaving - the pointer operand should
4586
45.2k
      // remain uniform.
4587
45.2k
      else
4588
45.2k
        ConsecutiveLikePtrs.insert(Ptr);
4589
64.1k
    }
4590
32.3k
4591
32.3k
  // Add to the Worklist all consecutive and consecutive-like pointers that
4592
32.3k
  // aren't also identified as possibly non-uniform.
4593
32.3k
  for (auto *V : ConsecutiveLikePtrs)
4594
43.1k
    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4595
42.8k
      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4596
42.8k
      Worklist.insert(V);
4597
42.8k
    }
4598
32.3k
4599
32.3k
  // Expand Worklist in topological order: whenever a new instruction
4600
32.3k
  // is added, its users should already be inside Worklist. This ensures
4601
32.3k
  // a uniform instruction will only be used by uniform instructions.
4602
32.3k
  unsigned idx = 0;
4603
115k
  while (idx != Worklist.size()) {
4604
82.6k
    Instruction *I = Worklist[idx++];
4605
82.6k
4606
197k
    for (auto OV : I->operand_values()) {
4607
197k
      // isOutOfScope operands cannot be uniform instructions.
4608
197k
      if (isOutOfScope(OV))
4609
114k
        continue;
4610
82.8k
      // First order recurrence Phi's should typically be considered
4611
82.8k
      // non-uniform.
4612
82.8k
      auto *OP = dyn_cast<PHINode>(OV);
4613
82.8k
      if (OP && Legal->isFirstOrderRecurrence(OP))
4614
0
        continue;
4615
82.8k
      // If all the users of the operand are uniform, then add the
4616
82.8k
      // operand into the uniform worklist.
4617
82.8k
      auto *OI = cast<Instruction>(OV);
4618
180k
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4619
180k
            auto *J = cast<Instruction>(U);
4620
180k
            return Worklist.count(J) ||
4621
180k
                   (OI == getLoadStorePointerOperand(J) &&
4622
74.2k
                    isUniformDecision(J, VF));
4623
180k
          })) {
4624
8.88k
        Worklist.insert(OI);
4625
8.88k
        LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4626
8.88k
      }
4627
82.8k
    }
4628
82.6k
  }
4629
32.3k
4630
32.3k
  // Returns true if Ptr is the pointer operand of a memory access instruction
4631
32.3k
  // I, and I is known to not require scalarization.
4632
32.3k
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4633
19.3k
    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4634
19.3k
  };
4635
32.3k
4636
32.3k
  // For an instruction to be added into Worklist above, all its users inside
4637
32.3k
  // the loop should also be in Worklist. However, this condition cannot be
4638
32.3k
  // true for phi nodes that form a cyclic dependence. We must process phi
4639
32.3k
  // nodes separately. An induction variable will remain uniform if all users
4640
32.3k
  // of the induction variable and induction variable update remain uniform.
4641
32.3k
  // The code below handles both pointer and non-pointer induction variables.
4642
39.0k
  for (auto &Induction : *Legal->getInductionVars()) {
4643
39.0k
    auto *Ind = Induction.first;
4644
39.0k
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4645
39.0k
4646
39.0k
    // Determine if all users of the induction variable are uniform after
4647
39.0k
    // vectorization.
4648
83.6k
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4649
83.6k
      auto *I = cast<Instruction>(U);
4650
83.6k
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4651
83.6k
             isVectorizedMemAccessUse(I, Ind);
4652
83.6k
    });
4653
39.0k
    if (!UniformInd)
4654
5.77k
      continue;
4655
33.2k
4656
33.2k
    // Determine if all users of the induction variable update instruction are
4657
33.2k
    // uniform after vectorization.
4658
33.2k
    auto UniformIndUpdate =
4659
75.9k
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4660
75.9k
          auto *I = cast<Instruction>(U);
4661
75.9k
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4662
75.9k
                 isVectorizedMemAccessUse(I, IndUpdate);
4663
75.9k
        });
4664
33.2k
    if (!UniformIndUpdate)
4665
9.82k
      continue;
4666
23.4k
4667
23.4k
    // The induction variable and its update instruction will remain uniform.
4668
23.4k
    Worklist.insert(Ind);
4669
23.4k
    Worklist.insert(IndUpdate);
4670
23.4k
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4671
23.4k
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4672
23.4k
                      << "\n");
4673
23.4k
  }
4674
32.3k
4675
32.3k
  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4676
32.3k
}
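Editorial sketch (not the LLVM pass): the closure above grows the uniform set only when every in-loop user of an operand is already uniform, which is why the worklist is expanded users-first. The toy model below restates that fixed-point expansion on a small graph; Node, UsersOf and OperandsOf are hypothetical stand-ins for Instruction, users() and operand_values().

#include <map>
#include <set>
#include <string>
#include <vector>

using Node = std::string;

static std::set<Node>
collectUniforms(const std::vector<Node> &Seeds,
                const std::map<Node, std::vector<Node>> &UsersOf,
                const std::map<Node, std::vector<Node>> &OperandsOf) {
  std::set<Node> Uniform(Seeds.begin(), Seeds.end());
  std::vector<Node> Worklist(Seeds.begin(), Seeds.end());
  for (size_t Idx = 0; Idx != Worklist.size(); ++Idx) {
    auto OpsIt = OperandsOf.find(Worklist[Idx]);
    if (OpsIt == OperandsOf.end())
      continue;
    for (const Node &Op : OpsIt->second) {
      if (Uniform.count(Op))
        continue;
      // An operand becomes uniform only once all of its users are uniform.
      bool AllUsersUniform = true;
      auto UseIt = UsersOf.find(Op);
      if (UseIt != UsersOf.end()) {
        for (const Node &U : UseIt->second) {
          if (!Uniform.count(U)) {
            AllUsersUniform = false;
            break;
          }
        }
      }
      if (AllUsersUniform) {
        Uniform.insert(Op);     // every user is uniform, so Op is too
        Worklist.push_back(Op); // and its own operands get re-examined
      }
    }
  }
  return Uniform;
}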
4677
4678
19.9k
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4679
19.9k
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4680
2
    // TODO: It may be useful to do since it's still likely to be dynamically
4681
2
    // uniform if the target can skip.
4682
2
    LLVM_DEBUG(
4683
2
        dbgs() << "LV: Not inserting runtime ptr check for divergent target");
4684
2
4685
2
    ORE->emit(
4686
2
      createMissedAnalysis("CantVersionLoopWithDivergentTarget")
4687
2
      << "runtime pointer checks needed. Not enabled for divergent target");
4688
2
4689
2
    return None;
4690
2
  }
4691
19.9k
4692
19.9k
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4693
19.9k
  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
4694
19.7k
    return computeFeasibleMaxVF(OptForSize, TC);
4695
212
4696
212
  if (Legal->getRuntimePointerChecking()->Need) {
4697
37
    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4698
37
              << "runtime pointer checks needed. Enable vectorization of this "
4699
37
                 "loop with '#pragma clang loop vectorize(enable)' when "
4700
37
                 "compiling with -Os/-Oz");
4701
37
    LLVM_DEBUG(
4702
37
        dbgs()
4703
37
        << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
4704
37
    return None;
4705
37
  }
4706
175
4707
175
  if (!PSE.getUnionPredicate().getPredicates().empty()) {
4708
9
    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4709
9
              << "runtime SCEV checks needed. Enable vectorization of this "
4710
9
                 "loop with '#pragma clang loop vectorize(enable)' when "
4711
9
                 "compiling with -Os/-Oz");
4712
9
    LLVM_DEBUG(
4713
9
        dbgs()
4714
9
        << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
4715
9
    return None;
4716
9
  }
4717
166
4718
166
  // FIXME: Avoid specializing for stride==1 instead of bailing out.
4719
166
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4720
1
    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4721
1
              << "runtime stride == 1 checks needed. Enable vectorization of "
4722
1
                 "this loop with '#pragma clang loop vectorize(enable)' when "
4723
1
                 "compiling with -Os/-Oz");
4724
1
    LLVM_DEBUG(
4725
1
        dbgs()
4726
1
        << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
4727
1
    return None;
4728
1
  }
4729
165
4730
165
  // If we optimize the program for size, avoid creating the tail loop.
4731
165
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4732
165
4733
165
  if (TC == 1) {
4734
5
    ORE->emit(createMissedAnalysis("SingleIterationLoop")
4735
5
              << "loop trip count is one, irrelevant for vectorization");
4736
5
    LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
4737
5
    return None;
4738
5
  }
4739
160
4740
160
  // Record that scalar epilogue is not allowed.
4741
160
  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4742
160
4743
160
  IsScalarEpilogueAllowed = !OptForSize;
4744
160
4745
160
  // We don't create an epilogue when optimizing for size.
4746
160
  // Invalidate interleave groups that require an epilogue if we can't mask
4747
160
  // the interleave-group.
4748
160
  if (!useMaskedInterleavedAccesses(TTI)) 
4749
153
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4750
160
4751
160
  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
4752
160
4753
160
  if (TC > 0 && TC % MaxVF == 0) {
4754
85
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4755
85
    return MaxVF;
4756
85
  }
4757
75
4758
75
  // If we don't know the precise trip count, or if the trip count that we
4759
75
  // found modulo the vectorization factor is not zero, try to fold the tail
4760
75
  // by masking.
4761
75
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4762
75
  if (Legal->canFoldTailByMasking()) {
4763
26
    FoldTailByMasking = true;
4764
26
    return MaxVF;
4765
26
  }
4766
49
4767
49
  if (TC == 0) {
4768
16
    ORE->emit(
4769
16
        createMissedAnalysis("UnknownLoopCountComplexCFG")
4770
16
        << "unable to calculate the loop count due to complex control flow");
4771
16
    return None;
4772
16
  }
4773
33
4774
33
  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
4775
33
            << "cannot optimize for size and vectorize at the same time. "
4776
33
               "Enable vectorization of this loop with '#pragma clang loop "
4777
33
               "vectorize(enable)' when compiling with -Os/-Oz");
4778
33
  return None;
4779
33
}
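Editorial sketch: for the -Os/-Oz path above, the final decision is what happens to the loop tail once a MaxVF is known. The simplified helper below mirrors that step, returning 0 where the source returns None; the signature and canFoldTail flag are illustrative stand-ins for Legal->canFoldTailByMasking(), not the real API.

static unsigned maxVFForOptSize(unsigned TC, unsigned MaxVF, bool canFoldTail,
                                bool &FoldTailByMasking) {
  FoldTailByMasking = false;
  if (TC > 0 && TC % MaxVF == 0)
    return MaxVF;             // the vector loop consumes the whole trip count
  if (canFoldTail) {
    FoldTailByMasking = true; // mask the last iteration instead of an epilogue
    return MaxVF;
  }
  return 0;                   // a scalar tail would be needed; not allowed at -Os/-Oz
}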
4780
4781
unsigned
4782
LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
4783
19.9k
                                                 unsigned ConstTripCount) {
4784
19.9k
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4785
19.9k
  unsigned SmallestType, WidestType;
4786
19.9k
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4787
19.9k
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4788
19.9k
4789
19.9k
  // Get the maximum safe dependence distance in bits computed by LAA.
4790
19.9k
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4791
19.9k
  // the memory accesses that is most restrictive (involved in the smallest
4792
19.9k
  // dependence distance).
4793
19.9k
  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4794
19.9k
4795
19.9k
  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4796
19.9k
4797
19.9k
  unsigned MaxVectorSize = WidestRegister / WidestType;
4798
19.9k
4799
19.9k
  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4800
19.9k
                    << " / " << WidestType << " bits.\n");
4801
19.9k
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4802
19.9k
                    << WidestRegister << " bits.\n");
4803
19.9k
4804
19.9k
  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4805
19.9k
                                 " into one vector!");
4806
19.9k
  if (MaxVectorSize == 0) {
4807
76
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4808
76
    MaxVectorSize = 1;
4809
76
    return MaxVectorSize;
4810
19.8k
  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4811
19.8k
             isPowerOf2_32(ConstTripCount)) {
4812
11
    // We need to clamp the VF to be the ConstTripCount. There is no point in
4813
11
    // choosing a higher viable VF as done in the loop below.
4814
11
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4815
11
                      << ConstTripCount << "\n");
4816
11
    MaxVectorSize = ConstTripCount;
4817
11
    return MaxVectorSize;
4818
11
  }
4819
19.8k
4820
19.8k
  unsigned MaxVF = MaxVectorSize;
4821
19.8k
  if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
4822
19.8k
      (MaximizeBandwidth && !OptForSize)) {
4823
2
    // Collect all viable vectorization factors larger than the default MaxVF
4824
2
    // (i.e. MaxVectorSize).
4825
2
    SmallVector<unsigned, 8> VFs;
4826
2
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4827
3
    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4828
1
      VFs.push_back(VS);
4829
2
4830
2
    // For each VF calculate its register usage.
4831
2
    auto RUs = calculateRegisterUsage(VFs);
4832
2
4833
2
    // Select the largest VF which doesn't require more registers than existing
4834
2
    // ones.
4835
2
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4836
2
    for (int i = RUs.size() - 1; i >= 0; --i) {
4837
1
      if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4838
1
        MaxVF = VFs[i];
4839
1
        break;
4840
1
      }
4841
1
    }
4842
2
    if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4843
2
      if (MaxVF < MinVF) {
4844
0
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4845
0
                          << ") with target's minimum: " << MinVF << '\n');
4846
0
        MaxVF = MinVF;
4847
0
      }
4848
2
    }
4849
2
  }
4850
19.8k
  return MaxVF;
4851
19.8k
}
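Editorial sketch: setting aside the bandwidth-maximization branch, the core of the computation above divides the usable register width by the widest element type and clamps to small power-of-two trip counts. A minimal standalone version under those assumptions; the bit-width parameters are hypothetical inputs replacing the TTI/LAA queries.

#include <algorithm>

static unsigned feasibleMaxVF(unsigned WidestRegisterBits, unsigned WidestTypeBits,
                              unsigned MaxSafeRegisterWidthBits,
                              unsigned ConstTripCount) {
  // The usable register width is further limited by the maximum safe
  // dependence distance computed by loop access analysis.
  unsigned WidestRegister = std::min(WidestRegisterBits, MaxSafeRegisterWidthBits);
  unsigned MaxVectorSize = WidestRegister / WidestTypeBits;
  if (MaxVectorSize == 0)
    return 1; // no vector registers on this target
  // Clamp to a small power-of-two constant trip count, as in the source.
  bool IsPow2 = (ConstTripCount & (ConstTripCount - 1)) == 0;
  if (ConstTripCount && ConstTripCount < MaxVectorSize && IsPow2)
    return ConstTripCount;
  return MaxVectorSize;
}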
4852
4853
VectorizationFactor
4854
19.1k
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4855
19.1k
  float Cost = expectedCost(1).first;
4856
19.1k
  const float ScalarCost = Cost;
4857
19.1k
  unsigned Width = 1;
4858
19.1k
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4859
19.1k
4860
19.1k
  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4861
19.1k
  if (ForceVectorization && MaxVF > 1) {
4862
17
    // Ignore scalar width, because the user explicitly wants vectorization.
4863
17
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4864
17
    // evaluation.
4865
17
    Cost = std::numeric_limits<float>::max();
4866
17
  }
4867
19.1k
4868
50.9k
  for (unsigned i = 2; i <= MaxVF; i *= 2) {
4869
31.7k
    // Notice that the vector loop needs to be executed less times, so
4870
31.7k
    // we need to divide the cost of the vector loops by the width of
4871
31.7k
    // the vector elements.
4872
31.7k
    VectorizationCostTy C = expectedCost(i);
4873
31.7k
    float VectorCost = C.first / (float)i;
4874
31.7k
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4875
31.7k
                      << " costs: " << (int)VectorCost << ".\n");
4876
31.7k
    if (!C.second && !ForceVectorization) {
4877
2.93k
      LLVM_DEBUG(
4878
2.93k
          dbgs() << "LV: Not considering vector loop of width " << i
4879
2.93k
                 << " because it will not generate any vector instructions.\n");
4880
2.93k
      continue;
4881
2.93k
    }
4882
28.8k
    if (VectorCost < Cost) {
4883
23.9k
      Cost = VectorCost;
4884
23.9k
      Width = i;
4885
23.9k
    }
4886
28.8k
  }
4887
19.1k
4888
19.1k
  if (!EnableCondStoresVectorization && NumPredStores) {
4889
0
    ORE->emit(createMissedAnalysis("ConditionalStore")
4890
0
              << "store that is conditionally executed prevents vectorization");
4891
0
    LLVM_DEBUG(
4892
0
        dbgs() << "LV: No vectorization. There are conditional stores.\n");
4893
0
    Width = 1;
4894
0
    Cost = ScalarCost;
4895
0
  }
4896
19.1k
4897
19.1k
  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4898
19.1k
             << "LV: Vectorization seems to be not beneficial, "
4899
19.1k
             << "but was forced by a user.\n");
4900
19.1k
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
4901
19.1k
  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
4902
19.1k
  return Factor;
4903
19.1k
}
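Editorial sketch: the selection above amortizes each candidate's cost over its width and keeps the cheapest, seeding the comparison with +infinity when vectorization is forced. The reduced loop below restates that; costOf is a placeholder for expectedCost, and the skip of widths that generate no vector instructions is deliberately omitted.

#include <functional>
#include <limits>

struct Factor { unsigned Width; float PerLaneCost; };

static Factor pickWidth(unsigned MaxVF, bool ForceVectorization,
                        const std::function<float(unsigned)> &costOf) {
  float ScalarCost = costOf(1);
  // Start from the scalar cost, or from +inf if the user forced vectorization
  // so that VF = 2 is chosen at a minimum.
  float Best = (ForceVectorization && MaxVF > 1)
                   ? std::numeric_limits<float>::max()
                   : ScalarCost;
  unsigned Width = 1;
  for (unsigned VF = 2; VF <= MaxVF; VF *= 2) {
    float PerLane = costOf(VF) / (float)VF; // amortize over the vector lanes
    if (PerLane < Best) {
      Best = PerLane;
      Width = VF;
    }
  }
  return {Width, Best};
}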
4904
4905
std::pair<unsigned, unsigned>
4906
19.9k
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4907
19.9k
  unsigned MinWidth = -1U;
4908
19.9k
  unsigned MaxWidth = 8;
4909
19.9k
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4910
19.9k
4911
19.9k
  // For each block.
4912
22.0k
  for (BasicBlock *BB : TheLoop->blocks()) {
4913
22.0k
    // For each instruction in the loop.
4914
251k
    for (Instruction &I : BB->instructionsWithoutDebug()) {
4915
251k
      Type *T = I.getType();
4916
251k
4917
251k
      // Skip ignored values.
4918
251k
      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
4919
9
        continue;
4920
251k
4921
251k
      // Only examine Loads, Stores and PHINodes.
4922
251k
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4923
183k
        continue;
4924
68.4k
4925
68.4k
      // Examine PHI nodes that are reduction variables. Update the type to
4926
68.4k
      // account for the recurrence type.
4927
68.4k
      if (auto *PN = dyn_cast<PHINode>(&I)) {
4928
27.9k
        if (!Legal->isReductionVariable(PN))
4929
24.4k
          continue;
4930
3.43k
        RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
4931
3.43k
        T = RdxDesc.getRecurrenceType();
4932
3.43k
      }
4933
68.4k
4934
68.4k
      // Examine the stored values.
4935
68.4k
      if (auto *ST = dyn_cast<StoreInst>(&I))
4936
20.7k
        T = ST->getValueOperand()->getType();
4937
43.9k
4938
43.9k
      // Ignore loaded pointer types and stored pointer types that are not
4939
43.9k
      // vectorizable.
4940
43.9k
      //
4941
43.9k
      // FIXME: The check here attempts to predict whether a load or store will
4942
43.9k
      //        be vectorized. We only know this for certain after a VF has
4943
43.9k
      //        been selected. Here, we assume that if an access can be
4944
43.9k
      //        vectorized, it will be. We should also look at extending this
4945
43.9k
      //        optimization to non-pointer types.
4946
43.9k
      //
4947
43.9k
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
4948
43.9k
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
4949
258
        continue;
4950
43.7k
4951
43.7k
      MinWidth = std::min(MinWidth,
4952
43.7k
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4953
43.7k
      MaxWidth = std::max(MaxWidth,
4954
43.7k
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4955
43.7k
    }
4956
22.0k
  }
4957
19.9k
4958
19.9k
  return {MinWidth, MaxWidth};
4959
19.9k
}
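Editorial sketch: the scan above reduces to a min/max over the element bit-widths of the interesting instructions (loads, stores, reduction phis). A trivial standalone version of that reduction with the same -1U / 8 initial values; the flat list of widths is an assumed input replacing the per-instruction type walk.

#include <algorithm>
#include <utility>
#include <vector>

static std::pair<unsigned, unsigned>
smallestAndWidestTypes(const std::vector<unsigned> &ElementBitWidths) {
  unsigned MinWidth = -1U; // "no type seen yet"
  unsigned MaxWidth = 8;   // never report narrower than a byte
  for (unsigned Bits : ElementBitWidths) {
    MinWidth = std::min(MinWidth, Bits);
    MaxWidth = std::max(MaxWidth, Bits);
  }
  return {MinWidth, MaxWidth};
}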
4960
4961
unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
4962
                                                           unsigned VF,
4963
19.8k
                                                           unsigned LoopCost) {
4964
19.8k
  // -- The interleave heuristics --
4965
19.8k
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
4966
19.8k
  // There are many micro-architectural considerations that we can't predict
4967
19.8k
  // at this level. For example, frontend pressure (on decode or fetch) due to
4968
19.8k
  // code size, or the number and capabilities of the execution ports.
4969
19.8k
  //
4970
19.8k
  // We use the following heuristics to select the interleave count:
4971
19.8k
  // 1. If the code has reductions, then we interleave to break the cross
4972
19.8k
  // iteration dependency.
4973
19.8k
  // 2. If the loop is really small, then we interleave to reduce the loop
4974
19.8k
  // overhead.
4975
19.8k
  // 3. We don't interleave if we think that we will spill registers to memory
4976
19.8k
  // due to the increased register pressure.
4977
19.8k
4978
19.8k
  // When we optimize for size, we don't interleave.
4979
19.8k
  if (OptForSize)
4980
111
    return 1;
4981
19.7k
4982
19.7k
  // We used the distance for the interleave count.
4983
19.7k
  if (Legal->getMaxSafeDepDistBytes() != -1U)
4984
140
    return 1;
4985
19.6k
4986
19.6k
  // Do not interleave loops with a relatively small trip count.
4987
19.6k
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4988
19.6k
  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
4989
496
    return 1;
4990
19.1k
4991
19.1k
  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
4992
19.1k
  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4993
19.1k
                    << " registers\n");
4994
19.1k
4995
19.1k
  if (VF == 1) {
4996
2.91k
    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4997
1
      TargetNumRegisters = ForceTargetNumScalarRegs;
4998
16.1k
  } else {
4999
16.1k
    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5000
0
      TargetNumRegisters = ForceTargetNumVectorRegs;
5001
16.1k
  }
5002
19.1k
5003
19.1k
  RegisterUsage R = calculateRegisterUsage({VF})[0];
5004
19.1k
  // We divide by these constants so assume that we have at least one
5005
19.1k
  // instruction that uses at least one register.
5006
19.1k
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
5007
19.1k
5008
19.1k
  // We calculate the interleave count using the following formula.
5009
19.1k
  // Subtract the number of loop invariants from the number of available
5010
19.1k
  // registers. These registers are used by all of the interleaved instances.
5011
19.1k
  // Next, divide the remaining registers by the number of registers that is
5012
19.1k
  // required by the loop, in order to estimate how many parallel instances
5013
19.1k
  // fit without causing spills. All of this is rounded down if necessary to be
5014
19.1k
  // a power of two. We want power of two interleave count to simplify any
5015
19.1k
  // addressing operations or alignment considerations.
5016
19.1k
  // We also want power of two interleave counts to ensure that the induction
5017
19.1k
  // variable of the vector loop wraps to zero, when tail is folded by masking;
5018
19.1k
  // this currently happens when OptForSize, in which case IC is set to 1 above.
5019
19.1k
  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
5020
19.1k
                              R.MaxLocalUsers);
5021
19.1k
5022
19.1k
  // Don't count the induction variable as interleaved.
5023
19.1k
  if (EnableIndVarRegisterHeur)
5024
19.1k
    IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
5025
19.1k
                       std::max(1U, (R.MaxLocalUsers - 1)));
5026
19.1k
5027
19.1k
  // Clamp the interleave ranges to reasonable counts.
5028
19.1k
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5029
19.1k
5030
19.1k
  // Check if the user has overridden the max.
5031
19.1k
  if (VF == 1) {
5032
2.91k
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)