Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

cl::opt<bool>
    llvm::RunSLPVectorization("vectorize-slp", cl::init(false), cl::Hidden,
                              cl::desc("Run the SLP vectorization passes"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important things to filter here are types which are invalid in
/// LLVM vectors. We also filter target-specific types which have absolutely no
/// meaningful vectorization path, such as x86_fp80 and ppc_fp128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  BasicBlock *BB = I0->getParent();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I)
      return false;

    if (BB != I->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
  for (Value *i : VL)
    if (!isa<Constant>(i))
      return false;
  return true;
}

/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
  for (unsigned i = 1, e = VL.size(); i < e; ++i)
    if (VL[i] != VL[0])
      return false;
  return true;
}

/// \returns True if \p I is commutative; handles CmpInst as well as
/// Instruction.
static bool isCommutative(Instruction *I) {
  if (auto *IC = dyn_cast<CmpInst>(I))
    return IC->isCommutative();
  return I->isCommutative();
}

/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
///                                                         i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// We convert this initially to something like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
/// %5 = mul <4 x i8> %4, %4
/// %6 = extractelement <4 x i8> %5, i32 0
/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
/// %7 = extractelement <4 x i8> %5, i32 1
/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
/// %8 = extractelement <4 x i8> %5, i32 2
/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
/// %9 = extractelement <4 x i8> %5, i32 3
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul
/// TODO: Can we split off and reuse the shuffle mask detection from
/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
  auto *EI0 = cast<ExtractElementInst>(VL[0]);
  unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = cast<ExtractElementInst>(VL[I]);
    auto *Vec = EI->getVectorOperand();
    // All vector operands must have the same number of vector elements.
    if (Vec->getType()->getVectorNumElements() != Size)
      return None;
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    if (!Idx)
      return None;
    // Undefined behavior if Idx is negative or >= Size.
    if (Idx->getValue().uge(Size))
      continue;
    unsigned IntIdx = Idx->getValue().getZExtValue();
    // We can extractelement from undef vector.
    if (isa<UndefValue>(Vec))
      continue;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec)
      Vec1 = Vec;
    else if (!Vec2 || Vec2 == Vec)
      Vec2 = Vec;
    else
      return None;
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (IntIdx != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector, otherwise
  // we have permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
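
// Illustrative classification (hypothetical vectors %x and %y with matching
// element counts):
//   VL = {x[0], y[1], y[2], x[3]} -> SK_Select           (two sources, each
//                                                         index equals its lane)
//   VL = {x[1], x[0]}             -> SK_PermuteSingleSrc (one source, reordered)
//   VL = {x[1], y[0]}             -> SK_PermuteTwoSrc    (two sources, reordered)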

namespace {

/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main/alternate instruction.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const {
    return MainOp ? MainOp->getOpcode() : 0;
  }

  unsigned getAltOpcode() const {
    return AltOp ? AltOp->getOpcode() : 0;
  }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};

} // end anonymous namespace

/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
/// OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
    return Op;
  return S.OpValue;
}

/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState: the opcode under which we suppose the whole list
/// could be vectorized, even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       unsigned BaseIndex = 0) {
  // Make sure these are all Instructions.
  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
      Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
      Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      continue;
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}
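
// Illustrative behavior on hypothetical bundles of BinaryOperators:
//   VL = {add, sub, add, sub} -> MainOp is the first add, AltOp the first sub
//                                (an alternating-opcode bundle).
//   VL = {add, sub, mul}      -> a third distinct opcode appears, so the
//                                returned state has null MainOp/AltOp and
//                                getOpcode() == 0 (not vectorizable as one
//                                node).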

/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return false;

  return true;
}

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static Optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return None;
    return CI->getZExtValue();
  }
  ExtractValueInst *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return None;
  return *EI->idx_begin();
}

/// \returns True if an in-tree use also needs to be extracted. This refers to
/// a possible scalar operand in a vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
      if (hasVectorInstrinsicScalarOpd(ID, i))
        return (CI->getArgOperand(i) == Scalar);
    }
    LLVM_FALLTHROUGH;
  }
  default:
    return false;
  }
}
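
// Illustrative case: the wide store produced for a vectorized store bundle
// still takes a scalar pointer operand, so a tree scalar used as a store's
// pointer operand must be extracted; a scalar feeding the stored value lives
// inside the vector and needs no extract.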

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

namespace llvm {

namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;

public:
  using ValueList = SmallVector<Value *, 8>;
  using InstrList = SmallVector<Instruction *, 16>;
  using ValueSet = SmallPtrSet<Value *, 16>;
  using StoreList = SmallVector<StoreInst *, 8>;
  using ExtraValueToDebugLocsMap =
      MapVector<Value *, SmallVector<Instruction *, 2>>;

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
        DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    //       data type rather than just register size. For example, x86 AVX has
    //       256-bit registers, but it does not support integer operations
    //       at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize = TTI->getRegisterBitWidth(true);

    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }

  /// Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
  /// generated extractvalue instructions.
  Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  int getSpillCost() const;

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  int getTreeCost();

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 ArrayRef<Value *> UserIgnoreLst = None);

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst, taking
  /// into account (and updating it, if required) the list of externally used
  /// values stored in \p ExternallyUsedValues.
  void buildTree(ArrayRef<Value *> Roots,
                 ExtraValueToDebugLocsMap &ExternallyUsedValues,
                 ArrayRef<Value *> UserIgnoreLst = None);

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    ExternalUses.clear();
    NumOpsWantToKeepOrder.clear();
    NumOpsWantToKeepOriginalOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
  }

  unsigned getTreeSize() const { return VectorizableTree.size(); }

  /// Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// \returns The best order of instructions for vectorization.
  Optional<ArrayRef<unsigned>> bestOrder() const {
    auto I = std::max_element(
        NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
        [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
           const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
          return D1.second < D2.second;
        });
    if (I == NumOpsWantToKeepOrder.end() ||
        I->getSecond() <= NumOpsWantToKeepOriginalOrder)
      return None;

    return makeArrayRef(I->getFirst());
  }
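
  // Illustrative reading: if four bundles voted for the order {1, 0, 3, 2}
  // (NumOpsWantToKeepOrder[{1,0,3,2}] == 4) while NumOpsWantToKeepOriginalOrder
  // is 2, bestOrder() returns {1, 0, 3, 2}; with the counts reversed it
  // returns None and the original order is kept.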

  /// \return The vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V. If V is a store, the size is the width of
  /// the stored value. Otherwise, the size is the width of the largest loaded
  /// value reaching V. This method is used by the vectorizer to calculate
  /// vectorization factors.
  unsigned getVectorElementSize(Value *V) const;

  /// Compute the minimum type sizes required to represent the entries in a
  /// vectorizable tree.
  void computeMinimumValueSizes();

  // \returns maximum vector register size as set by TTI or overridden by
  // cl::opt.
  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  // \returns minimum vector register size as set by cl::opt.
  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  /// Check if ArrayType or StructType is isomorphic to some VectorType.
  ///
  /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
  unsigned canMapToVector(Type *T, const DataLayout &DL) const;

  /// \returns True if the VectorizableTree is both tiny and not fully
  /// vectorizable. We do not vectorize such trees.
  bool isTreeTinyAndNotFullyVectorizable() const;

  OptimizationRemarkEmitter *getORE() { return ORE; }

  /// This structure holds any data we need about the edges being traversed
  /// during buildTree_rec(). We keep track of:
  /// (i) the user TreeEntry index, and
  /// (ii) the index of the edge.
  struct EdgeInfo {
    EdgeInfo() = default;
    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
    friend inline raw_ostream &operator<<(raw_ostream &OS,
                                          const BoUpSLP::EdgeInfo &EI) {
      EI.dump(OS);
      return OS;
    }
    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
  };

  /// A helper data structure to hold the operands of a vector of instructions.
  /// This supports a fixed vector length for all operand vectors.
  class VLOperands {
    /// For each operand we need (i) the value, and (ii) the opcode that it
    /// would be attached to if the expression was in a left-linearized form.
    /// This is required to avoid illegal operand reordering.
    /// For example:
    /// \verbatim
    ///                         0 Op1
    ///                         |/
    /// Op1 Op2   Linearized    + Op2
    ///   \ /     ---------->   |/
    ///    -                    -
    ///
    /// Op1 - Op2            (0 + Op1) - Op2
    /// \endverbatim
    ///
    /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
    ///
    /// Another way to think of this is to track all the operations across the
    /// path from the operand all the way to the root of the tree and to
    /// calculate the operation that corresponds to this path. For example, the
    /// path from Op2 to the root crosses the RHS of the '-', therefore the
    /// corresponding operation is a '-' (which matches the one in the
    /// linearized tree, as shown above).
    ///
    /// For lack of a better term, we refer to this operation as Accumulated
    /// Path Operation (APO).
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (e.g., +, -). Therefore, we can safely use a boolean value for
      /// the APO. It is set to 'true' if 'V' is attached to an inverse
      /// operation in the left-linearized form (e.g., Sub/Div), and 'false'
      /// otherwise (e.g., Add/Mul).
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };
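
    // Illustrative APO assignment for a hypothetical two-lane bundle:
    //   Lane 0: a0 - b0  ->  operand 0 (a0): APO = false; operand 1 (b0): APO = true
    //   Lane 1: a1 + b1  ->  operand 0 (a1): APO = false; operand 1 (b1): APO = false
    // Only operands with equal APOs may trade places during reordering.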
689
690
    /// During operand reordering, we are trying to select the operand at lane
691
    /// that matches best with the operand at the neighboring lane. Our
692
    /// selection is based on the type of value we are looking for. For example,
693
    /// if the neighboring lane has a load, we need to look for a load that is
694
    /// accessing a consecutive address. These strategies are summarized in the
695
    /// 'ReorderingMode' enumerator.
696
    enum class ReorderingMode {
697
      Load,     ///< Matching loads to consecutive memory addresses
698
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
699
      Constant, ///< Matching constants
700
      Splat,    ///< Matching the same instruction multiple times (broadcast)
701
      Failed,   ///< We failed to create a vectorizable group
702
    };
703
704
    using OperandDataVec = SmallVector<OperandData, 2>;
705
706
    /// A vector of operand vectors.
707
    SmallVector<OperandDataVec, 4> OpsVec;
708
709
    const DataLayout &DL;
710
    ScalarEvolution &SE;
711
712
    /// \returns the operand data at \p OpIdx and \p Lane.
713
4.21M
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
714
4.21M
      return OpsVec[OpIdx][Lane];
715
4.21M
    }
716
717
    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
718
1.34M
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
719
1.34M
      return OpsVec[OpIdx][Lane];
720
1.34M
    }
721
722
    /// Clears the used flag for all entries.
723
353k
    void clearUsed() {
724
353k
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
725
1.06M
           OpIdx != NumOperands; 
++OpIdx707k
)
726
2.23M
        
for (unsigned Lane = 0, NumLanes = getNumLanes(); 707k
Lane != NumLanes;
727
1.52M
             ++Lane)
728
1.52M
          OpsVec[OpIdx][Lane].IsUsed = false;
729
353k
    }
730
731
    /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
732
358k
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
733
358k
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
734
358k
    }
735
736
    // Search all operands in Ops[*][Lane] for the one that matches best
737
    // Ops[OpIdx][LastLane] and return its opreand index.
738
    // If no good match can be found, return None.
739
    Optional<unsigned>
740
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
741
821k
                   ArrayRef<ReorderingMode> ReorderingModes) {
742
821k
      unsigned NumOperands = getNumOperands();
743
821k
744
821k
      // The operand of the previous lane at OpIdx.
745
821k
      Value *OpLastLane = getData(OpIdx, LastLane).V;
746
821k
747
821k
      // Our strategy mode for OpIdx.
748
821k
      ReorderingMode RMode = ReorderingModes[OpIdx];
749
821k
750
821k
      // The linearized opcode of the operand at OpIdx, Lane.
751
821k
      bool OpIdxAPO = getData(OpIdx, Lane).APO;
752
821k
753
821k
      const unsigned BestScore = 2;
754
821k
      const unsigned GoodScore = 1;
755
821k
756
821k
      // The best operand index and its score.
757
821k
      // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
758
821k
      // are using the score to differentiate between the two.
759
821k
      struct BestOpData {
760
821k
        Optional<unsigned> Idx = None;
761
821k
        unsigned Score = 0;
762
821k
      } BestOp;
763
821k
764
821k
      // Iterate through all unused operands and look for the best.
765
2.01M
      for (unsigned Idx = 0; Idx != NumOperands; 
++Idx1.19M
) {
766
1.45M
        // Get the operand at Idx and Lane.
767
1.45M
        OperandData &OpData = getData(Idx, Lane);
768
1.45M
        Value *Op = OpData.V;
769
1.45M
        bool OpAPO = OpData.APO;
770
1.45M
771
1.45M
        // Skip already selected operands.
772
1.45M
        if (OpData.IsUsed)
773
189k
          continue;
774
1.26M
775
1.26M
        // Skip if we are trying to move the operand to a position with a
776
1.26M
        // different opcode in the linearized tree form. This would break the
777
1.26M
        // semantics.
778
1.26M
        if (OpAPO != OpIdxAPO)
779
30.8k
          continue;
780
1.23M
781
1.23M
        // Look for an operand that matches the current mode.
782
1.23M
        switch (RMode) {
783
1.23M
        case ReorderingMode::Load:
784
176k
          if (isa<LoadInst>(Op)) {
785
113k
            // Figure out which is left and right, so that we can check for
786
113k
            // consecutive loads
787
113k
            bool LeftToRight = Lane > LastLane;
788
113k
            Value *OpLeft = (LeftToRight) ? 
OpLastLane108k
:
Op4.42k
;
789
113k
            Value *OpRight = (LeftToRight) ? 
Op108k
:
OpLastLane4.42k
;
790
113k
            if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
791
113k
                                    cast<LoadInst>(OpRight), DL, SE))
792
34.6k
              BestOp.Idx = Idx;
793
113k
          }
794
176k
          break;
795
1.23M
        case ReorderingMode::Opcode:
796
543k
          // We accept both Instructions and Undefs, but with different scores.
797
543k
          if ((isa<Instruction>(Op) && 
isa<Instruction>(OpLastLane)452k
&&
798
543k
               cast<Instruction>(Op)->getOpcode() ==
799
452k
                   cast<Instruction>(OpLastLane)->getOpcode()) ||
800
543k
              
(310k
isa<UndefValue>(OpLastLane)310k
&&
isa<Instruction>(Op)4
) ||
801
543k
              
isa<UndefValue>(Op)310k
) {
802
233k
            // An instruction has a higher score than an undef.
803
233k
            unsigned Score = (isa<UndefValue>(Op)) ? 
GoodScore28
:
BestScore233k
;
804
233k
            if (Score > BestOp.Score) {
805
207k
              BestOp.Idx = Idx;
806
207k
              BestOp.Score = Score;
807
207k
            }
808
233k
          }
809
543k
          break;
810
1.23M
        case ReorderingMode::Constant:
811
176k
          if (isa<Constant>(Op)) {
812
80.4k
            unsigned Score = (isa<UndefValue>(Op)) ? 
GoodScore69
:
BestScore80.4k
;
813
80.4k
            if (Score > BestOp.Score) {
814
80.4k
              BestOp.Idx = Idx;
815
80.4k
              BestOp.Score = Score;
816
80.4k
            }
817
80.4k
          }
818
176k
          break;
819
1.23M
        case ReorderingMode::Splat:
820
79.8k
          if (Op == OpLastLane)
821
37.3k
            BestOp.Idx = Idx;
822
79.8k
          break;
823
1.23M
        case ReorderingMode::Failed:
824
255k
          return None;
825
1.23M
        }
826
1.23M
      }
827
821k
828
821k
      
if (565k
BestOp.Idx565k
) {
829
358k
        getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
830
358k
        return BestOp.Idx;
831
358k
      }
832
207k
      // If we could not find a good match return None.
833
207k
      return None;
834
207k
    }
835
836
    /// Helper for reorderOperandVecs. \Returns the lane that we should start
837
    /// reordering from. This is the one which has the least number of operands
838
    /// that can freely move about.
839
211k
    unsigned getBestLaneToStartReordering() const {
840
211k
      unsigned BestLane = 0;
841
211k
      unsigned Min = UINT_MAX;
842
672k
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
843
461k
           ++Lane) {
844
461k
        unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
845
461k
        if (NumFreeOps < Min) {
846
239k
          Min = NumFreeOps;
847
239k
          BestLane = Lane;
848
239k
        }
849
461k
      }
850
211k
      return BestLane;
851
211k
    }
852
853
    /// \Returns the maximum number of operands that are allowed to be reordered
854
    /// for \p Lane. This is used as a heuristic for selecting the first lane to
855
    /// start operand reordering.
856
461k
    unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
857
461k
      unsigned CntTrue = 0;
858
461k
      unsigned NumOperands = getNumOperands();
859
461k
      // Operands with the same APO can be reordered. We therefore need to count
860
461k
      // how many of them we have for each APO, like this: Cnt[APO] = x.
861
461k
      // Since we only have two APOs, namely true and false, we can avoid using
862
461k
      // a map. Instead we can simply count the number of operands that
863
461k
      // correspond to one of them (in this case the 'true' APO), and calculate
864
461k
      // the other by subtracting it from the total number of operands.
865
1.38M
      for (unsigned OpIdx = 0; OpIdx != NumOperands; 
++OpIdx922k
)
866
922k
        if (getData(OpIdx, Lane).APO)
867
78.3k
          ++CntTrue;
868
461k
      unsigned CntFalse = NumOperands - CntTrue;
869
461k
      return std::max(CntTrue, CntFalse);
870
461k
    }
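
    // Worked example (hypothetical lanes): for 'a - b' the APOs are
    // {false, true}, so CntTrue == 1, CntFalse == 1 and the result is 1
    // (nothing can swap); for 'a + b' both APOs are false, giving 2.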

    /// Go through the instructions in VL and append their operands.
    void appendOperandsOfVL(ArrayRef<Value *> VL) {
      assert(!VL.empty() && "Bad VL");
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      assert(isa<Instruction>(VL[0]) && "Expected instruction");
      unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
          // Our tree has just 3 nodes: the root and two operands.
          // It is therefore trivial to get the APO. We only need to check the
          // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
          // RHS operand. The LHS operand of both add and sub is never attached
          // to an inverse operation in the linearized form, therefore its APO
          // is false. The RHS is true only if VL[Lane] is an inverse operation.

          // Since operand reordering is performed on groups of commutative
          // operations or alternating sequences (e.g., +, -), we can safely
          // tell the inverse operations by checking commutativity.
          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                                 APO, false};
        }
      }
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return OpsVec.size(); }

    /// \returns the number of lanes.
    unsigned getNumLanes() const { return OpsVec[0].size(); }

    /// \returns the operand value at \p OpIdx and \p Lane.
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    /// \returns true if the data structure is empty.
    bool empty() const { return OpsVec.empty(); }

    /// Clears the data.
    void clear() { OpsVec.clear(); }

    /// \Returns true if there are enough operands identical to \p Op to fill
    /// the whole vector.
    /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // This is set to true if we found a candidate for broadcast at Lane.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          if (Data.V == Op) {
            FoundCandidate = true;
            Data.IsUsed = true;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      return true;
    }
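
    // Illustrative case: for the lanes {x + a, x + b, x + c, x + d},
    // shouldBroadcast(x, 0, 0) finds an unused copy of 'x' in every other
    // lane and returns true, so 'x' is a good candidate for a splat.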

  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
               ScalarEvolution &SE)
        : DL(DL), SE(SE) {
      // Append all the operands of RootVL.
      appendOperandsOfVL(RootVL);
    }

    /// \Returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
    ValueList getVL(unsigned OpIdx) const {
      ValueList OpVL(OpsVec[OpIdx].size());
      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      return OpVL;
    }

    // Performs operand reordering for 2 or more operands.
    // The original operands are in OrigOps[OpIdx][Lane].
    // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode. We are using this mode to help us select
      // the instructions for each lane, so that they match best with the ones
      // we have selected so far.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm. We are going over each lane
      // once and deciding on the best order right away with no back-tracking.
      // However, in order to increase its effectiveness, we start with the lane
      // that has operands that can move the least. For example, given the
      // following lanes:
      //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
      //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
      //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
      //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
      // we will start at Lane 1, since the operands of the subtraction cannot
      // be reordered. Then we will visit the rest of the lanes in a circular
      // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.

      // Find the first lane that we will start our search from.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track if we have instructions with all the same opcode on one
        // side.
        if (isa<LoadInst>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Load;
        else if (isa<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        }
        else if (isa<Constant>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        else if (isa<Argument>(OpLane0))
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        else
          // NOTE: This should be unreachable.
          ReorderingModes[OpIdx] = ReorderingMode::Failed;
      }

      // If the initial strategy fails for any of the operand indexes, then we
      // perform reordering again in a second pass. This helps avoid assigning
      // high priority to the failed strategy, and should improve reordering for
      // the non-failed operand indexes.
      for (int Pass = 0; Pass != 2; ++Pass) {
        // Skip the second pass if the first pass did not fail.
        bool StrategyFailed = false;
        // Mark all operand data as free to use.
        clearUsed();
        // We keep the original operand order for the FirstLane, so reorder the
        // rest of the lanes. We are visiting the nodes in a circular fashion,
        // using FirstLane as the center point and increasing the radius
        // distance.
        for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
          // Visit the lane on the right and then the lane on the left.
          for (int Direction : {+1, -1}) {
            int Lane = FirstLane + Direction * Distance;
            if (Lane < 0 || Lane >= (int)NumLanes)
              continue;
            int LastLane = Lane - Direction;
            assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                   "Out of bounds");
            // Look for a good match for each operand.
            for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
              // Search for the operand that matches SortedOps[OpIdx][Lane-1].
              Optional<unsigned> BestIdx =
                  getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
              // By not selecting a value, we allow the operands that follow to
              // select a better matching value. We will get a non-null value in
              // the next run of getBestOperand().
              if (BestIdx) {
                // Swap the current operand with the one returned by
                // getBestOperand().
                swap(OpIdx, BestIdx.getValue(), Lane);
              } else {
                // We failed to find a best operand, set mode to 'Failed'.
                ReorderingModes[OpIdx] = ReorderingMode::Failed;
                // Enable the second pass.
                StrategyFailed = true;
              }
            }
          }
        }
        // Skip second pass if the strategy did not fail.
        if (!StrategyFailed)
          break;
      }
    }
1066
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1067
    LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
1068
      switch (RMode) {
1069
      case ReorderingMode::Load:
1070
        return "Load";
1071
      case ReorderingMode::Opcode:
1072
        return "Opcode";
1073
      case ReorderingMode::Constant:
1074
        return "Constant";
1075
      case ReorderingMode::Splat:
1076
        return "Splat";
1077
      case ReorderingMode::Failed:
1078
        return "Failed";
1079
      }
1080
      llvm_unreachable("Unimplemented Reordering Type");
1081
    }
1082
1083
    LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
1084
                                                   raw_ostream &OS) {
1085
      return OS << getModeStr(RMode);
1086
    }
1087
1088
    /// Debug print.
1089
    LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
1090
      printMode(RMode, dbgs());
1091
    }
1092
1093
    friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
1094
      return printMode(RMode, OS);
1095
    }
1096
1097
    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
1098
      const unsigned Indent = 2;
1099
      unsigned Cnt = 0;
1100
      for (const OperandDataVec &OpDataVec : OpsVec) {
1101
        OS << "Operand " << Cnt++ << "\n";
1102
        for (const OperandData &OpData : OpDataVec) {
1103
          OS.indent(Indent) << "{";
1104
          if (Value *V = OpData.V)
1105
            OS << *V;
1106
          else
1107
            OS << "null";
1108
          OS << ", APO:" << OpData.APO << "}\n";
1109
        }
1110
        OS << "\n";
1111
      }
1112
      return OS;
1113
    }
1114
1115
    /// Debug print.
1116
    LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
1117
#endif
1118
  };
1119
1120
private:
1121
  /// Checks if all users of \p I are the part of the vectorization tree.
1122
  bool areAllUsersVectorized(Instruction *I) const;
1123
1124
  /// \returns the cost of the vectorizable entry.
1125
  int getEntryCost(TreeEntry *E);
1126
1127
  /// This is the recursive part of buildTree.
1128
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
1129
                     const EdgeInfo &EI);
1130
1131
  /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
1132
  /// be vectorized to use the original vector (or aggregate "bitcast" to a
1133
  /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
1134
  /// returns false, setting \p CurrentOrder to either an empty vector or a
1135
  /// non-identity permutation that allows to reuse extract instructions.
1136
  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
1137
                       SmallVectorImpl<unsigned> &CurrentOrder) const;
1138
1139
  /// Vectorize a single entry in the tree.
1140
  Value *vectorizeTree(TreeEntry *E);
1141
1142
  /// Vectorize a single entry in the tree, starting in \p VL.
1143
  Value *vectorizeTree(ArrayRef<Value *> VL);
1144
1145
  /// \returns the scalarization cost for this type. Scalarization in this
1146
  /// context means the creation of vectors from a group of scalars.
1147
  int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const;
1148
1149
  /// \returns the scalarization cost for this list of values. Assuming that
1150
  /// this subtree gets vectorized, we may need to extract the values from the
1151
  /// roots. This method calculates the cost of extracting the values.
1152
  int getGatherCost(ArrayRef<Value *> VL) const;
1153
1154
  /// Set the Builder insert point to one after the last instruction in
1155
  /// the bundle
1156
  void setInsertPointAfterBundle(ArrayRef<Value *> VL,
1157
                                 const InstructionsState &S);
1158
1159
  /// \returns a vector from a collection of scalars in \p VL.
1160
  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
1161
1162
  /// \returns whether the VectorizableTree is fully vectorizable and will
1163
  /// be beneficial even the tree height is tiny.
1164
  bool isFullyVectorizableTinyTree() const;
1165
1166
  /// Reorder commutative or alt operands to get better probability of
1167
  /// generating vectorized code.
1168
  static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
1169
                                             SmallVectorImpl<Value *> &Left,
1170
                                             SmallVectorImpl<Value *> &Right,
1171
                                             const DataLayout &DL,
1172
                                             ScalarEvolution &SE);
1173
  struct TreeEntry {
1174
    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
1175
1.78M
    TreeEntry(VecTreeTy &Container) : Container(Container) {}
1176
1177
    /// \returns true if the scalars in VL are equal to this entry.
1178
1.81M
    bool isSame(ArrayRef<Value *> VL) const {
1179
1.81M
      if (VL.size() == Scalars.size())
1180
1.80M
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
1181
8.59k
      return VL.size() == ReuseShuffleIndices.size() &&
1182
8.59k
             std::equal(
1183
1.01k
                 VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
1184
1.23k
                 [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
1185
8.59k
    }
1186
1187
    /// A vector of scalars.
1188
    ValueList Scalars;
1189
1190
    /// The Scalars are vectorized into this value. It is initialized to Null.
1191
    Value *VectorizedValue = nullptr;
1192
1193
    /// Do we need to gather this sequence ?
1194
    bool NeedToGather = false;
1195
1196
    /// Does this sequence require some shuffling?
1197
    SmallVector<unsigned, 4> ReuseShuffleIndices;
1198
1199
    /// Does this entry require reordering?
1200
    ArrayRef<unsigned> ReorderIndices;
1201
1202
    /// Points back to the VectorizableTree.
1203
    ///
1204
    /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
1205
    /// to be a pointer and needs to be able to initialize the child iterator.
1206
    /// Thus we need a reference back to the container to translate the indices
1207
    /// to entries.
1208
    VecTreeTy &Container;
1209
1210
    /// The TreeEntry index containing the user of this entry.  We can actually
1211
    /// have multiple users so the data structure is not truly a tree.
1212
    SmallVector<EdgeInfo, 1> UserTreeIndices;
1213
1214
    /// The index of this treeEntry in VectorizableTree.
1215
    int Idx = -1;
1216
1217
  private:
1218
    /// The operands of each instruction in each lane Operands[op_index][lane].
1219
    /// Note: This helps avoid the replication of the code that performs the
1220
    /// reordering of operands during buildTree_rec() and vectorizeTree().
1221
    SmallVector<ValueList, 2> Operands;
1222
1223
  public:
1224
    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
1225
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL,
1226
1.31M
                    ArrayRef<unsigned> ReuseShuffleIndices) {
1227
1.31M
      if (Operands.size() < OpIdx + 1)
1228
1.31M
        Operands.resize(OpIdx + 1);
1229
1.31M
      assert(Operands[OpIdx].size() == 0 && "Already resized?");
1230
1.31M
      Operands[OpIdx].resize(Scalars.size());
1231
4.52M
      for (unsigned Lane = 0, E = Scalars.size(); Lane != E; 
++Lane3.20M
)
1232
3.20M
        Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty())
1233
3.20M
                                    ? OpVL[ReuseShuffleIndices[Lane]]
1234
3.20M
                                    : OpVL[Lane];
1235
1.31M
    }
1236
1237
    /// If there is a user TreeEntry, then set its operand.
1238
    void trySetUserTEOperand(const EdgeInfo &UserTreeIdx,
1239
                             ArrayRef<Value *> OpVL,
1240
1.94M
                             ArrayRef<unsigned> ReuseShuffleIndices) {
1241
1.94M
      if (UserTreeIdx.UserTE)
1242
1.31M
        UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, OpVL,
1243
1.31M
                                       ReuseShuffleIndices);
1244
1.94M
    }
1245
1246
    /// \returns the \p OpIdx operand of this TreeEntry.
1247
63.8k
    ValueList &getOperand(unsigned OpIdx) {
1248
63.8k
      assert(OpIdx < Operands.size() && "Off bounds");
1249
63.8k
      return Operands[OpIdx];
1250
63.8k
    }
1251
1252
    /// \return the single \p OpIdx operand.
1253
2.18k
    Value *getSingleOperand(unsigned OpIdx) const {
1254
2.18k
      assert(OpIdx < Operands.size() && "Off bounds");
1255
2.18k
      assert(!Operands[OpIdx].empty() && "No operand available");
1256
2.18k
      return Operands[OpIdx][0];
1257
2.18k
    }
1258
1259
#ifndef NDEBUG
1260
    /// Debug printer.
1261
    LLVM_DUMP_METHOD void dump() const {
1262
      dbgs() << Idx << ".\n";
1263
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
1264
        dbgs() << "Operand " << OpI << ":\n";
1265
        for (const Value *V : Operands[OpI])
1266
          dbgs().indent(2) << *V << "\n";
1267
      }
1268
      dbgs() << "Scalars: \n";
1269
      for (Value *V : Scalars)
1270
        dbgs().indent(2) << *V << "\n";
1271
      dbgs() << "NeedToGather: " << NeedToGather << "\n";
1272
      dbgs() << "VectorizedValue: ";
1273
      if (VectorizedValue)
1274
        dbgs() << *VectorizedValue;
1275
      else
1276
        dbgs() << "NULL";
1277
      dbgs() << "\n";
1278
      dbgs() << "ReuseShuffleIndices: ";
1279
      if (ReuseShuffleIndices.empty())
1280
        dbgs() << "Emtpy";
1281
      else
1282
        for (unsigned Idx : ReuseShuffleIndices)
1283
          dbgs() << Idx << ", ";
1284
      dbgs() << "\n";
1285
      dbgs() << "ReorderIndices: ";
1286
      for (unsigned Idx : ReorderIndices)
1287
        dbgs() << Idx << ", ";
1288
      dbgs() << "\n";
1289
      dbgs() << "UserTreeIndices: ";
1290
      for (const auto &EInfo : UserTreeIndices)
1291
        dbgs() << EInfo << ", ";
1292
      dbgs() << "\n";
1293
    }
1294
#endif
1295
  };
1296
1297
  /// Create a new VectorizableTree entry.
1298
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
1299
                          const EdgeInfo &UserTreeIdx,
1300
                          ArrayRef<unsigned> ReuseShuffleIndices = None,
1301
1.78M
                          ArrayRef<unsigned> ReorderIndices = None) {
1302
1.78M
    VectorizableTree.push_back(llvm::make_unique<TreeEntry>(VectorizableTree));
1303
1.78M
    TreeEntry *Last = VectorizableTree.back().get();
1304
1.78M
    Last->Idx = VectorizableTree.size() - 1;
1305
1.78M
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
1306
1.78M
    Last->NeedToGather = !Vectorized;
1307
1.78M
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
1308
1.78M
                                     ReuseShuffleIndices.end());
1309
1.78M
    Last->ReorderIndices = ReorderIndices;
1310
1.78M
    if (Vectorized) {
1311
2.73M
      for (int i = 0, e = VL.size(); i != e; ++i) {
1312
1.94M
        assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
1313
1.94M
        ScalarToTreeEntry[VL[i]] = Last->Idx;
1314
1.94M
      }
1315
992k
    } else {
1316
992k
      MustGather.insert(VL.begin(), VL.end());
1317
992k
    }
1318
1.78M
1319
1.78M
    if (UserTreeIdx.UserTE)
1320
1.15M
      Last->UserTreeIndices.push_back(UserTreeIdx);
1321
1.78M
1322
1.78M
    Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices);
1323
1.78M
    return Last;
1324
1.78M
  }
1325
1326
  /// -- Vectorization State --
1327
  /// Holds all of the tree entries.
1328
  TreeEntry::VecTreeTy VectorizableTree;
1329
1330
#ifndef NDEBUG
1331
  /// Debug printer.
1332
  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
1333
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
1334
      VectorizableTree[Id]->dump();
1335
      dbgs() << "\n";
1336
    }
1337
  }
1338
#endif
1339
1340
7.09M
  TreeEntry *getTreeEntry(Value *V) {
1341
7.09M
    auto I = ScalarToTreeEntry.find(V);
1342
7.09M
    if (I != ScalarToTreeEntry.end())
1343
1.66M
      return VectorizableTree[I->second].get();
1344
5.42M
    return nullptr;
1345
5.42M
  }
1346
1347
1.37M
  const TreeEntry *getTreeEntry(Value *V) const {
1348
1.37M
    auto I = ScalarToTreeEntry.find(V);
1349
1.37M
    if (I != ScalarToTreeEntry.end())
1350
582k
      return VectorizableTree[I->second].get();
1351
797k
    return nullptr;
1352
797k
  }
1353
1354
  /// Maps a specific scalar to its tree entry.
1355
  SmallDenseMap<Value*, int> ScalarToTreeEntry;
1356
1357
  /// A list of scalars that we found that we need to keep as scalars.
1358
  ValueSet MustGather;
1359
1360
  /// This POD struct describes one external user in the vectorized tree.
1361
  struct ExternalUser {
1362
    ExternalUser(Value *S, llvm::User *U, int L)
1363
1.64M
        : Scalar(S), User(U), Lane(L) {}
1364
1365
    // Which scalar in our function.
1366
    Value *Scalar;
1367
1368
    // Which user that uses the scalar.
1369
    llvm::User *User;
1370
1371
    // Which lane does the scalar belong to.
1372
    int Lane;
1373
  };
1374
  using UserList = SmallVector<ExternalUser, 16>;
1375
1376
  /// Checks if two instructions may access the same memory.
1377
  ///
1378
  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
1379
  /// is invariant in the calling loop.
1380
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
1381
3.27M
                 Instruction *Inst2) {
1382
3.27M
    // First check if the result is already in the cache.
1383
3.27M
    AliasCacheKey key = std::make_pair(Inst1, Inst2);
1384
3.27M
    Optional<bool> &result = AliasCache[key];
1385
3.27M
    if (result.hasValue()) {
1386
1.45M
      return result.getValue();
1387
1.45M
    }
1388
1.82M
    MemoryLocation Loc2 = getLocation(Inst2, AA);
1389
1.82M
    bool aliased = true;
1390
1.82M
    if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
1391
1.71M
      // Do the alias check.
1392
1.71M
      aliased = AA->alias(Loc1, Loc2);
1393
1.71M
    }
1394
1.82M
    // Store the result in the cache.
1395
1.82M
    result = aliased;
1396
1.82M
    return aliased;
1397
1.82M
  }
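  // Editor's note: an illustrative, standalone sketch of the memoization
  // pattern isAliased() uses above, with std::map / std::optional standing in
  // for DenseMap / llvm::Optional (hypothetical names):
  //
  //   #include <map>
  //   #include <optional>
  //   #include <utility>
  //
  //   using Key = std::pair<const void *, const void *>;
  //   std::map<Key, std::optional<bool>> Cache;
  //
  //   template <typename OracleT>
  //   bool mayAlias(const void *I1, const void *I2, OracleT &&Oracle) {
  //     std::optional<bool> &R = Cache[{I1, I2}]; // inserts an empty slot
  //     if (!R)
  //       R = Oracle(I1, I2); // expensive query runs once per (I1, I2) pair
  //     return *R;
  //   }
  //
  // Taking the slot by reference before the query is what lets a repeated
  // (I1, I2) lookup hit the cache instead of calling the oracle again.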
1398
1399
  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
1400
1401
  /// Cache for alias results.
1402
  /// TODO: consider moving this to the AliasAnalysis itself.
1403
  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
1404
1405
  /// Removes an instruction from its block and eventually deletes it.
1406
  /// It's like Instruction::eraseFromParent() except that the actual deletion
1407
  /// is delayed until BoUpSLP is destructed.
1408
  /// This is required to ensure that there are no incorrect collisions in the
1409
  /// AliasCache, which can happen if a new instruction is allocated at the
1410
  /// same address as a previously deleted instruction.
1411
228k
  void eraseInstruction(Instruction *I) {
1412
228k
    I->removeFromParent();
1413
228k
    I->dropAllReferences();
1414
228k
    DeletedInstructions.emplace_back(I);
1415
228k
  }
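  // Editor's note: a concrete (hypothetical) scenario for the address-reuse
  // hazard described above. Suppose instruction A at address 0x1000 is freed
  // and a later allocation places a brand-new instruction B at the same
  // address: the AliasCache key (0x1000, X) would then hand B the answer that
  // was computed for A. Keeping deleted instructions alive (unlinked, with
  // references dropped) until BoUpSLP is destroyed guarantees every cached
  // key still names the instruction it was computed for.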
1416
1417
  /// Temporary store for deleted instructions. Instructions will be deleted
1418
  /// eventually when the BoUpSLP is destructed.
1419
  SmallVector<unique_value, 8> DeletedInstructions;
1420
1421
  /// A list of values that need to be extracted out of the tree.
1422
  /// This list holds pairs of (Internal Scalar : External User). External User
1423
  /// can be nullptr, it means that this Internal Scalar will be used later,
1424
  /// after vectorization.
1425
  UserList ExternalUses;
1426
1427
  /// Values used only by @llvm.assume calls.
1428
  SmallPtrSet<const Value *, 32> EphValues;
1429
1430
  /// Holds all of the instructions that we gathered.
1431
  SetVector<Instruction *> GatherSeq;
1432
1433
  /// A list of blocks that we are going to CSE.
1434
  SetVector<BasicBlock *> CSEBlocks;
1435
1436
  /// Contains all scheduling relevant data for an instruction.
1437
  /// A ScheduleData either represents a single instruction or a member of an
1438
  /// instruction bundle (= a group of instructions which is combined into a
1439
  /// vector instruction).
1440
  struct ScheduleData {
1441
    // The initial value for the dependency counters. It means that the
1442
    // dependencies are not calculated yet.
1443
    enum { InvalidDeps = -1 };
1444
1445
3.04M
    ScheduleData() = default;
1446
1447
5.43M
    void init(int BlockSchedulingRegionID, Value *OpVal) {
1448
5.43M
      FirstInBundle = this;
1449
5.43M
      NextInBundle = nullptr;
1450
5.43M
      NextLoadStore = nullptr;
1451
5.43M
      IsScheduled = false;
1452
5.43M
      SchedulingRegionID = BlockSchedulingRegionID;
1453
5.43M
      UnscheduledDepsInBundle = UnscheduledDeps;
1454
5.43M
      clearDependencies();
1455
5.43M
      OpValue = OpVal;
1456
5.43M
    }
1457
1458
    /// Returns true if the dependency information has been calculated.
1459
10.8M
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
1460
1461
    /// Returns true for single instructions and for bundle representatives
1462
    /// (= the head of a bundle).
1463
7.74M
    bool isSchedulingEntity() const { return FirstInBundle == this; }
1464
1465
    /// Returns true if it represents an instruction bundle and not only a
1466
    /// single instruction.
1467
61.6k
    bool isPartOfBundle() const {
1468
61.6k
      return NextInBundle != nullptr || FirstInBundle != this;
1469
61.6k
    }
1470
1471
    /// Returns true if it is ready for scheduling, i.e. it has no more
1472
    /// unscheduled depending instructions/bundles.
1473
11.6M
    bool isReady() const {
1474
11.6M
      assert(isSchedulingEntity() &&
1475
11.6M
             "can't consider non-scheduling entity for ready list");
1476
11.6M
      return UnscheduledDepsInBundle == 0 && !IsScheduled;
1477
11.6M
    }
1478
1479
    /// Modifies the number of unscheduled dependencies, also updating it for
1480
    /// the whole bundle.
1481
25.3M
    int incrementUnscheduledDeps(int Incr) {
1482
25.3M
      UnscheduledDeps += Incr;
1483
25.3M
      return FirstInBundle->UnscheduledDepsInBundle += Incr;
1484
25.3M
    }
1485
1486
    /// Sets the number of unscheduled dependencies to the number of
1487
    /// dependencies.
1488
18.5M
    void resetUnscheduledDeps() {
1489
18.5M
      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
1490
18.5M
    }
1491
1492
    /// Clears all dependency information.
1493
9.30M
    void clearDependencies() {
1494
9.30M
      Dependencies = InvalidDeps;
1495
9.30M
      resetUnscheduledDeps();
1496
9.30M
      MemoryDependencies.clear();
1497
9.30M
    }
1498
1499
0
    void dump(raw_ostream &os) const {
1500
0
      if (!isSchedulingEntity()) {
1501
0
        os << "/ " << *Inst;
1502
0
      } else if (NextInBundle) {
1503
0
        os << '[' << *Inst;
1504
0
        ScheduleData *SD = NextInBundle;
1505
0
        while (SD) {
1506
0
          os << ';' << *SD->Inst;
1507
0
          SD = SD->NextInBundle;
1508
0
        }
1509
0
        os << ']';
1510
0
      } else {
1511
0
        os << *Inst;
1512
0
      }
1513
0
    }
1514
1515
    Instruction *Inst = nullptr;
1516
1517
    /// Points to the head in an instruction bundle (and always to this for
1518
    /// single instructions).
1519
    ScheduleData *FirstInBundle = nullptr;
1520
1521
    /// Singly linked list of all instructions in a bundle. Null if it is a
1522
    /// single instruction.
1523
    ScheduleData *NextInBundle = nullptr;
1524
1525
    /// Singly linked list of all memory instructions (e.g. load, store, call)
1526
    /// in the block - until the end of the scheduling region.
1527
    ScheduleData *NextLoadStore = nullptr;
1528
1529
    /// The dependent memory instructions.
1530
    /// This list is derived on demand in calculateDependencies().
1531
    SmallVector<ScheduleData *, 4> MemoryDependencies;
1532
1533
    /// This ScheduleData is in the current scheduling region if this matches
1534
    /// the current SchedulingRegionID of BlockScheduling.
1535
    int SchedulingRegionID = 0;
1536
1537
    /// Used for getting a "good" final ordering of instructions.
1538
    int SchedulingPriority = 0;
1539
1540
    /// The number of dependencies. Consists of the number of users of the
1541
    /// instruction plus the number of dependent memory instructions (if any).
1542
    /// This value is calculated on demand.
1543
    /// If InvalidDeps, the number of dependencies is not calculated yet.
1544
    int Dependencies = InvalidDeps;
1545
1546
    /// The number of dependencies minus the number of dependencies of scheduled
1547
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
1548
    /// for scheduling.
1549
    /// Note that this is negative as long as Dependencies is not calculated.
1550
    int UnscheduledDeps = InvalidDeps;
1551
1552
    /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
1553
    /// single instructions.
1554
    int UnscheduledDepsInBundle = InvalidDeps;
1555
1556
    /// True if this instruction is scheduled (or considered as scheduled in the
1557
    /// dry-run).
1558
    bool IsScheduled = false;
1559
1560
    /// Opcode of the current instruction in the schedule data.
1561
    Value *OpValue = nullptr;
1562
  };
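  // Editor's note: a minimal standalone model (illustrative names, not the
  // pass's code) of the counter protocol ScheduleData implements. A node
  // starts with Unscheduled == Dependencies; scheduling a node decrements the
  // counter of each dependent, and a dependent whose counter hits zero is
  // ready:
  //
  //   #include <vector>
  //
  //   struct Node {
  //     int Unscheduled = 0;          // dependencies not yet scheduled
  //     std::vector<Node *> Dependents;
  //   };
  //
  //   void scheduleNode(Node *N, std::vector<Node *> &ReadyList) {
  //     for (Node *Dep : N->Dependents)
  //       if (--Dep->Unscheduled == 0) // mirrors incrementUnscheduledDeps(-1)
  //         ReadyList.push_back(Dep);
  //   }
  //
  // InvalidDeps (-1) plays the role of "not computed yet": as long as a
  // node's Dependencies is -1, its counters stay negative and it can never
  // appear ready by accident.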
1563
1564
#ifndef NDEBUG
1565
  friend inline raw_ostream &operator<<(raw_ostream &os,
1566
                                        const BoUpSLP::ScheduleData &SD) {
1567
    SD.dump(os);
1568
    return os;
1569
  }
1570
#endif
1571
1572
  friend struct GraphTraits<BoUpSLP *>;
1573
  friend struct DOTGraphTraits<BoUpSLP *>;
1574
1575
  /// Contains all scheduling data for a basic block.
1576
  struct BlockScheduling {
1577
    BlockScheduling(BasicBlock *BB)
1578
230k
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
1579
1580
4.57M
    void clear() {
1581
4.57M
      ReadyInsts.clear();
1582
4.57M
      ScheduleStart = nullptr;
1583
4.57M
      ScheduleEnd = nullptr;
1584
4.57M
      FirstLoadStoreInRegion = nullptr;
1585
4.57M
      LastLoadStoreInRegion = nullptr;
1586
4.57M
1587
4.57M
      // Reduce the maximum schedule region size by the size of the
1588
4.57M
      // previous scheduling run.
1589
4.57M
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
1590
4.57M
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
1591
4.54k
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
1592
4.57M
      ScheduleRegionSize = 0;
1593
4.57M
1594
4.57M
      // Make a new scheduling region, i.e. all existing ScheduleData is not
1595
4.57M
      // in the new region yet.
1596
4.57M
      ++SchedulingRegionID;
1597
4.57M
    }
1598
1599
30.2M
    ScheduleData *getScheduleData(Value *V) {
1600
30.2M
      ScheduleData *SD = ScheduleDataMap[V];
1601
30.2M
      if (SD && SD->SchedulingRegionID == SchedulingRegionID)
1602
24.6M
        return SD;
1603
5.62M
      return nullptr;
1604
5.62M
    }
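    // Editor's note: the SchedulingRegionID test above is an epoch-based
    // invalidation trick: clear() bumps one integer and thereby invalidates
    // every cached ScheduleData in O(1), without walking ScheduleDataMap. A
    // generic standalone sketch (hypothetical names):
    //
    //   #include <unordered_map>
    //
    //   struct EpochMap {
    //     struct Entry { int Val; int Epoch = 0; };
    //     std::unordered_map<int, Entry> Map;
    //     int Epoch = 1; // entries default to epoch 0, i.e. "stale"
    //
    //     void clear() { ++Epoch; } // constant-time "remove everything"
    //     int *lookup(int Key) {
    //       auto It = Map.find(Key);
    //       return (It != Map.end() && It->second.Epoch == Epoch)
    //                  ? &It->second.Val : nullptr;
    //     }
    //     void insert(int Key, int Val) { Map[Key] = {Val, Epoch}; }
    //   };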
1605
1606
2.09M
    ScheduleData *getScheduleData(Value *V, Value *Key) {
1607
2.09M
      if (V == Key)
1608
2.09M
        return getScheduleData(V);
1609
0
      auto I = ExtraScheduleDataMap.find(V);
1610
0
      if (I != ExtraScheduleDataMap.end()) {
1611
0
        ScheduleData *SD = I->second[Key];
1612
0
        if (SD && SD->SchedulingRegionID == SchedulingRegionID)
1613
0
          return SD;
1614
0
      }
1615
0
      return nullptr;
1616
0
    }
1617
1618
2.44M
    bool isInSchedulingRegion(ScheduleData *SD) {
1619
2.44M
      return SD->SchedulingRegionID == SchedulingRegionID;
1620
2.44M
    }
1621
1622
    /// Marks an instruction as scheduled and puts all dependent ready
1623
    /// instructions into the ready-list.
1624
    template <typename ReadyListType>
1625
1.71M
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
1626
1.71M
      SD->IsScheduled = true;
1627
1.71M
      LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
1628
1.71M
1629
1.71M
      ScheduleData *BundleMember = SD;
1630
4.04M
      while (BundleMember) {
1631
2.32M
        if (BundleMember->Inst != BundleMember->OpValue) {
1632
0
          BundleMember = BundleMember->NextInBundle;
1633
0
          continue;
1634
0
        }
1635
2.32M
        // Handle the def-use chain dependencies.
1636
4.65M
        
for (Use &U : BundleMember->Inst->operands())2.32M
{
1637
4.65M
          auto *I = dyn_cast<Instruction>(U.get());
1638
4.65M
          if (!I)
1639
942k
            continue;
1640
3.71M
          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
1641
3.25M
            if (OpDef && OpDef->hasValidDependencies() &&
1642
3.25M
                
                OpDef->incrementUnscheduledDeps(-1) == 0) {
1643
1.42M
              // There are no more unscheduled dependencies after
1644
1.42M
              // decrementing, so we can put the dependent instruction
1645
1.42M
              // into the ready list.
1646
1.42M
              ScheduleData *DepBundle = OpDef->FirstInBundle;
1647
1.42M
              assert(!DepBundle->IsScheduled &&
1648
1.42M
                     "already scheduled bundle gets ready");
1649
1.42M
              ReadyList.insert(DepBundle);
1650
1.42M
              LLVM_DEBUG(dbgs()
1651
1.42M
                         << "SLP:    gets ready (def): " << *DepBundle << "\n");
1652
1.42M
            }
1653
3.25M
          });
[Per-instantiation coverage expansion elided: the report lists the lambda at
lines 1640-1653 twice more, once for the BlockScheduling::ReadyList
instantiation of schedule() (3.03M executions) and once for the
std::set<ScheduleData *, ScheduleDataCompare> instantiation (221k executions);
the code is identical to the listing above.]
1654
3.71M
        }
1655
2.32M
        // Handle the memory dependencies.
1656
2.32M
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
1657
811k
          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
1658
50.1k
            // There are no more unscheduled dependencies after decrementing,
1659
50.1k
            // so we can put the dependent instruction into the ready list.
1660
50.1k
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
1661
50.1k
            assert(!DepBundle->IsScheduled &&
1662
50.1k
                   "already scheduled bundle gets ready");
1663
50.1k
            ReadyList.insert(DepBundle);
1664
50.1k
            LLVM_DEBUG(dbgs()
1665
50.1k
                       << "SLP:    gets ready (mem): " << *DepBundle << "\n");
1666
50.1k
          }
1667
811k
        }
1668
2.32M
        BundleMember = BundleMember->NextInBundle;
1669
2.32M
      }
1670
1.71M
    }
[Per-instantiation coverage expansion elided: the report repeats the full body
of schedule() (source lines 1625-1670) for each template instantiation:
schedule<BlockScheduling::ReadyList> is entered 1.46M times and
schedule<std::set<ScheduleData *, ScheduleDataCompare>> 245k times; both
bodies are identical to the listing above.]
1671
1672
    void doForAllOpcodes(Value *V,
1673
19.7M
                         function_ref<void(ScheduleData *SD)> Action) {
1674
19.7M
      if (ScheduleData *SD = getScheduleData(V))
1675
19.2M
        Action(SD);
1676
19.7M
      auto I = ExtraScheduleDataMap.find(V);
1677
19.7M
      if (I != ExtraScheduleDataMap.end())
1678
0
        for (auto &P : I->second)
1679
0
          if (P.second->SchedulingRegionID == SchedulingRegionID)
1680
0
            Action(P.second);
1681
19.7M
    }
1682
1683
    /// Put all instructions into the ReadyList which are ready for scheduling.
1684
    template <typename ReadyListType>
1685
653k
    void initialFillReadyList(ReadyListType &ReadyList) {
1686
6.51M
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
1687
5.86M
        doForAllOpcodes(I, [&](ScheduleData *SD) {
1688
5.86M
          if (SD->isSchedulingEntity() && SD->isReady()) {
1689
155k
            ReadyList.insert(SD);
1690
155k
            LLVM_DEBUG(dbgs()
1691
155k
                       << "SLP:    initially in ready list: " << *I << "\n");
1692
155k
          }
1693
5.86M
        });
[Per-instantiation coverage expansion elided: the report lists the lambda at
lines 1687-1693 twice more, for the BlockScheduling::ReadyList instantiation
of initialFillReadyList() (5.45M executions) and for the
std::set<ScheduleData *, ScheduleDataCompare> instantiation (406k executions);
the code is identical to the listing above.]
1694
5.86M
      }
1695
653k
    }
[Per-instantiation coverage expansion elided: the report repeats the full body
of initialFillReadyList() (source lines 1685-1695) for each template
instantiation: initialFillReadyList<BlockScheduling::ReadyList> is entered
609k times and initialFillReadyList<std::set<ScheduleData *,
ScheduleDataCompare>> 44.0k times; both bodies are identical to the listing
above.]
1696
1697
    /// Checks if a bundle of instructions can be scheduled, i.e. has no
1698
    /// cyclic dependencies. This is only a dry-run, no instructions are
1699
    /// actually moved at this stage.
1700
    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
1701
                           const InstructionsState &S);
1702
1703
    /// Un-bundles a group of instructions.
1704
    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
1705
1706
    /// Allocates schedule data chunk.
1707
    ScheduleData *allocateScheduleDataChunks();
1708
1709
    /// Extends the scheduling region so that V is inside the region.
1710
    /// \returns true if the region size is within the limit.
1711
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);
1712
1713
    /// Initialize the ScheduleData structures for new instructions in the
1714
    /// scheduling region.
1715
    void initScheduleData(Instruction *FromI, Instruction *ToI,
1716
                          ScheduleData *PrevLoadStore,
1717
                          ScheduleData *NextLoadStore);
1718
1719
    /// Updates the dependency information of a bundle and of all instructions/
1720
    /// bundles which depend on the original bundle.
1721
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
1722
                               BoUpSLP *SLP);
1723
1724
    /// Sets all instructions in the scheduling region to un-scheduled.
1725
    void resetSchedule();
1726
1727
    BasicBlock *BB;
1728
1729
    /// Simple memory allocation for ScheduleData.
1730
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
1731
1732
    /// The size of a ScheduleData array in ScheduleDataChunks.
1733
    int ChunkSize;
1734
1735
    /// The allocator position in the current chunk, which is the last entry
1736
    /// of ScheduleDataChunks.
1737
    int ChunkPos;
1738
1739
    /// Attaches ScheduleData to Instruction.
1740
    /// Note that the mapping survives during all vectorization iterations, i.e.
1741
    /// ScheduleData structures are recycled.
1742
    DenseMap<Value *, ScheduleData *> ScheduleDataMap;
1743
1744
    /// Attaches ScheduleData to Instruction with the leading key.
1745
    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
1746
        ExtraScheduleDataMap;
1747
1748
    struct ReadyList : SmallVector<ScheduleData *, 8> {
1749
1.89M
      void insert(ScheduleData *SD) { push_back(SD); }
1750
    };
1751
1752
    /// The ready-list for scheduling (only used for the dry-run).
1753
    ReadyList ReadyInsts;
1754
1755
    /// The first instruction of the scheduling region.
1756
    Instruction *ScheduleStart = nullptr;
1757
1758
    /// The first instruction _after_ the scheduling region.
1759
    Instruction *ScheduleEnd = nullptr;
1760
1761
    /// The first memory accessing instruction in the scheduling region
1762
    /// (can be null).
1763
    ScheduleData *FirstLoadStoreInRegion = nullptr;
1764
1765
    /// The last memory accessing instruction in the scheduling region
1766
    /// (can be null).
1767
    ScheduleData *LastLoadStoreInRegion = nullptr;
1768
1769
    /// The current size of the scheduling region.
1770
    int ScheduleRegionSize = 0;
1771
1772
    /// The maximum size allowed for the scheduling region.
1773
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
1774
1775
    /// The ID of the scheduling region. For a new vectorization iteration this
1776
    /// is incremented which "removes" all ScheduleData from the region.
1777
    // Make sure that the initial SchedulingRegionID is greater than the
1778
    // initial SchedulingRegionID in ScheduleData (which is 0).
1779
    int SchedulingRegionID = 1;
1780
  };
1781
1782
  /// Attaches the BlockScheduling structures to basic blocks.
1783
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
1784
1785
  /// Performs the "real" scheduling. Done before vectorization is actually
1786
  /// performed in a basic block.
1787
  void scheduleBlock(BlockScheduling *BS);
1788
1789
  /// List of users to ignore during scheduling and that don't need extracting.
1790
  ArrayRef<Value *> UserIgnoreList;
1791
1792
  using OrdersType = SmallVector<unsigned, 4>;
1793
  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
1794
  /// sorted SmallVectors of unsigned.
1795
  struct OrdersTypeDenseMapInfo {
1796
143k
    static OrdersType getEmptyKey() {
1797
143k
      OrdersType V;
1798
143k
      V.push_back(~1U);
1799
143k
      return V;
1800
143k
    }
1801
1802
111k
    static OrdersType getTombstoneKey() {
1803
111k
      OrdersType V;
1804
111k
      V.push_back(~2U);
1805
111k
      return V;
1806
111k
    }
1807
1808
31.8k
    static unsigned getHashValue(const OrdersType &V) {
1809
31.8k
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
1810
31.8k
    }
1811
1812
3.70M
    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
1813
3.70M
      return LHS == RHS;
1814
3.70M
    }
1815
  };
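  // Editor's note: DenseMap requires an "empty" and a "tombstone" key that
  // can never equal a real key. Real OrdersType values are permutations of
  // lane indices, all strictly smaller than the bundle width, so the
  // one-element vectors {~1U} and {~2U} above can never collide with real
  // data. Typical use of the resulting map, mirroring the try_emplace
  // pattern later in this file (Order here is a hypothetical value):
  //
  //   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> Counts;
  //   OrdersType Order = {1, 0, 3, 2};
  //   ++Counts.try_emplace(Order).first->getSecond(); // count one more bundle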
1816
1817
  /// Contains orders of operations along with the number of bundles that have
1818
  /// operations in this order. It stores only those orders that require
1819
  /// reordering; if reordering is not required, it is counted using \a
1820
  /// NumOpsWantToKeepOriginalOrder.
1821
  DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
1822
  /// Number of bundles that do not require reordering.
1823
  unsigned NumOpsWantToKeepOriginalOrder = 0;
1824
1825
  // Analysis and block reference.
1826
  Function *F;
1827
  ScalarEvolution *SE;
1828
  TargetTransformInfo *TTI;
1829
  TargetLibraryInfo *TLI;
1830
  AliasAnalysis *AA;
1831
  LoopInfo *LI;
1832
  DominatorTree *DT;
1833
  AssumptionCache *AC;
1834
  DemandedBits *DB;
1835
  const DataLayout *DL;
1836
  OptimizationRemarkEmitter *ORE;
1837
1838
  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
1839
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
1840
1841
  /// Instruction builder to construct the vectorized tree.
1842
  IRBuilder<> Builder;
1843
1844
  /// A map of scalar integer values to the smallest bit width with which they
1845
  /// can legally be represented. The values map to (width, signed) pairs,
1846
  /// where "width" indicates the minimum bit width and "signed" is True if the
1847
  /// value must be signed-extended, rather than zero-extended, back to its
1848
  /// original width.
1849
  MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
1850
};
1851
1852
} // end namespace slpvectorizer
1853
1854
template <> struct GraphTraits<BoUpSLP *> {
1855
  using TreeEntry = BoUpSLP::TreeEntry;
1856
1857
  /// NodeRef has to be a pointer per the GraphWriter.
1858
  using NodeRef = TreeEntry *;
1859
1860
  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
1861
1862
  /// Add the VectorizableTree to the index iterator to be able to return
1863
  /// TreeEntry pointers.
1864
  struct ChildIteratorType
1865
      : public iterator_adaptor_base<
1866
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
1867
    ContainerTy &VectorizableTree;
1868
1869
    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
1870
                      ContainerTy &VT)
1871
0
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
1872
1873
0
    NodeRef operator*() { return I->UserTE; }
1874
  };
1875
1876
0
  static NodeRef getEntryNode(BoUpSLP &R) {
1877
0
    return R.VectorizableTree[0].get();
1878
0
  }
1879
1880
0
  static ChildIteratorType child_begin(NodeRef N) {
1881
0
    return {N->UserTreeIndices.begin(), N->Container};
1882
0
  }
1883
1884
0
  static ChildIteratorType child_end(NodeRef N) {
1885
0
    return {N->UserTreeIndices.end(), N->Container};
1886
0
  }
1887
1888
  /// For the node iterator we just need to turn the TreeEntry iterator into a
1889
  /// TreeEntry* iterator so that it dereferences to NodeRef.
1890
  class nodes_iterator {
1891
    using ItTy = ContainerTy::iterator;
1892
    ItTy It;
1893
1894
  public:
1895
0
    nodes_iterator(const ItTy &It2) : It(It2) {}
1896
0
    NodeRef operator*() { return It->get(); }
1897
0
    nodes_iterator operator++() {
1898
0
      ++It;
1899
0
      return *this;
1900
0
    }
1901
0
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
1902
  };
1903
1904
0
  static nodes_iterator nodes_begin(BoUpSLP *R) {
1905
0
    return nodes_iterator(R->VectorizableTree.begin());
1906
0
  }
1907
1908
0
  static nodes_iterator nodes_end(BoUpSLP *R) {
1909
0
    return nodes_iterator(R->VectorizableTree.end());
1910
0
  }
1911
1912
0
  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
1913
};
1914
1915
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
1916
  using TreeEntry = BoUpSLP::TreeEntry;
1917
1918
0
  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
1919
1920
0
  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
1921
0
    std::string Str;
1922
0
    raw_string_ostream OS(Str);
1923
0
    if (isSplat(Entry->Scalars)) {
1924
0
      OS << "<splat> " << *Entry->Scalars[0];
1925
0
      return Str;
1926
0
    }
1927
0
    for (auto V : Entry->Scalars) {
1928
0
      OS << *V;
1929
0
      if (std::any_of(
1930
0
              R->ExternalUses.begin(), R->ExternalUses.end(),
1931
0
              [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
1932
0
        OS << " <extract>";
1933
0
      OS << "\n";
1934
0
    }
1935
0
    return Str;
1936
0
  }
1937
1938
  static std::string getNodeAttributes(const TreeEntry *Entry,
1939
0
                                       const BoUpSLP *) {
1940
0
    if (Entry->NeedToGather)
1941
0
      return "color=red";
1942
0
    return "";
1943
0
  }
1944
};
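// Editor's note: with the two specializations above, LLVM's generic
// GraphWriter machinery can render the vectorizable tree; gather entries are
// colored red by getNodeAttributes(). A hedged sketch, assuming a live
// BoUpSLP instance named R:
//
//   #include "llvm/Support/GraphWriter.h"
//   ViewGraph(&R, "slp-tree"); // writes a DOT file and opens the viewer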
1945
1946
} // end namespace llvm
1947
1948
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
1949
620k
                        ArrayRef<Value *> UserIgnoreLst) {
1950
620k
  ExtraValueToDebugLocsMap ExternallyUsedValues;
1951
620k
  buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
1952
620k
}
1953
1954
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
1955
                        ExtraValueToDebugLocsMap &ExternallyUsedValues,
1956
629k
                        ArrayRef<Value *> UserIgnoreLst) {
1957
629k
  deleteTree();
1958
629k
  UserIgnoreList = UserIgnoreLst;
1959
629k
  if (!allSameType(Roots))
1960
0
    return;
1961
629k
  buildTree_rec(Roots, 0, EdgeInfo());
1962
629k
1963
629k
  // Collect the values that we need to extract from the tree.
1964
1.78M
  for (auto &TEPtr : VectorizableTree) {
1965
1.78M
    TreeEntry *Entry = TEPtr.get();
1966
1.78M
1967
1.78M
    // No need to handle users of gathered values.
1968
1.78M
    if (Entry->NeedToGather)
1969
992k
      continue;
1970
796k
1971
796k
    // For each lane:
1972
2.73M
    
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
1973
1.94M
      Value *Scalar = Entry->Scalars[Lane];
1974
1.94M
      int FoundLane = Lane;
1975
1.94M
      if (!Entry->ReuseShuffleIndices.empty()) {
1976
2.63k
        FoundLane =
1977
2.63k
            std::distance(Entry->ReuseShuffleIndices.begin(),
1978
2.63k
                          llvm::find(Entry->ReuseShuffleIndices, FoundLane));
1979
2.63k
      }
1980
1.94M
1981
1.94M
      // Check if the scalar is externally used as an extra arg.
1982
1.94M
      auto ExtI = ExternallyUsedValues.find(Scalar);
1983
1.94M
      if (ExtI != ExternallyUsedValues.end()) {
1984
10
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
1985
10
                          << Lane << " from " << *Scalar << ".\n");
1986
10
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
1987
10
      }
1988
3.14M
      for (User *U : Scalar->users()) {
1989
3.14M
        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
1990
3.14M
1991
3.14M
        Instruction *UserInst = dyn_cast<Instruction>(U);
1992
3.14M
        if (!UserInst)
1993
0
          continue;
1994
3.14M
1995
3.14M
        // Skip in-tree scalars that become vectors
1996
3.14M
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
1997
1.46M
          Value *UseScalar = UseEntry->Scalars[0];
1998
1.46M
          // Some in-tree scalars will remain as scalar in vectorized
1999
1.46M
          // instructions. If that is the case, the one in Lane 0 will
2000
1.46M
          // be used.
2001
1.46M
          if (UseScalar != U ||
2002
1.46M
              
              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
2003
1.46M
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
2004
1.46M
                              << ".\n");
2005
1.46M
            assert(!UseEntry->NeedToGather && "Bad state");
2006
1.46M
            continue;
2007
1.46M
          }
2008
1.68M
        }
2009
1.68M
2010
1.68M
        // Ignore users in the user ignore list.
2011
1.68M
        if (is_contained(UserIgnoreList, UserInst))
2012
39.7k
          continue;
2013
1.64M
2014
1.64M
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
2015
1.64M
                          << Lane << " from " << *Scalar << ".\n");
2016
1.64M
        ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
2017
1.64M
      }
2018
1.94M
    }
2019
796k
  }
2020
629k
}
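// Editor's note: a standalone illustration (hypothetical names) of the
// FoundLane remapping in the loop above. When an entry was built with reuse
// indices, the scalar that logically lives in lane L of Entry->Scalars is
// extracted from the first position of the reuse mask that refers to L:
//
//   #include <algorithm>
//   #include <iterator>
//   #include <vector>
//
//   int remapLane(int Lane, const std::vector<unsigned> &ReuseMask) {
//     if (ReuseMask.empty())
//       return Lane; // no shuffling, lanes are used as-is
//     return (int)std::distance(
//         ReuseMask.begin(),
//         std::find(ReuseMask.begin(), ReuseMask.end(), (unsigned)Lane));
//   }
//
// With ReuseMask {0, 0, 1, 1}, scalar lane 1 is first referenced at mask
// position 2, so the external user extracts vector lane 2.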
2021
2022
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
2023
1.94M
                            const EdgeInfo &UserTreeIdx) {
2024
1.94M
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
2025
1.94M
2026
1.94M
  InstructionsState S = getSameOpcode(VL);
2027
1.94M
  if (Depth == RecursionMaxDepth) {
2028
6.59k
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
2029
6.59k
    newTreeEntry(VL, false, UserTreeIdx);
2030
6.59k
    return;
2031
6.59k
  }
2032
1.93M
2033
1.93M
  // Don't handle vectors.
2034
1.93M
  if (S.OpValue->getType()->isVectorTy()) {
2035
0
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
2036
0
    newTreeEntry(VL, false, UserTreeIdx);
2037
0
    return;
2038
0
  }
2039
1.93M
2040
1.93M
  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
2041
128k
    if (SI->getValueOperand()->getType()->isVectorTy()) {
2042
0
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
2043
0
      newTreeEntry(VL, false, UserTreeIdx);
2044
0
      return;
2045
0
    }
2046
1.93M
2047
1.93M
  // If all of the operands are identical or constant we have a simple solution.
2048
1.93M
  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
2049
683k
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
2050
683k
    newTreeEntry(VL, false, UserTreeIdx);
2051
683k
    return;
2052
683k
  }
2053
1.25M
2054
1.25M
  // We now know that this is a vector of instructions of the same type from
2055
1.25M
  // the same block.
2056
1.25M
2057
1.25M
  // Don't vectorize ephemeral values.
2058
4.26M
  
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
2059
3.01M
    if (EphValues.count(VL[i])) {
2060
24
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
2061
24
                        << ") is ephemeral.\n");
2062
24
      newTreeEntry(VL, false, UserTreeIdx);
2063
24
      return;
2064
24
    }
2065
3.01M
  }
2066
1.25M
2067
1.25M
  // Check if this is a duplicate of another entry.
2068
1.25M
  
  if (TreeEntry *E = getTreeEntry(S.OpValue)) {
2069
162k
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
2070
162k
    if (!E->isSame(VL)) {
2071
9.76k
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
2072
9.76k
      newTreeEntry(VL, false, UserTreeIdx);
2073
9.76k
      return;
2074
9.76k
    }
2075
152k
    // Record the reuse of the tree node.  FIXME, currently this is only used to
2076
152k
    // properly draw the graph rather than for the actual vectorization.
2077
152k
    E->UserTreeIndices.push_back(UserTreeIdx);
2078
152k
    LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
2079
152k
                      << ".\n");
2080
152k
    E->trySetUserTEOperand(UserTreeIdx, VL, None);
2081
152k
    return;
2082
152k
  }
2083
1.08M
2084
1.08M
  // Check that none of the instructions in the bundle are already in the tree.
2085
3.67M
  
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
2086
2.59M
    auto *I = dyn_cast<Instruction>(VL[i]);
2087
2.59M
    if (!I)
2088
0
      continue;
2089
2.59M
    if (getTreeEntry(I)) {
2090
5.97k
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
2091
5.97k
                        << ") is already in tree.\n");
2092
5.97k
      newTreeEntry(VL, false, UserTreeIdx);
2093
5.97k
      return;
2094
5.97k
    }
2095
2.59M
  }
2096
1.08M
2097
1.08M
  // If any of the scalars is marked as a value that needs to stay scalar, then
2098
1.08M
  // we need to gather the scalars.
2099
1.08M
  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
2100
3.62M
  
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
2101
2.55M
    if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) {
2102
18.4k
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
2103
18.4k
      newTreeEntry(VL, false, UserTreeIdx);
2104
18.4k
      return;
2105
18.4k
    }
2106
2.55M
  }
2107
1.08M
2108
1.08M
  // Check that all of the users of the scalars that we want to vectorize are
2109
1.08M
  // schedulable.
2110
1.08M
  auto *VL0 = cast<Instruction>(S.OpValue);
2111
1.06M
  BasicBlock *BB = VL0->getParent();
2112
1.06M
2113
1.06M
  if (!DT->isReachableFromEntry(BB)) {
2114
3
    // Don't go into unreachable blocks. They may contain instructions with
2115
3
    // dependency cycles which confuse the final scheduling.
2116
3
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
2117
3
    newTreeEntry(VL, false, UserTreeIdx);
2118
3
    return;
2119
3
  }
2120
1.06M
2121
1.06M
  // Check that every instruction appears once in this bundle.
2122
1.06M
  SmallVector<unsigned, 4> ReuseShuffleIndicies;
2123
1.06M
  SmallVector<Value *, 4> UniqueValues;
2124
1.06M
  DenseMap<Value *, unsigned> UniquePositions;
2125
2.53M
  for (Value *V : VL) {
2126
2.53M
    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
2127
2.53M
    ReuseShuffleIndicies.emplace_back(Res.first->second);
2128
2.53M
    if (Res.second)
2129
2.53M
      UniqueValues.emplace_back(V);
2130
2.53M
  }
2131
1.06M
  if (UniqueValues.size() == VL.size()) {
2132
1.06M
    ReuseShuffleIndicies.clear();
2133
1.06M
  } else {
2134
2.21k
    LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
2135
2.21k
    if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) {
2136
647
      LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
2137
647
      newTreeEntry(VL, false, UserTreeIdx);
2138
647
      return;
2139
647
    }
2140
1.56k
    VL = UniqueValues;
2141
1.56k
  }
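// Editor's note: a standalone sketch of the de-duplication just performed
// (illustrative STL version; the pass uses DenseMap and SmallVector). Every
// value keeps its first position, and the reuse mask records, per original
// lane, which unique lane it maps to:
//
//   #include <unordered_map>
//   #include <vector>
//
//   std::vector<int> uniquify(const std::vector<int> &VL,
//                             std::vector<unsigned> &ReuseMask) {
//     std::unordered_map<int, unsigned> FirstPos;
//     std::vector<int> Unique;
//     for (int V : VL) {
//       auto Res = FirstPos.try_emplace(V, Unique.size());
//       ReuseMask.push_back(Res.first->second);
//       if (Res.second)
//         Unique.push_back(V);
//     }
//     return Unique; // {a, b, a, b} -> {a, b}, mask {0, 1, 0, 1}
//   }
//
// The bundle is then only kept if Unique has a power-of-two size greater
// than one, since only such shuffled bundles map onto a legal vector.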
2142
1.06M
2143
1.06M
  auto &BSRef = BlocksSchedules[BB];
2144
1.06M
  if (!BSRef)
2145
230k
    BSRef = llvm::make_unique<BlockScheduling>(BB);
2146
1.06M
2147
1.06M
  BlockScheduling &BS = *BSRef.get();
2148
1.06M
2149
1.06M
  if (!BS.tryScheduleBundle(VL, this, S)) {
2150
54.5k
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
2151
54.5k
    assert((!BS.getScheduleData(VL0) ||
2152
54.5k
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
2153
54.5k
           "tryScheduleBundle should cancelScheduling on failure");
2154
54.5k
    newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
2155
54.5k
    return;
2156
54.5k
  }
2157
1.00M
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
2158
1.00M
2159
1.00M
  unsigned ShuffleOrOp = S.isAltShuffle() ?
2160
894k
                
                (unsigned) Instruction::ShuffleVector : S.getOpcode();
2161
1.00M
  switch (ShuffleOrOp) {
2162
1.00M
    case Instruction::PHI: {
2163
175k
      PHINode *PH = dyn_cast<PHINode>(VL0);
2164
175k
2165
175k
      // Check for terminator values (e.g. invoke).
2166
611k
      for (unsigned j = 0; j < VL.size(); ++j)
2167
1.64M
        
        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2168
1.21M
          Instruction *Term = dyn_cast<Instruction>(
2169
1.21M
              cast<PHINode>(VL[j])->getIncomingValueForBlock(
2170
1.21M
                  PH->getIncomingBlock(i)));
2171
1.21M
          if (Term && Term->isTerminator()) {
2172
475
            LLVM_DEBUG(dbgs()
2173
475
                       << "SLP: Need to swizzle PHINodes (terminator use).\n");
2174
475
            BS.cancelScheduling(VL, VL0);
2175
475
            newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
2176
475
            return;
2177
475
          }
2178
1.21M
        }
2179
175k
2180
175k
      auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
2181
175k
      LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
2182
175k
2183
644k
      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2184
469k
        ValueList Operands;
2185
469k
        // Prepare the operand vector.
2186
469k
        for (Value *j : VL)
2187
1.20M
          Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
2188
1.20M
              PH->getIncomingBlock(i)));
2189
469k
2190
469k
        buildTree_rec(Operands, Depth + 1, {TE, i});
2191
469k
      }
2192
175k
      return;
2193
175k
    }
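    // Editor's note: the PHI case regroups operands by predecessor rather
    // than by instruction; an illustrative transposition for two PHIs with
    // two incoming blocks (hypothetical values):
    //
    //   VL = { phi0 = [a0, BB0], [b0, BB1],
    //          phi1 = [a1, BB0], [b1, BB1] }
    //
    //   bundle recursed on for BB0: {a0, a1}
    //   bundle recursed on for BB1: {b0, b1}
    //
    // Each recursive bundle therefore contains values that all flow in along
    // the same CFG edge, which is what makes them candidates for one vector
    // PHI operand.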
2194
175k
    case Instruction::ExtractValue:
2195
8.86k
    case Instruction::ExtractElement: {
2196
8.86k
      OrdersType CurrentOrder;
2197
8.86k
      bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
2198
8.86k
      if (Reuse) {
2199
2.49k
        LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
2200
2.49k
        ++NumOpsWantToKeepOriginalOrder;
2201
2.49k
        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
2202
2.49k
                     ReuseShuffleIndicies);
2203
2.49k
        // This is a special case, as it does not gather, but at the same time
2204
2.49k
        // we are not extending buildTree_rec() towards the operands.
2205
2.49k
        ValueList Op0;
2206
2.49k
        Op0.assign(VL.size(), VL0->getOperand(0));
2207
2.49k
        VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies);
2208
2.49k
        return;
2209
2.49k
      }
2210
6.36k
      if (!CurrentOrder.empty()) {
2211
83
        LLVM_DEBUG({
2212
83
          dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
2213
83
                    "with order";
2214
83
          for (unsigned Idx : CurrentOrder)
2215
83
            dbgs() << " " << Idx;
2216
83
          dbgs() << "\n";
2217
83
        });
2218
83
        // Insert new order with initial value 0, if it does not exist,
2219
83
        // otherwise return the iterator to the existing one.
2220
83
        auto StoredCurrentOrderAndNum =
2221
83
            NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
2222
83
        ++StoredCurrentOrderAndNum->getSecond();
2223
83
        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies,
2224
83
                     StoredCurrentOrderAndNum->getFirst());
2225
83
        // This is a special case, as it does not gather, but at the same time
2226
83
        // we are not extending buildTree_rec() towards the operands.
2227
83
        ValueList Op0;
2228
83
        Op0.assign(VL.size(), VL0->getOperand(0));
2229
83
        VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies);
2230
83
        return;
2231
83
      }
2232
6.28k
      LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
2233
6.28k
      newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies);
2234
6.28k
      BS.cancelScheduling(VL, VL0);
2235
6.28k
      return;
2236
6.28k
    }
    case Instruction::Load: {
      // Check that a vectorized load would load the same memory as a scalar
      // load. For example, we don't want to vectorize loads that are smaller
      // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>},
      // LLVM treats loading/storing it as an i8 struct. If we vectorize
      // loads/stores from such a struct, we read/write packed bits disagreeing
      // with the unvectorized version.
      Type *ScalarTy = VL0->getType();

      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy)) {
        BS.cancelScheduling(VL, VL0);
        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
        return;
      }

      // Make sure all loads in the bundle are simple - we can't vectorize
      // atomic or volatile loads.
      SmallVector<Value *, 4> PointerOps(VL.size());
      auto POIter = PointerOps.begin();
      for (Value *V : VL) {
        auto *L = cast<LoadInst>(V);
        if (!L->isSimple()) {
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
          return;
        }
        *POIter = L->getPointerOperand();
        ++POIter;
      }

      OrdersType CurrentOrder;
      // Check the order of pointer operands.
      if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
        Value *Ptr0;
        Value *PtrN;
        if (CurrentOrder.empty()) {
          Ptr0 = PointerOps.front();
          PtrN = PointerOps.back();
        } else {
          Ptr0 = PointerOps[CurrentOrder.front()];
          PtrN = PointerOps[CurrentOrder.back()];
        }
        const SCEV *Scev0 = SE->getSCEV(Ptr0);
        const SCEV *ScevN = SE->getSCEV(PtrN);
        const auto *Diff =
            dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
        uint64_t Size = DL->getTypeAllocSize(ScalarTy);
        // Check that the sorted loads are consecutive.
        if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) {
          if (CurrentOrder.empty()) {
            // Original loads are consecutive and do not require reordering.
            ++NumOpsWantToKeepOriginalOrder;
            newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
                         ReuseShuffleIndicies);
            LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
          } else {
            // Need to reorder.
            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
            ++I->getSecond();
            newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
                         ReuseShuffleIndicies, I->getFirst());
            LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
          }
          return;
        }
      }

      LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
      BS.cancelScheduling(VL, VL0);
      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
      return;
    }
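
    // Example (hypothetical IR): for a bundle of four i32 loads from %p,
    // %p+1, %p+2, %p+3, SCEV gives PtrN - Ptr0 = 12 bytes, which matches
    // (VL.size() - 1) * DL->getTypeAllocSize(i32) = 3 * 4, so the bundle is
    // accepted above as one consecutive <4 x i32> load.
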
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      Type *SrcTy = VL0->getOperand(0)->getType();
      for (unsigned i = 0; i < VL.size(); ++i) {
        Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
        if (Ty != SrcTy || !isValidElementType(Ty)) {
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs()
                     << "SLP: Gathering casts with different src types.\n");
          return;
        }
      }
      auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth + 1, {TE, i});
      }
      return;
    }
    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Check that all of the compares have the same predicate.
      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
      Type *ComparedTy = VL0->getOperand(0)->getType();
      for (unsigned i = 1, e = VL.size(); i < e; ++i) {
        CmpInst *Cmp = cast<CmpInst>(VL[i]);
        if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
            Cmp->getOperand(0)->getType() != ComparedTy) {
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs()
                     << "SLP: Gathering cmp with different predicate.\n");
          return;
        }
      }

      auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");

      ValueList Left, Right;
      if (cast<CmpInst>(VL0)->isCommutative()) {
        // Commutative predicate - collect + sort operands of the instructions
        // so that each side is more likely to have the same opcode.
        assert(P0 == SwapP0 && "Commutative Predicate mismatch");
        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
      } else {
        // Collect operands - commute if it uses the swapped predicate.
        for (Value *V : VL) {
          auto *Cmp = cast<CmpInst>(V);
          Value *LHS = Cmp->getOperand(0);
          Value *RHS = Cmp->getOperand(1);
          if (Cmp->getPredicate() != P0)
            std::swap(LHS, RHS);
          Left.push_back(LHS);
          Right.push_back(RHS);
        }
      }

      buildTree_rec(Left, Depth + 1, {TE, 0});
      buildTree_rec(Right, Depth + 1, {TE, 1});
      return;
    }
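
    // Example (hypothetical IR): with P0 = icmp sgt, the compare
    //   %c = icmp slt i32 %b, %a
    // matches via SwapP0, so the non-commutative path above swaps its
    // operands, pushing %a onto Left and %b onto Right to line up with the
    // sgt compares in the bundle.
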
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");

      // Sort operands of the instructions so that each side is more likely to
      // have the same opcode.
      if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
        ValueList Left, Right;
        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
        buildTree_rec(Left, Depth + 1, {TE, 0});
        buildTree_rec(Right, Depth + 1, {TE, 1});
        return;
      }

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth + 1, {TE, i});
      }
      return;
    }
    case Instruction::GetElementPtr: {
      // We don't combine GEPs with complicated (nested) indexing.
      for (unsigned j = 0; j < VL.size(); ++j) {
        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
          LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          return;
        }
      }

      // We can't combine several GEPs into one vector if they operate on
      // different types.
      Type *Ty0 = VL0->getOperand(0)->getType();
      for (unsigned j = 0; j < VL.size(); ++j) {
        Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
        if (Ty0 != CurTy) {
          LLVM_DEBUG(dbgs()
                     << "SLP: not-vectorizable GEP (different types).\n");
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          return;
        }
      }

      // We don't combine GEPs with non-constant indexes.
      for (unsigned j = 0; j < VL.size(); ++j) {
        auto Op = cast<Instruction>(VL[j])->getOperand(1);
        if (!isa<ConstantInt>(Op)) {
          LLVM_DEBUG(dbgs()
                     << "SLP: not-vectorizable GEP (non-constant indexes).\n");
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          return;
        }
      }

      auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
      for (unsigned i = 0, e = 2; i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth + 1, {TE, i});
      }
      return;
    }
    case Instruction::Store: {
      // Check if the stores are consecutive or if we need to swizzle them.
      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
          return;
        }

      auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");

      ValueList Operands;
      for (Value *j : VL)
        Operands.push_back(cast<Instruction>(j)->getOperand(0));

      buildTree_rec(Operands, Depth + 1, {TE, 0});
      return;
    }
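
    // Example (hypothetical IR): stores of %a..%d to %p, %p+1, %p+2, %p+3
    // pass the pairwise isConsecutiveAccess() chain check above, and only the
    // stored values (operand 0 of each store) are recursed on as one bundle.
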
    case Instruction::Call: {
      // Check if the calls are all to the same vectorizable intrinsic.
      CallInst *CI = cast<CallInst>(VL0);
      // Check if this is an Intrinsic call or something that can be
      // represented by an intrinsic call.
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (!isTriviallyVectorizable(ID)) {
        BS.cancelScheduling(VL, VL0);
        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
        LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
        return;
      }
      Function *Int = CI->getCalledFunction();
      unsigned NumArgs = CI->getNumArgOperands();
      SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
      for (unsigned j = 0; j != NumArgs; ++j)
        if (hasVectorInstrinsicScalarOpd(ID, j))
          ScalarArgs[j] = CI->getArgOperand(j);
      for (unsigned i = 1, e = VL.size(); i != e; ++i) {
        CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
        if (!CI2 || CI2->getCalledFunction() != Int ||
            getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
            !CI->hasIdenticalOperandBundleSchema(*CI2)) {
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                            << "\n");
          return;
        }
        // Some intrinsics have scalar arguments, and these must be the same
        // across the bundle for the calls to be vectorized.
        for (unsigned j = 0; j != NumArgs; ++j) {
          if (hasVectorInstrinsicScalarOpd(ID, j)) {
            Value *A1J = CI2->getArgOperand(j);
            if (ScalarArgs[j] != A1J) {
              BS.cancelScheduling(VL, VL0);
              newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
              LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                                << " argument " << ScalarArgs[j] << "!=" << A1J
                                << "\n");
              return;
            }
          }
        }
        // Verify that the bundle operands are identical between the two calls.
        if (CI->hasOperandBundles() &&
            !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                        CI->op_begin() + CI->getBundleOperandsEndIndex(),
                        CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
          BS.cancelScheduling(VL, VL0);
          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
                            << *CI << "!=" << *VL[i] << '\n');
          return;
        }
      }

      auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL) {
          CallInst *CI2 = dyn_cast<CallInst>(j);
          Operands.push_back(CI2->getArgOperand(i));
        }
        buildTree_rec(Operands, Depth + 1, {TE, i});
      }
      return;
    }
    case Instruction::ShuffleVector: {
      // If this is not an alternate sequence of opcodes like add-sub
      // then do not vectorize this instruction.
      if (!S.isAltShuffle()) {
        BS.cancelScheduling(VL, VL0);
        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
        LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
        return;
      }
      auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");

      // Reorder operands if reordering would enable vectorization.
      if (isa<BinaryOperator>(VL0)) {
        ValueList Left, Right;
        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
        buildTree_rec(Left, Depth + 1, {TE, 0});
        buildTree_rec(Right, Depth + 1, {TE, 1});
        return;
      }

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth + 1, {TE, i});
      }
      return;
    }
    default:
      BS.cancelScheduling(VL, VL0);
      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
      LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
      return;
  }
}

unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
  unsigned N;
  Type *EltTy;
  auto *ST = dyn_cast<StructType>(T);
  if (ST) {
    N = ST->getNumElements();
    EltTy = *ST->element_begin();
  } else {
    N = cast<ArrayType>(T)->getNumElements();
    EltTy = cast<ArrayType>(T)->getElementType();
  }
  if (!isValidElementType(EltTy))
    return 0;
  uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL.getTypeStoreSizeInBits(T))
    return 0;
  if (ST) {
    // Check that struct is homogeneous.
    for (const auto *Ty : ST->elements())
      if (Ty != EltTy)
        return 0;
  }
  return N;
}
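
// Example (hypothetical types, assuming MinVecRegSize = 128): the homogeneous
// struct { i32, i32, i32, i32 } maps to <4 x i32> (128 bits, matching the
// struct's store size), so canMapToVector() returns N = 4; the same-sized
// struct { i32, i32, float, float } passes the size checks but fails the
// homogeneity loop above and returns 0.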

bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
                              SmallVectorImpl<unsigned> &CurrentOrder) const {
  Instruction *E0 = cast<Instruction>(OpValue);
  assert(E0->getOpcode() == Instruction::ExtractElement ||
         E0->getOpcode() == Instruction::ExtractValue);
  assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    const DataLayout &DL = E0->getModule()->getDataLayout();
    NElts = canMapToVector(Vec->getType(), DL);
    if (!NElts)
      return false;
    // Check if load can be rewritten as load of vector.
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
      return false;
  } else {
    NElts = Vec->getType()->getVectorNumElements();
  }

  if (NElts != VL.size())
    return false;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  unsigned E = VL.size();
  // Assign to all items the initial value E + 1 so we can check if the extract
  // instruction index was used already.
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E + 1.
  CurrentOrder.assign(E, E + 1);
  unsigned I = 0;
  for (; I < E; ++I) {
    auto *Inst = cast<Instruction>(VL[I]);
    if (Inst->getOperand(0) != Vec)
      break;
    Optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      break;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx != I) {
      if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
        break;
      ShouldKeepOrder = false;
      CurrentOrder[ExtIdx] = I;
    } else {
      if (CurrentOrder[I] != E + 1)
        break;
      CurrentOrder[I] = I;
    }
  }
  if (I < E) {
    CurrentOrder.clear();
    return false;
  }

  return ShouldKeepOrder;
}
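
// Example (hypothetical IR): extracts of lanes 0, 1, 2, 3 of one <4 x i32>
// vector, listed in that order, leave CurrentOrder as the identity and return
// true; the same extracts listed as lanes 1, 0, 2, 3 return false with
// CurrentOrder = {1, 0, 2, 3}; and a repeated or out-of-range lane breaks out
// of the loop, clearing CurrentOrder and returning false.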

bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
  return I->hasOneUse() ||
         std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
           return ScalarToTreeEntry.count(U) > 0;
         });
}

int BoUpSLP::getEntryCost(TreeEntry *E) {
  ArrayRef<Value*> VL = E->Scalars;

  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
    ScalarTy = CI->getOperand(0)->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  // If we have computed a smaller type for the expression, update VecTy so
  // that the costs will be accurate.
  if (MinBWs.count(VL[0]))
    VecTy = VectorType::get(
        IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());

  unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
  int ReuseShuffleCost = 0;
  if (NeedToShuffleReuses) {
    ReuseShuffleCost =
        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
  }
  if (E->NeedToGather) {
    if (allConstant(VL))
      return 0;
    if (isSplat(VL)) {
      return ReuseShuffleCost +
             TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
    }
    if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement &&
        allSameType(VL) && allSameBlock(VL)) {
      Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
      if (ShuffleKind.hasValue()) {
        int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
        for (auto *V : VL) {
          // If all users of the instruction are going to be vectorized and
          // this instruction itself is not going to be vectorized, consider
          // this instruction as dead and remove its cost from the final cost
          // of the vectorized tree.
          if (areAllUsersVectorized(cast<Instruction>(V)) &&
              !ScalarToTreeEntry.count(V)) {
            auto *IO = cast<ConstantInt>(
                cast<ExtractElementInst>(V)->getIndexOperand());
            Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                            IO->getZExtValue());
          }
        }
        return ReuseShuffleCost + Cost;
      }
    }
    return ReuseShuffleCost + getGatherCost(VL);
  }
  InstructionsState S = getSameOpcode(VL);
  assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
  Instruction *VL0 = cast<Instruction>(S.OpValue);
  unsigned ShuffleOrOp = S.isAltShuffle() ?
               (unsigned) Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
    case Instruction::PHI:
      return 0;

    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
      if (NeedToShuffleReuses) {
        unsigned Idx = 0;
        for (unsigned I : E->ReuseShuffleIndices) {
          if (ShuffleOrOp == Instruction::ExtractElement) {
            auto *IO = cast<ConstantInt>(
                cast<ExtractElementInst>(VL[I])->getIndexOperand());
            Idx = IO->getZExtValue();
            ReuseShuffleCost -= TTI->getVectorInstrCost(
                Instruction::ExtractElement, VecTy, Idx);
          } else {
            ReuseShuffleCost -= TTI->getVectorInstrCost(
                Instruction::ExtractElement, VecTy, Idx);
            ++Idx;
          }
        }
        Idx = ReuseShuffleNumbers;
        for (Value *V : VL) {
          if (ShuffleOrOp == Instruction::ExtractElement) {
            auto *IO = cast<ConstantInt>(
                cast<ExtractElementInst>(V)->getIndexOperand());
            Idx = IO->getZExtValue();
          } else {
            --Idx;
          }
          ReuseShuffleCost +=
              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
        }
      }
      if (!E->NeedToGather) {
        int DeadCost = ReuseShuffleCost;
        if (!E->ReorderIndices.empty()) {
          // TODO: Merge this shuffle with the ReuseShuffleCost.
          DeadCost += TTI->getShuffleCost(
              TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
        }
        for (unsigned i = 0, e = VL.size(); i < e; ++i) {
          Instruction *E = cast<Instruction>(VL[i]);
          // If all users are going to be vectorized, the instruction can be
          // considered as dead.
          // Likewise, if it has only one user, it will be vectorized for sure.
          if (areAllUsersVectorized(E)) {
            // Take credit for instruction that will become dead.
            if (E->hasOneUse()) {
              Instruction *Ext = E->user_back();
              if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
                  all_of(Ext->users(),
                         [](User *U) { return isa<GetElementPtrInst>(U); })) {
                // Use getExtractWithExtendCost() to calculate the cost of
                // extractelement/ext pair.
                DeadCost -= TTI->getExtractWithExtendCost(
                    Ext->getOpcode(), Ext->getType(), VecTy, i);
                // Add back the cost of s|zext which is subtracted separately.
                DeadCost += TTI->getCastInstrCost(
                    Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
                continue;
              }
            }
            DeadCost -=
                TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
          }
        }
        return DeadCost;
      }
      return ReuseShuffleCost + getGatherCost(VL);

    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      Type *SrcTy = VL0->getOperand(0)->getType();
      int ScalarEltCost =
          TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }

      // Calculate the cost of this instruction.
      int ScalarCost = VL.size() * ScalarEltCost;

      VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
      int VecCost = 0;
      // Check if the values are candidates to demote.
      if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
        VecCost = ReuseShuffleCost +
                  TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0);
      }
      return VecCost - ScalarCost;
    }
    case Instruction::FCmp:
    case Instruction::ICmp:
    case Instruction::Select: {
      // Calculate the cost of this instruction.
      int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy,
                                                  Builder.getInt1Ty(), VL0);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
      int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
      int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0);
      return ReuseShuffleCost + VecCost - ScalarCost;
    }
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Certain instructions can be cheaper to vectorize if they have a
      // constant second vector operand.
      TargetTransformInfo::OperandValueKind Op1VK =
          TargetTransformInfo::OK_AnyValue;
      TargetTransformInfo::OperandValueKind Op2VK =
          TargetTransformInfo::OK_UniformConstantValue;
      TargetTransformInfo::OperandValueProperties Op1VP =
          TargetTransformInfo::OP_None;
      TargetTransformInfo::OperandValueProperties Op2VP =
          TargetTransformInfo::OP_PowerOf2;

      // If all operands are exactly the same ConstantInt then set the
      // operand kind to OK_UniformConstantValue.
      // If instead not all operands are constants, then set the operand kind
      // to OK_AnyValue. If all operands are constants but not the same,
      // then set the operand kind to OK_NonUniformConstantValue.
      ConstantInt *CInt0 = nullptr;
      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
        const Instruction *I = cast<Instruction>(VL[i]);
        unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
        ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
        if (!CInt) {
          Op2VK = TargetTransformInfo::OK_AnyValue;
          Op2VP = TargetTransformInfo::OP_None;
          break;
        }
        if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
            !CInt->getValue().isPowerOf2())
          Op2VP = TargetTransformInfo::OP_None;
        if (i == 0) {
          CInt0 = CInt;
          continue;
        }
        if (CInt0 != CInt)
          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
      }

      SmallVector<const Value *, 4> Operands(VL0->operand_values());
      int ScalarEltCost = TTI->getArithmeticInstrCost(
          S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
      int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK,
                                                Op2VK, Op1VP, Op2VP, Operands);
      return ReuseShuffleCost + VecCost - ScalarCost;
    }
    case Instruction::GetElementPtr: {
      TargetTransformInfo::OperandValueKind Op1VK =
          TargetTransformInfo::OK_AnyValue;
      TargetTransformInfo::OperandValueKind Op2VK =
          TargetTransformInfo::OK_UniformConstantValue;

      int ScalarEltCost =
          TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
      int VecCost =
          TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
      return ReuseShuffleCost + VecCost - ScalarCost;
    }
    case Instruction::Load: {
      // Cost of wide load - cost of scalar loads.
      unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
      int ScalarEltCost =
          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
      int VecLdCost =
          TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
      if (!E->ReorderIndices.empty()) {
        // TODO: Merge this shuffle with the ReuseShuffleCost.
        VecLdCost += TTI->getShuffleCost(
            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
      }
      return ReuseShuffleCost + VecLdCost - ScalarLdCost;
    }
    case Instruction::Store: {
      // We know that we can merge the stores. Calculate the cost.
      unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
      int ScalarEltCost =
          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
      int VecStCost =
          TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
      return ReuseShuffleCost + VecStCost - ScalarStCost;
    }
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

      // Calculate the cost of the scalar and vector calls.
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
        ScalarTys.push_back(CI->getArgOperand(op)->getType());

      FastMathFlags FMF;
      if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
        FMF = FPMO->getFastMathFlags();

      int ScalarEltCost =
          TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
      if (NeedToShuffleReuses) {
        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
      }
      int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;

      SmallVector<Value *, 4> Args(CI->arg_operands());
      int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
                                                   VecTy->getNumElements());

      LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
                        << " (" << VecCallCost << "-" << ScalarCallCost << ")"
                        << " for " << *CI << "\n");

      return ReuseShuffleCost + VecCallCost - ScalarCallCost;
    }
    case Instruction::ShuffleVector: {
      assert(S.isAltShuffle() &&
             ((Instruction::isBinaryOp(S.getOpcode()) &&
               Instruction::isBinaryOp(S.getAltOpcode())) ||
              (Instruction::isCast(S.getOpcode()) &&
               Instruction::isCast(S.getAltOpcode()))) &&
             "Invalid Shuffle Vector Operand");
      int ScalarCost = 0;
      if (NeedToShuffleReuses) {
        for (unsigned Idx : E->ReuseShuffleIndices) {
          Instruction *I = cast<Instruction>(VL[Idx]);
          ReuseShuffleCost -= TTI->getInstructionCost(
              I, TargetTransformInfo::TCK_RecipThroughput);
        }
        for (Value *V : VL) {
          Instruction *I = cast<Instruction>(V);
          ReuseShuffleCost += TTI->getInstructionCost(
              I, TargetTransformInfo::TCK_RecipThroughput);
        }
      }
      for (Value *i : VL) {
        Instruction *I = cast<Instruction>(i);
        assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
        ScalarCost += TTI->getInstructionCost(
            I, TargetTransformInfo::TCK_RecipThroughput);
      }
      // VecCost is equal to sum of the cost of creating 2 vectors
      // and the cost of creating shuffle.
      int VecCost = 0;
      if (Instruction::isBinaryOp(S.getOpcode())) {
        VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy);
        VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy);
      } else {
        Type *Src0SclTy = S.MainOp->getOperand(0)->getType();
        Type *Src1SclTy = S.AltOp->getOperand(0)->getType();
        VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
        VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
        VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty);
        VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty);
      }
      VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
      return ReuseShuffleCost + VecCost - ScalarCost;
    }
    default:
      llvm_unreachable("Unknown instruction");
  }
}
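
// Worked example (hypothetical TTI numbers): for a bundle of four i32 adds
// where a scalar add and a <4 x i32> add each cost 1, the binary-operator
// case above returns VecCost - ScalarCost = 1 - 4 * 1 = -3, i.e. the bundle
// is 3 units cheaper vectorized, before getTreeCost() adds any gather,
// extract, or spill costs.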

bool BoUpSLP::isFullyVectorizableTinyTree() const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather)
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores.
  if (!VectorizableTree[0]->NeedToGather &&
      (allConstant(VectorizableTree[1]->Scalars) ||
       isSplat(VectorizableTree[1]->Scalars)))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather)
    return false;

  return true;
}
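
// Example (hypothetical tree): a height-2 tree whose root is a vectorizable
// store bundle and whose single operand entry is a splat (the same value in
// every lane) is accepted by the splat/all-constants check above, even though
// such a tree is smaller than MinTreeSize.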

bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree())
    return false;

  assert(VectorizableTree.empty()
             ? ExternalUses.empty()
             : true && "We shouldn't have any external users");

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  // vectorizable.
  return true;
}

int BoUpSLP::getSpillCost() const {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree,
  // query TTI to see if there is a cost to keeping values live over it
  // (for example, if spills and fills are required).
  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
  int Cost = 0;

  SmallPtrSet<Instruction*, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  for (const auto &TEPtr : VectorizableTree) {
    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
    if (!Inst)
      continue;

    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    // Update LiveValues.
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }

    LLVM_DEBUG({
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
    });

    // Now find the sequence of instructions between PrevInst and Inst.
    unsigned NumCalls = 0;
    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
                                 PrevInstIt =
                                     PrevInst->getIterator().getReverse();
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }

      // Debug information doesn't impact spill cost.
      if ((isa<CallInst>(&*PrevInstIt) &&
           !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
          &*PrevInstIt != PrevInst)
        NumCalls++;

      ++PrevInstIt;
    }

    if (NumCalls) {
      SmallVector<Type*, 4> V;
      for (auto *II : LiveValues)
        V.push_back(VectorType::get(II->getType(), BundleWidth));
      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
    }

    PrevInst = Inst;
  }

  return Cost;
}
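
// Worked example (hypothetical IR, BundleWidth = 4): if two tree instructions
// are separated by one call to a non-intrinsic function while two i32 tree
// values are live across it, the loop above counts NumCalls = 1 and charges
// TTI->getCostOfKeepingLiveOverCall({<4 x i32>, <4 x i32>}) once, modeling
// the spill and fill of those vectors around the call.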

int BoUpSLP::getTreeCost() {
  int Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();

  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I].get();

    // We create duplicate tree entries for gather sequences that have multiple
    // uses. However, we should not compute the cost of duplicate sequences.
    // For example, if we have a build vector (i.e., insertelement sequence)
    // that is used by more than one vector instruction, we only need to
    // compute the cost of the insertelement instructions once. The redundant
    // instructions will be eliminated by CSE.
    //
    // We should consider not creating duplicate tree entries for gather
    // sequences, and instead add additional edges to the tree representing
    // their uses. Since such an approach results in fewer total entries,
    // existing heuristics based on tree size may yield different results.
    //
    if (TE.NeedToGather &&
        std::any_of(
            std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(),
            [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
              return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars);
            }))
      continue;

    int C = getEntryCost(&TE);
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                      << " for bundle that starts with " << *TE.Scalars[0]
                      << ".\n");
    Cost += C;
  }

  SmallPtrSet<Value *, 16> ExtractCostCalculated;
  int ExtractCost = 0;
  for (ExternalUser &EU : ExternalUses) {
    // We only add extract cost once for the same scalar.
    if (!ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // Uses by ephemeral values are free (because the ephemeral value will be
    // removed prior to code generation, and so the extraction will be
    // removed as well).
    if (EphValues.count(EU.User))
      continue;

    // If we plan to rewrite the tree in a smaller type, we will need to sign
    // extend the extracted value back to the original type. Here, we account
    // for the extract and the added cost of the sign extend if needed.
    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
    auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
    if (MinBWs.count(ScalarRoot)) {
      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
      auto Extend =
          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
      VecTy = VectorType::get(MinTy, BundleWidth);
      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
                                                   VecTy, EU.Lane);
    } else {
      ExtractCost +=
          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
    }
  }

  int SpillCost = getSpillCost();
  Cost += SpillCost + ExtractCost;

  std::string Str;
  {
    raw_string_ostream OS(Str);
    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
       << "SLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);

  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);

  return Cost;
}

int BoUpSLP::getGatherCost(Type *Ty,
                           const DenseSet<unsigned> &ShuffledIndices) const {
  int Cost = 0;
  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
    if (!ShuffledIndices.count(i))
      Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
  if (!ShuffledIndices.empty())
    Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
  return Cost;
}

int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
  // Find the type of the operands in VL.
  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  DenseSet<unsigned> ShuffledElements;
  DenseSet<Value *> UniqueElements;
  // Iterate in reverse order to consider insert elements with the high cost.
  for (unsigned I = VL.size(); I > 0; --I) {
    unsigned Idx = I - 1;
    if (!UniqueElements.insert(VL[Idx]).second)
      ShuffledElements.insert(Idx);
  }
  return getGatherCost(VecTy, ShuffledElements);
}
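
// Worked example: for VL = {%a, %b, %a, %c}, the reverse walk above inserts
// indices 3, 2, 1 into UniqueElements and marks index 0 (the earlier copy of
// %a) as shuffled, so getGatherCost(VecTy, {0}) charges three insertelement
// costs plus one SK_PermuteSingleSrc shuffle instead of four inserts.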

// Perform operand reordering on the instructions in VL and return the reordered
// operands in Left and Right.
void BoUpSLP::reorderInputsAccordingToOpcode(
    ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
    SmallVectorImpl<Value *> &Right, const DataLayout &DL,
    ScalarEvolution &SE) {
  if (VL.empty())
    return;
  VLOperands Ops(VL, DL, SE);
  // Reorder the operands in place.
  Ops.reorder();
  Left = Ops.getVL(0);
  Right = Ops.getVL(1);
}

void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL,
                                        const InstructionsState &S) {
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block.
  auto *Front = cast<Instruction>(S.OpValue);
  auto *BB = Front->getParent();
  assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
    auto *I = cast<Instruction>(V);
    return !S.isOpcodeOrAlt(I) || I->getParent() == BB;
  }));

  // The last instruction in the bundle in program order.
  Instruction *LastInst = nullptr;

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (BlocksSchedules.count(BB)) {
    auto *Bundle =
        BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back()));
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        if (Bundle->OpValue == Bundle->Inst)
          LastInst = Bundle->Inst;
  }

  // LastInst can still be null at this point if there's either not an entry
  // for BB in BlocksSchedules or there's no ScheduleData available for
  // VL.back(). This can be the case if buildTree_rec aborts for various
  // reasons (e.g., the maximum recursion depth is reached, the maximum region
  // size is reached, etc.). ScheduleData is initialized in the scheduling
  // "dry-run".
  //
  // If this happens, we can still find the last instruction by brute force. We
  // iterate forwards from Front (inclusive) until we either see all
  // instructions in the bundle or reach the end of the block. If Front is the
  // last instruction in program order, LastInst will be set to Front, and we
  // will visit all the remaining instructions in the block.
  //
  // One of the reasons we exit early from buildTree_rec is to place an upper
  // bound on compile-time. Thus, taking an additional compile-time hit here is
  // not ideal. However, this should be exceedingly rare since it requires that
  // we both exit early from buildTree_rec and that the bundle be out-of-order
  // (causing us to iterate all the way to the end of the block).
  if (!LastInst) {
    SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
    for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
      if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I))
        LastInst = &I;
      if (Bundle.empty())
        break;
    }
  }

  // Set the insertion point after the last instruction in the bundle. Set the
  // debug location to Front.
  Builder.SetInsertPoint(BB, ++LastInst->getIterator());
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}

Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
  Value *Vec = UndefValue::get(Ty);
  // Generate the 'InsertElement' instructions.
  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
    if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
      GatherSeq.insert(Insrt);
      CSEBlocks.insert(Insrt->getParent());

      // Add to our 'need-to-extract' list.
      if (TreeEntry *E = getTreeEntry(VL[i])) {
        // Find which lane we need to extract.
        int FoundLane = -1;
        for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
          // Is this the lane of the scalar that we are looking for ?
          if (E->Scalars[Lane] == VL[i]) {
            FoundLane = Lane;
            break;
          }
        }
        assert(FoundLane >= 0 && "Could not find the correct lane");
        if (!E->ReuseShuffleIndices.empty()) {
          FoundLane =
              std::distance(E->ReuseShuffleIndices.begin(),
                            llvm::find(E->ReuseShuffleIndices, FoundLane));
        }
        ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
      }
    }
  }

  return Vec;
}
3416
3417
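The lane remap above (when ReuseShuffleIndices is non-empty) can be shown in isolation: the external use must point at the first mask position that reads the found lane. A sketch on plain std containers (hypothetical helper, not part of the pass):

#include <algorithm>
#include <iterator>
#include <vector>

// With a reuse mask, the deduplicated lane FoundLane may feed several
// positions of the final vector; the extract must use the first of them.
unsigned remapLane(const std::vector<unsigned> &ReuseShuffleIndices,
                   unsigned FoundLane) {
  return std::distance(
      ReuseShuffleIndices.begin(),
      std::find(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end(),
                FoundLane));
}
// Example: with mask {0, 1, 1, 0}, lane 1 remaps to position 1 and lane 0
// to position 0.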
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
  InstructionsState S = getSameOpcode(VL);
  if (S.getOpcode()) {
    if (TreeEntry *E = getTreeEntry(S.OpValue)) {
      if (E->isSame(VL)) {
        Value *V = vectorizeTree(E);
        if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
          // We need to get the vectorized value but without shuffle.
          if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
            V = SV->getOperand(0);
          } else {
            // Reshuffle to get only unique values.
            SmallVector<unsigned, 4> UniqueIdxs;
            SmallSet<unsigned, 4> UsedIdxs;
            for (unsigned Idx : E->ReuseShuffleIndices)
              if (UsedIdxs.insert(Idx).second)
                UniqueIdxs.emplace_back(Idx);
            V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
                                            UniqueIdxs);
          }
        }
        return V;
      }
    }
  }

  Type *ScalarTy = S.OpValue->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
    ScalarTy = SI->getValueOperand()->getType();

  // Check that every instruction appears once in this bundle.
  SmallVector<unsigned, 4> ReuseShuffleIndicies;
  SmallVector<Value *, 4> UniqueValues;
  if (VL.size() > 2) {
    DenseMap<Value *, unsigned> UniquePositions;
    for (Value *V : VL) {
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      ReuseShuffleIndicies.emplace_back(Res.first->second);
      if (Res.second || isa<Constant>(V))
        UniqueValues.emplace_back(V);
    }
    // Do not shuffle a single element, or if the number of unique values is
    // not a power of 2.
    if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
        !llvm::isPowerOf2_32(UniqueValues.size()))
      ReuseShuffleIndicies.clear();
    else
      VL = UniqueValues;
  }
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  Value *V = Gather(VL, VecTy);
  if (!ReuseShuffleIndicies.empty()) {
    V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                    ReuseShuffleIndicies, "shuffle");
    if (auto *I = dyn_cast<Instruction>(V)) {
      GatherSeq.insert(I);
      CSEBlocks.insert(I->getParent());
    }
  }
  return V;
}
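The deduplication above gathers only the unique values and then broadcasts them back via a shuffle mask, but only when the unique count is a power of 2 greater than 1. A standalone sketch on ints (the constant special case is dropped for brevity; names are hypothetical):

#include <cstdint>
#include <unordered_map>
#include <vector>

static bool isPowerOf2(uint32_t N) { return N && (N & (N - 1)) == 0; }

// Returns the reuse mask, or an empty mask when shuffling is not worthwhile
// (all values unique, fewer than two unique values, or a non-power-of-2
// unique count). On success VL is shrunk to the unique values.
std::vector<unsigned> buildReuseMask(std::vector<int> &VL) {
  std::unordered_map<int, unsigned> UniquePositions;
  std::vector<int> UniqueValues;
  std::vector<unsigned> Mask;
  for (int V : VL) {
    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
    Mask.push_back(Res.first->second); // Position of V's unique copy.
    if (Res.second)
      UniqueValues.push_back(V);
  }
  if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
      !isPowerOf2(UniqueValues.size()))
    Mask.clear();
  else
    VL = UniqueValues; // Gather only the unique values, then shuffle.
  return Mask;
}
// Example: VL = {a, b, a, b} yields Mask = {0, 1, 0, 1} and VL = {a, b}.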
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<unsigned> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
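A quick worked example: for Indices = {2, 0, 1} the loop writes Mask[2] = 0, Mask[0] = 1, Mask[1] = 2, so Mask = {1, 2, 0}, the permutation that undoes the reorder. A standalone equivalent:

#include <vector>

std::vector<unsigned> inversePerm(const std::vector<unsigned> &Indices) {
  std::vector<unsigned> Mask(Indices.size());
  for (unsigned I = 0; I < Indices.size(); ++I)
    Mask[Indices[I]] = I; // The element that landed at Indices[I] came from I.
  return Mask;
}
// inversePerm({2, 0, 1}) == {1, 2, 0}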
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilder<>::InsertPointGuard Guard(Builder);

  if (E->VectorizedValue) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  InstructionsState S = getSameOpcode(E->Scalars);
  Instruction *VL0 = cast<Instruction>(S.OpValue);
  Type *ScalarTy = VL0->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());

  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();

  if (E->NeedToGather) {
    setInsertPointAfterBundle(E->Scalars, S);
    auto *V = Gather(E->Scalars, VecTy);
    if (NeedToShuffleReuses) {
      V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                      E->ReuseShuffleIndices, "shuffle");
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
    E->VectorizedValue = V;
    return V;
  }

  unsigned ShuffleOrOp = S.isAltShuffle() ?
           (unsigned) Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
    case Instruction::PHI: {
      PHINode *PH = dyn_cast<PHINode>(VL0);
      Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
      Value *V = NewPhi;
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;

      // PHINodes may have multiple entries from the same block. We want to
      // visit every block once.
      SmallPtrSet<BasicBlock*, 4> VisitedBBs;

      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        ValueList Operands;
        BasicBlock *IBB = PH->getIncomingBlock(i);

        if (!VisitedBBs.insert(IBB).second) {
          NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
          continue;
        }

        Builder.SetInsertPoint(IBB->getTerminator());
        Builder.SetCurrentDebugLocation(PH->getDebugLoc());
        Value *Vec = vectorizeTree(E->getOperand(i));
        NewPhi->addIncoming(Vec, IBB);
      }

      assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
             "Invalid number of incoming values");
      return V;
    }
    case Instruction::ExtractElement: {
      if (!E->NeedToGather) {
        Value *V = E->getSingleOperand(0);
        if (!E->ReorderIndices.empty()) {
          OrdersType Mask;
          inversePermutation(E->ReorderIndices, Mask);
          Builder.SetInsertPoint(VL0);
          V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
                                          "reorder_shuffle");
        }
        if (NeedToShuffleReuses) {
          // TODO: Merge this shuffle with the ReorderShuffleMask.
          if (E->ReorderIndices.empty())
            Builder.SetInsertPoint(VL0);
          V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                          E->ReuseShuffleIndices, "shuffle");
        }
        E->VectorizedValue = V;
        return V;
      }
      setInsertPointAfterBundle(E->Scalars, S);
      auto *V = Gather(E->Scalars, VecTy);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
      E->VectorizedValue = V;
      return V;
    }
    case Instruction::ExtractValue: {
      if (!E->NeedToGather) {
        LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
        Builder.SetInsertPoint(LI);
        PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
        Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
        LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment());
        Value *NewV = propagateMetadata(V, E->Scalars);
        if (!E->ReorderIndices.empty()) {
          OrdersType Mask;
          inversePermutation(E->ReorderIndices, Mask);
          NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
                                             "reorder_shuffle");
        }
        if (NeedToShuffleReuses) {
          // TODO: Merge this shuffle with the ReorderShuffleMask.
          NewV = Builder.CreateShuffleVector(
              NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
        }
        E->VectorizedValue = NewV;
        return NewV;
      }
      setInsertPointAfterBundle(E->Scalars, S);
      auto *V = Gather(E->Scalars, VecTy);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
      E->VectorizedValue = V;
      return V;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      setInsertPointAfterBundle(E->Scalars, S);

      Value *InVec = vectorizeTree(E->getOperand(0));

      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      CastInst *CI = dyn_cast<CastInst>(VL0);
      Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::FCmp:
    case Instruction::ICmp: {
      setInsertPointAfterBundle(E->Scalars, S);

      Value *L = vectorizeTree(E->getOperand(0));
      Value *R = vectorizeTree(E->getOperand(1));

      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      Value *V;
      if (S.getOpcode() == Instruction::FCmp)
        V = Builder.CreateFCmp(P0, L, R);
      else
        V = Builder.CreateICmp(P0, L, R);

      propagateIRFlags(V, E->Scalars, VL0);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::Select: {
      setInsertPointAfterBundle(E->Scalars, S);

      Value *Cond = vectorizeTree(E->getOperand(0));
      Value *True = vectorizeTree(E->getOperand(1));
      Value *False = vectorizeTree(E->getOperand(2));

      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      Value *V = Builder.CreateSelect(Cond, True, False);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::FNeg: {
      setInsertPointAfterBundle(E->Scalars, S);

      Value *Op = vectorizeTree(E->getOperand(0));

      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      Value *V = Builder.CreateUnOp(
          static_cast<Instruction::UnaryOps>(S.getOpcode()), Op);
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = propagateMetadata(I, E->Scalars);

      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      setInsertPointAfterBundle(E->Scalars, S);

      Value *LHS = vectorizeTree(E->getOperand(0));
      Value *RHS = vectorizeTree(E->getOperand(1));

      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      Value *V = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = propagateMetadata(I, E->Scalars);

      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
    case Instruction::Load: {
      // Loads are inserted at the head of the tree because we don't want to
      // sink them all the way down past store instructions.
      bool IsReorder = !E->ReorderIndices.empty();
      if (IsReorder) {
        S = getSameOpcode(E->Scalars, E->ReorderIndices.front());
        VL0 = cast<Instruction>(S.OpValue);
      }
      setInsertPointAfterBundle(E->Scalars, S);

      LoadInst *LI = cast<LoadInst>(VL0);
      Type *ScalarLoadTy = LI->getType();
      unsigned AS = LI->getPointerAddressSpace();

      Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
                                            VecTy->getPointerTo(AS));

      // The pointer operand uses an in-tree scalar so we add the new BitCast to
      // ExternalUses list to make sure that an extract will be generated in the
      // future.
      Value *PO = LI->getPointerOperand();
      if (getTreeEntry(PO))
        ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));

      unsigned Alignment = LI->getAlignment();
      LI = Builder.CreateLoad(VecTy, VecPtr);
      if (!Alignment) {
        Alignment = DL->getABITypeAlignment(ScalarLoadTy);
      }
      LI->setAlignment(Alignment);
      Value *V = propagateMetadata(LI, E->Scalars);
      if (IsReorder) {
        OrdersType Mask;
        inversePermutation(E->ReorderIndices, Mask);
        V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
                                        Mask, "reorder_shuffle");
      }
      if (NeedToShuffleReuses) {
        // TODO: Merge this shuffle with the ReorderShuffleMask.
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::Store: {
      StoreInst *SI = cast<StoreInst>(VL0);
      unsigned Alignment = SI->getAlignment();
      unsigned AS = SI->getPointerAddressSpace();

      setInsertPointAfterBundle(E->Scalars, S);

      Value *VecValue = vectorizeTree(E->getOperand(0));
      Value *ScalarPtr = SI->getPointerOperand();
      Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
      StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);

      // The pointer operand uses an in-tree scalar, so add the new BitCast to
      // ExternalUses to make sure that an extract will be generated in the
      // future.
      if (getTreeEntry(ScalarPtr))
        ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));

      if (!Alignment)
        Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());

      ST->setAlignment(Alignment);
      Value *V = propagateMetadata(ST, E->Scalars);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::GetElementPtr: {
      setInsertPointAfterBundle(E->Scalars, S);

      Value *Op0 = vectorizeTree(E->getOperand(0));

      std::vector<Value *> OpVecs;
      for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
           ++j) {
        Value *OpVec = vectorizeTree(E->getOperand(j));
        OpVecs.push_back(OpVec);
      }

      Value *V = Builder.CreateGEP(
          cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
      if (Instruction *I = dyn_cast<Instruction>(V))
        V = propagateMetadata(I, E->Scalars);

      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      setInsertPointAfterBundle(E->Scalars, S);

      Intrinsic::ID IID = Intrinsic::not_intrinsic;
      if (Function *FI = CI->getCalledFunction())
        IID = FI->getIntrinsicID();

      Value *ScalarArg = nullptr;
      std::vector<Value *> OpVecs;
      for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
        ValueList OpVL;
        // Some intrinsics have scalar arguments. This argument should not be
        // vectorized.
        if (hasVectorInstrinsicScalarOpd(IID, j)) {
          CallInst *CEI = cast<CallInst>(VL0);
          ScalarArg = CEI->getArgOperand(j);
          OpVecs.push_back(CEI->getArgOperand(j));
          continue;
        }

        Value *OpVec = vectorizeTree(E->getOperand(j));
        LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
        OpVecs.push_back(OpVec);
      }

      Module *M = F->getParent();
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
      Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
      SmallVector<OperandBundleDef, 1> OpBundles;
      CI->getOperandBundlesAsDefs(OpBundles);
      Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

      // The scalar argument uses an in-tree scalar so we add the new vectorized
      // call to ExternalUses list to make sure that an extract will be
      // generated in the future.
      if (ScalarArg && getTreeEntry(ScalarArg))
        ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));

      propagateIRFlags(V, E->Scalars, VL0);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::ShuffleVector: {
      assert(S.isAltShuffle() &&
             ((Instruction::isBinaryOp(S.getOpcode()) &&
               Instruction::isBinaryOp(S.getAltOpcode())) ||
              (Instruction::isCast(S.getOpcode()) &&
               Instruction::isCast(S.getAltOpcode()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(S.getOpcode())) {
        setInsertPointAfterBundle(E->Scalars, S);
        LHS = vectorizeTree(E->getOperand(0));
        RHS = vectorizeTree(E->getOperand(1));
      } else {
        setInsertPointAfterBundle(E->Scalars, S);
        LHS = vectorizeTree(E->getOperand(0));
      }

      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(S.getOpcode())) {
        V0 = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS);
      } else {
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy);
      }

      // Create shuffle to take alternate operations from the vector.
      // Also, gather up main and alt scalar ops to propagate IR flags to
      // each vector operation.
      ValueList OpScalars, AltScalars;
      unsigned e = E->Scalars.size();
      SmallVector<Constant *, 8> Mask(e);
      for (unsigned i = 0; i < e; ++i) {
        auto *OpInst = cast<Instruction>(E->Scalars[i]);
        assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
        if (OpInst->getOpcode() == S.getAltOpcode()) {
          Mask[i] = Builder.getInt32(e + i);
          AltScalars.push_back(E->Scalars[i]);
        } else {
          Mask[i] = Builder.getInt32(i);
          OpScalars.push_back(E->Scalars[i]);
        }
      }

      Value *ShuffleMask = ConstantVector::get(Mask);
      propagateIRFlags(V0, OpScalars);
      propagateIRFlags(V1, AltScalars);

      Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
      if (Instruction *I = dyn_cast<Instruction>(V))
        V = propagateMetadata(I, E->Scalars);
      if (NeedToShuffleReuses) {
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                        E->ReuseShuffleIndices, "shuffle");
      }
      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
    default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
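For the ShuffleVector (alternate opcode) case above, the mask picks lane i from V0 for main-opcode scalars and lane e + i from V1 for alternate-opcode scalars. A sketch of just the mask construction over a boolean "is alternate" vector (hypothetical helper, not the pass's code):

#include <vector>

// E.g., for <a0+b0, a1-b1, a2+b2, a3-b3> with e = 4 and IsAlt = {0,1,0,1},
// the result is {0, 5, 2, 7}: even lanes come from the vector add (V0),
// odd lanes from the vector sub (V1).
std::vector<unsigned> buildAltShuffleMask(const std::vector<bool> &IsAlt) {
  unsigned E = IsAlt.size();
  std::vector<unsigned> Mask(E);
  for (unsigned I = 0; I < E; ++I)
    Mask[I] = IsAlt[I] ? E + I : I;
  return Mask;
}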
Value *BoUpSLP::vectorizeTree() {
  ExtraValueToDebugLocsMap ExternallyUsedValues;
  return vectorizeTree(ExternallyUsedValues);
}
Value *
BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }

  Builder.SetInsertPoint(&F->getEntryBlock().front());
  auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());

  // If the vectorized tree can be rewritten in a smaller type, we truncate the
  // vectorized root. InstCombine will then rewrite the entire expression. We
  // sign extend the extracted values below.
  auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
  if (MinBWs.count(ScalarRoot)) {
    if (auto *I = dyn_cast<Instruction>(VectorRoot))
      Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
    auto BundleWidth = VectorizableTree[0]->Scalars.size();
    auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
    auto *VecTy = VectorType::get(MinTy, BundleWidth);
    auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
    VectorizableTree[0]->VectorizedValue = Trunc;
  }

  LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
                    << " values .\n");

  // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
  // specified by ScalarType.
  auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
    if (!MinBWs.count(ScalarRoot))
      return Ex;
    if (MinBWs[ScalarRoot].second)
      return Builder.CreateSExt(Ex, ScalarType);
    return Builder.CreateZExt(Ex, ScalarType);
  };

  // Extract all of the elements with the external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already RAUW. This happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(!E->NeedToGather && "Extracting from a gather list");

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    // If User == nullptr, the Scalar is used as extra arg. Generate
    // ExtractElement instruction and update the record for this scalar in
    // ExternallyUsedValues.
    if (!User) {
      assert(ExternallyUsedValues.count(Scalar) &&
             "Scalar with nullptr as an external user must be registered in "
             "ExternallyUsedValues map");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        Builder.SetInsertPoint(VecI->getParent(),
                               std::next(VecI->getIterator()));
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock().front());
      }
      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
      Ex = extend(ScalarRoot, Ex, Scalar->getType());
      CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
      auto &Locs = ExternallyUsedValues[Scalar];
      ExternallyUsedValues.insert({Ex, Locs});
      ExternallyUsedValues.erase(Scalar);
      // Required to update internally referenced instructions.
      Scalar->replaceAllUsesWith(Ex);
      continue;
    }

    // Generate extracts for out-of-tree users.
    // Find the insertion point for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
          if (PH->getIncomingValue(i) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(i)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
            }
            Value *Ex = Builder.CreateExtractElement(Vec, Lane);
            Ex = extend(ScalarRoot, Ex, Scalar->getType());
            CSEBlocks.insert(PH->getIncomingBlock(i));
            PH->setOperand(i, Ex);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *Ex = Builder.CreateExtractElement(Vec, Lane);
        Ex = extend(ScalarRoot, Ex, Scalar->getType());
        CSEBlocks.insert(cast<Instruction>(User)->getParent());
        User->replaceUsesOfWith(Scalar, Ex);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock().front());
      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
      Ex = extend(ScalarRoot, Ex, Scalar->getType());
      CSEBlocks.insert(&F->getEntryBlock());
      User->replaceUsesOfWith(Scalar, Ex);
    }

    LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  }

  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->NeedToGather)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
#ifndef NDEBUG
        for (User *U : Scalar->users()) {
          LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");

          // It is legal to replace users in the ignorelist by undef.
          assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
                 "Replacing out-of-tree value with undef");
        }
#endif
        Value *Undef = UndefValue::get(Ty);
        Scalar->replaceAllUsesWith(Undef);
      }
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      eraseInstruction(cast<Instruction>(Scalar));
    }
  }

  Builder.ClearInsertionPoint();

  return VectorizableTree[0]->VectorizedValue;
}
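The extend lambda above decides between sign- and zero-extension per root scalar using the flag recorded in MinBWs when the tree was demoted to a narrower type. A minimal standalone analog of that decision, with the type reduced to a width/signedness pair (all names hypothetical):

#include <cstdint>

// MinBWs[...].second in the pass records whether the demoted value must be
// sign-extended; this models the same choice for an 8-bit payload widened
// back to 32 bits.
int32_t extendScalar(uint8_t Bits, bool NeedsSExt) {
  return NeedsSExt ? static_cast<int32_t>(static_cast<int8_t>(Bits))
                   : static_cast<int32_t>(Bits);
}
// extendScalar(0xFF, true) == -1, extendScalar(0xFF, false) == 255.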
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherSeq) {
    if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are
    // instructions that are defined in this basic block then we can't
    // hoist this instruction.
    auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
    auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
    if (Op0 && L->contains(Op0))
      continue;
    if (Op1 && L->contains(Op1))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator());
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::stable_sort(CSEWorkList,
                    [this](const DomTreeNode *A, const DomTreeNode *B) {
                      return DT->properlyDominates(A, B);
                    });

  // Perform O(N^2) search over the gather sequences and merge identical
  // instructions. TODO: We can further optimize this scan if we split the
  // instructions into different buckets based on the insert lane.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
      Instruction *In = &*it++;
      if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
        continue;

      // Check if we can replace this instruction with any of the
      // visited instructions.
      for (Instruction *v : Visited) {
        if (In->isIdenticalTo(v) &&
            DT->dominates(v->getParent(), In->getParent())) {
          In->replaceAllUsesWith(v);
          eraseInstruction(In);
          In = nullptr;
          break;
        }
      }
      if (In) {
        assert(!is_contained(Visited, In));
        Visited.push_back(In);
      }
    }
  }
  CSEBlocks.clear();
  GatherSeq.clear();
}
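The quadratic CSE scan above can be pictured on plain values: keep a Visited list and, for each new candidate, search it for an already-kept identical entry. This sketch uses string equality as a stand-in for Instruction::isIdenticalTo and omits the dominance check, which the real scan needs so the kept copy is usable at the duplicate's location:

#include <string>
#include <vector>

// Returns the deduplicated sequence; Replaced[i] gives the index in the
// result that input i was folded into.
std::vector<std::string> cseScan(const std::vector<std::string> &Insts,
                                 std::vector<unsigned> &Replaced) {
  std::vector<std::string> Visited;
  Replaced.clear();
  for (const std::string &In : Insts) {
    unsigned Idx = Visited.size();
    for (unsigned V = 0; V < Visited.size(); ++V) {
      if (In == Visited[V]) { // isIdenticalTo stand-in.
        Idx = V;              // Reuse the earlier copy.
        break;
      }
    }
    if (Idx == Visited.size())
      Visited.push_back(In); // First of its kind; keep it.
    Replaced.push_back(Idx);
  }
  return Visited;
}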
// Groups the instructions to a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
                                                 BoUpSLP *SLP,
                                                 const InstructionsState &S) {
  if (isa<PHINode>(S.OpValue))
    return true;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  ScheduleData *PrevInBundle = nullptr;
  ScheduleData *Bundle = nullptr;
  bool ReSchedule = false;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.OpValue << "\n");

  // Make sure that the scheduling region contains all
  // instructions of the bundle.
  for (Value *V : VL) {
    if (!extendSchedulingRegion(V, S))
      return false;
  }

  for (Value *V : VL) {
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    if (BundleMember->IsScheduled) {
      // A bundle member was scheduled as single instruction before and now
      // needs to be scheduled as part of the bundle. We just get rid of the
      // existing schedule.
      LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                        << " was already scheduled\n");
      ReSchedule = true;
    }
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }
    BundleMember->UnscheduledDepsInBundle = 0;
    Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  if (ScheduleEnd != OldScheduleEnd) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    // It is seldom that this needs to be done a second time after adding the
    // initial bundle to the region.
    for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
      doForAllOpcodes(I, [](ScheduleData *SD) {
        SD->clearDependencies();
      });
    }
    ReSchedule = true;
  }
  if (ReSchedule) {
    resetSchedule();
    initialFillReadyList(ReadyInsts);
  }

  LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
                    << BB->getName() << "\n");

  calculateDependencies(Bundle, true, SLP);

  // Now try to schedule the new bundle. As soon as the bundle is "ready" it
  // means that there are no cyclic dependencies and we can schedule it.
  // Note that it's important that we don't "schedule" the bundle yet (see
  // cancelScheduling).
  while (!Bundle->isReady() && !ReadyInsts.empty()) {

    ScheduleData *pickedSD = ReadyInsts.back();
    ReadyInsts.pop_back();

    if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
      schedule(pickedSD, ReadyInsts);
    }
  }
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.OpValue);
    return false;
  }
  return true;
}
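The bundle bookkeeping in tryScheduleBundle and cancelScheduling is an intrusive singly linked list whose head carries the aggregate pending-dependency count. A reduced sketch of the linking step (field names mirror the pass; the struct itself is hypothetical):

struct SchedNode {
  SchedNode *FirstInBundle = nullptr; // Head of the bundle (self if single).
  SchedNode *NextInBundle = nullptr;  // Next member; nullptr at the tail.
  int UnscheduledDeps = 0;            // Per-instruction pending deps.
  int UnscheduledDepsInBundle = 0;    // Aggregate, maintained on the head.
};

// Link Members[0..N) into one bundle; pending deps accumulate on the head,
// exactly as in tryScheduleBundle. Undoing this (cancelScheduling) restores
// each node's own count and self-links FirstInBundle.
SchedNode *makeBundle(SchedNode **Members, unsigned N) {
  SchedNode *Head = Members[0];
  SchedNode *Prev = nullptr;
  for (unsigned I = 0; I < N; ++I) {
    SchedNode *M = Members[I];
    if (Prev)
      Prev->NextInBundle = M;
    M->FirstInBundle = Head;
    M->UnscheduledDepsInBundle = 0;
    Head->UnscheduledDepsInBundle += M->UnscheduledDeps;
    Prev = M;
  }
  return Head;
}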
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
                                                Value *OpValue) {
  if (isa<PHINode>(OpValue))
    return;

  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
         "tried to unbundle something which is not a bundle");

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
    if (BundleMember->UnscheduledDepsInBundle == 0) {
      ReadyInsts.insert(BundleMember);
    }
    BundleMember = Next;
  }
}
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
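allocateScheduleDataChunks amortizes allocation by handing out slots from fixed-size arrays, and node addresses stay stable because chunks are never reallocated, only appended. A generic sketch of the same pool pattern:

#include <memory>
#include <vector>

template <typename T, unsigned ChunkSize = 256> class ChunkPool {
  std::vector<std::unique_ptr<T[]>> Chunks;
  unsigned Pos = ChunkSize; // Forces a chunk allocation on first use.

public:
  // Pointers returned here remain valid for the pool's lifetime, since
  // existing chunks are never moved or resized.
  T *allocate() {
    if (Pos >= ChunkSize) {
      Chunks.push_back(std::make_unique<T[]>(ChunkSize));
      Pos = 0;
    }
    return &Chunks.back()[Pos++];
  }
};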
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
                                                      const InstructionsState &S) {
  if (getScheduleData(V, isOneOf(S, V)))
    return true;
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
  auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
    ScheduleData *ISD = getScheduleData(I);
    if (!ISD)
      return false;
    assert(isInSchedulingRegion(ISD) &&
           "ScheduleData not in scheduling region");
    ScheduleData *SD = allocateScheduleDataChunks();
    SD->Inst = I;
    SD->init(SchedulingRegionID, S.OpValue);
    ExtraScheduleDataMap[I][S.OpValue] = SD;
    return true;
  };
  if (CheckSheduleForI(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    if (isOneOf(S, I) != I)
      CheckSheduleForI(I);
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  while (true) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }

    if (UpIter != UpperEnd) {
      if (&*UpIter == I) {
        initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
        ScheduleStart = I;
        if (isOneOf(S, I) != I)
          CheckSheduleForI(I);
        LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                          << "\n");
        return true;
      }
      ++UpIter;
    }
    if (DownIter != LowerEnd) {
      if (&*DownIter == I) {
        initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                         nullptr);
        ScheduleEnd = I->getNextNode();
        if (isOneOf(S, I) != I)
          CheckSheduleForI(I);
        assert(ScheduleEnd && "tried to vectorize a terminator?");
        LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I
                          << "\n");
        return true;
      }
      ++DownIter;
    }
    assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
           "instruction not found in block");
  }
  return true;
}
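extendSchedulingRegion grows the region by stepping one instruction up and one down per iteration, so the cost is proportional to the distance from the nearest region edge, and the walk gives up once the size budget is spent. The same two-cursor walk on plain indices (a sketch; the return encoding and names are hypothetical):

// Expand the window [Lo, Hi) one step in each direction per iteration.
// Returns 1 if Target is found above the window, 2 if below, -1 if the
// budget is exhausted or Target is not in [0, Size) at all (where the
// real code asserts instead).
int growWindow(int Lo, int Hi, int Target, int Size, int Budget) {
  int Up = Lo - 1, Down = Hi;
  while (true) {
    if (--Budget < 0)
      return -1; // Mirrors ScheduleRegionSizeLimit.
    if (Up >= 0) {
      if (Up == Target)
        return 1; // Extend the region start upward.
      --Up;
    }
    if (Down < Size) {
      if (Down == Target)
        return 2; // Extend the region end downward.
      ++Down;
    }
    if (Up < 0 && Down >= Size)
      return -1; // Both cursors exhausted; Target not in the block.
  }
}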
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    ScheduleData *SD = ScheduleDataMap[I];
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
      SD->Inst = I;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.back();
    WorkList.pop_back();

    ScheduleData *BundleMember = SD;
    while (BundleMember) {
      assert(isInSchedulingRegion(BundleMember));
      if (!BundleMember->hasValidDependencies()) {

        LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember
                          << "\n");
        BundleMember->Dependencies = 0;
        BundleMember->resetUnscheduledDeps();

        // Handle def-use chain dependencies.
        if (BundleMember->OpValue != BundleMember->Inst) {
          ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
          if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
            BundleMember->Dependencies++;
            ScheduleData *DestBundle = UseSD->FirstInBundle;
            if (!DestBundle->IsScheduled)
              BundleMember->incrementUnscheduledDeps(1);
            if (!DestBundle->hasValidDependencies())
              WorkList.push_back(DestBundle);
          }
        } else {
          for (User *U : BundleMember->Inst->users()) {
            if (isa<Instruction>(U)) {
              ScheduleData *UseSD = getScheduleData(U);
              if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
                BundleMember->Dependencies++;
                ScheduleData *DestBundle = UseSD->FirstInBundle;
                if (!DestBundle->IsScheduled)
                  BundleMember->incrementUnscheduledDeps(1);
                if (!DestBundle->hasValidDependencies())
                  WorkList.push_back(DestBundle);
              }
            } else {
              // I'm not sure if this can ever happen. But we need to be safe.
              // This lets the instruction/bundle never be scheduled and
              // eventually disable vectorization.
              BundleMember->Dependencies++;
              BundleMember->incrementUnscheduledDeps(1);
            }
          }
        }

        // Handle the memory dependencies.
        ScheduleData *DepDest = BundleMember->NextLoadStore;
        if (DepDest) {
          Instruction *SrcInst = BundleMember->Inst;
          MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
          bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
          unsigned numAliased = 0;
          unsigned DistToSrc = 1;

          while (DepDest) {
            assert(isInSchedulingRegion(DepDest));

            // We have two limits to reduce the complexity:
            // 1) AliasedCheckLimit: It's a small limit to reduce calls to
            //    SLP->isAliased (which is the expensive part in this loop).
            // 2) MaxMemDepDistance: It's for very large blocks and it aborts
            //    the whole loop (even if the loop is fast, it's quadratic).
            //    It's important for the loop break condition (see below) to
            //    check this limit even between two read-only instructions.
            if (DistToSrc >= MaxMemDepDistance ||
                    ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
                     (numAliased >= AliasedCheckLimit ||
                      SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {

              // We increment the counter only if the locations are aliased
              // (instead of counting all alias checks). This gives a better
              // balance between reduced runtime and accurate dependencies.
              numAliased++;

              DepDest->MemoryDependencies.push_back(BundleMember);
              BundleMember->Dependencies++;
              ScheduleData *DestBundle = DepDest->FirstInBundle;
              if (!DestBundle->IsScheduled) {
                BundleMember->incrementUnscheduledDeps(1);
              }
              if (!DestBundle->hasValidDependencies()) {
                WorkList.push_back(DestBundle);
              }
            }
            DepDest = DepDest->NextLoadStore;

            // Example, explaining the loop break condition: Let's assume our
            // starting instruction is i0 and MaxMemDepDistance = 3.
            //
            //                      +--------v--v--v
            //             i0,i1,i2,i3,i4,i5,i6,i7,i8
            //             +--------^--^--^
            //
            // MaxMemDepDistance lets us stop alias-checking at i3, and we add
            // dependencies from i0 to i3,i4,.. (even if they are not aliased).
            // Previously we already added dependencies from i3 to i6,i7,i8
            // (because of MaxMemDepDistance). As we added a dependency from
            // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
            // and we can abort this loop at i6.
            if (DistToSrc >= 2 * MaxMemDepDistance)
              break;
            DistToSrc++;
          }
        }
      }
      BundleMember = BundleMember->NextInBundle;
    }
    if (InsertInReadyList && SD->isReady()) {
      ReadyInsts.push_back(SD);
      LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst
                        << "\n");
    }
  }
}
653k
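// A minimal standalone sketch (not part of this file) of the throttled
// dependence scan above, under simplified assumptions: MemOp, Location, and
// mayAlias() are hypothetical stand-ins for ScheduleData, MemoryLocation,
// and the AliasAnalysis-backed isAliased() query.
//
// #include <cstddef>
// #include <vector>
//
// namespace sketch {
//
// struct MemOp {
//   bool MayWrite = false;         // analogous to Inst->mayWriteToMemory()
//   int Location = 0;              // toy "address"; stand-in for MemoryLocation
//   std::vector<std::size_t> Deps; // later mem-ops this op conflicts with
// };
//
// // Toy alias oracle; the real pass queries AliasAnalysis instead.
// inline bool mayAlias(const MemOp &A, const MemOp &B) {
//   return A.Location == B.Location;
// }
//
// inline void addMemoryDeps(std::vector<MemOp> &Ops, std::size_t Src,
//                           unsigned AliasedCheckLimit,
//                           unsigned MaxMemDepDistance) {
//   unsigned NumAliased = 0;
//   unsigned DistToSrc = 1;
//   for (std::size_t I = Src + 1; I < Ops.size(); ++I) {
//     // Past MaxMemDepDistance, or once the alias-check budget is spent, we
//     // stop asking the oracle and pessimistically record a dependency.
//     if (DistToSrc >= MaxMemDepDistance ||
//         ((Ops[Src].MayWrite || Ops[I].MayWrite) &&
//          (NumAliased >= AliasedCheckLimit || mayAlias(Ops[Src], Ops[I])))) {
//       ++NumAliased; // count recorded dependencies, not every alias query
//       Ops[Src].Deps.push_back(I);
//     }
//     // Ops beyond 2 * MaxMemDepDistance are already reachable transitively
//     // through the pessimistic dependencies added above, so stop scanning.
//     if (DistToSrc >= 2 * MaxMemDepDistance)
//       break;
//     ++DistToSrc;
//   }
// }
//
// } // namespace sketch
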
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
    doForAllOpcodes(I, [&](ScheduleData *SD) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    });
  }
  ReadyInsts.clear();
}

void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated and fill the ready-list with
  // initial instructions.
  int Idx = 0;
  int NumToSchedule = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
      assert(SD->isPartOfBundle() ==
                 (getTreeEntry(SD->Inst) != nullptr) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;
      if (SD->isSchedulingEntity()) {
        BS->calculateDependencies(SD, false, this);
        NumToSchedule++;
      }
    });
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    ScheduleData *BundleMember = picked;
    while (BundleMember) {
      Instruction *pickedInst = BundleMember->Inst;
      if (LastScheduledInst->getNextNode() != pickedInst) {
        BS->BB->getInstList().remove(pickedInst);
        BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
                                     pickedInst);
      }
      LastScheduledInst = pickedInst;
      BundleMember = BundleMember->NextInBundle;
    }

    BS->schedule(picked, ReadyInsts);
    NumToSchedule--;
  }
  assert(NumToSchedule == 0 && "could not schedule all instructions");

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}

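// A minimal standalone sketch (not part of this file) of the ready-list loop
// above: a node becomes ready once all of its dependencies are scheduled, and
// among ready nodes we always pick by SchedulingPriority to keep the result
// close to the original order. The real scheduler works bottom-up, placing
// each picked bundle above the previously placed one, so its comparator
// prefers the highest priority; this sketch schedules top-down with the
// comparison flipped accordingly. Node is a hypothetical stand-in for
// ScheduleData, and priorities are assumed unique.
//
// #include <cassert>
// #include <set>
// #include <vector>
//
// namespace sketch {
//
// struct Node {
//   int Priority = 0;             // original position in the block (unique)
//   std::vector<Node *> Users;    // nodes waiting on this one
//   unsigned UnscheduledDeps = 0; // not-yet-scheduled dependencies
// };
//
// struct NodeCompare {
//   bool operator()(const Node *A, const Node *B) const {
//     return A->Priority < B->Priority; // earliest original position first
//   }
// };
//
// inline std::vector<Node *> scheduleAll(std::vector<Node> &Nodes) {
//   std::set<Node *, NodeCompare> Ready;
//   for (Node &N : Nodes)
//     if (N.UnscheduledDeps == 0)
//       Ready.insert(&N);
//
//   std::vector<Node *> Order;
//   while (!Ready.empty()) {
//     Node *Picked = *Ready.begin(); // best-priority ready node
//     Ready.erase(Ready.begin());
//     Order.push_back(Picked);
//     // Scheduling Picked may make some of its users ready.
//     for (Node *U : Picked->Users)
//       if (--U->UnscheduledDeps == 0)
//         Ready.insert(U);
//   }
//   assert(Order.size() == Nodes.size() && "could not schedule all nodes");
//   return Order;
// }
//
// } // namespace sketch
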
unsigned BoUpSLP::getVectorElementSize(Value *V) const {
  // If V is a store, just return the width of the stored value without
  // traversing the expression tree. This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  SmallVector<Instruction *, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V))
    Worklist.push_back(I);

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  auto MaxWidth = 0u;
  auto FoundUnknownInst = false;
  while (!Worklist.empty() && !FoundUnknownInst) {
    auto *I = Worklist.pop_back_val();
    Visited.insert(I);

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, give up.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      FoundUnknownInst = true;

    // If the current instruction is a load, update MaxWidth to reflect the
    // width of the loaded value.
    else if (isa<LoadInst>(I))
      MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited, we add it to the worklist.
    else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
             isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
      for (Use &U : I->operands())
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (!Visited.count(J))
            Worklist.push_back(J);
    }

    // If we don't yet handle the instruction, give up.
    else
      FoundUnknownInst = true;
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V.
  if (!MaxWidth || FoundUnknownInst)
    return DL->getTypeSizeInBits(V->getType());

  // Otherwise, return the maximum width we found.
  return MaxWidth;
}

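// A minimal standalone sketch (not part of this file) of the worklist walk
// above, omitting the vector-type check: starting from a root, follow the
// operands of "transparent" nodes and record the widest load seen; any
// unhandled node aborts the search. Expr and Kind are hypothetical stand-ins
// for llvm::Instruction and its opcode.
//
// #include <algorithm>
// #include <unordered_set>
// #include <vector>
//
// namespace sketch {
//
// enum class Kind { Load, Arith, Unknown };
//
// struct Expr {
//   Kind K;
//   unsigned Bits;                // result width in bits
//   std::vector<Expr *> Operands;
// };
//
// // Returns the widest load feeding Root, or Root's own width if the walk
// // finds no load or hits a node it does not understand.
// inline unsigned elementSizeInBits(Expr *Root) {
//   std::vector<Expr *> Worklist{Root};
//   std::unordered_set<Expr *> Visited;
//   unsigned MaxWidth = 0;
//   bool FoundUnknown = false;
//   while (!Worklist.empty() && !FoundUnknown) {
//     Expr *E = Worklist.back();
//     Worklist.pop_back();
//     Visited.insert(E);
//     if (E->K == Kind::Load)
//       MaxWidth = std::max(MaxWidth, E->Bits);
//     else if (E->K == Kind::Arith) {
//       for (Expr *Op : E->Operands)
//         if (!Visited.count(Op))
//           Worklist.push_back(Op);
//     } else
//       FoundUnknown = true;
//   }
//   return (MaxWidth == 0 || FoundUnknown) ? Root->Bits : MaxWidth;
// }
//
// } // namespace sketch
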
// Determine if a value V in a vectorizable expression Expr can be demoted to a
// smaller type with a truncation. We collect the values that will be demoted
// in ToDemote and additional roots that require investigating in Roots.
static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
                                  SmallVectorImpl<Value *> &ToDemote,
                                  SmallVectorImpl<Value *> &Roots) {
  // We can always demote constants.
  if (isa<Constant>(V)) {
    ToDemote.push_back(V);
    return true;
  }

  // If the value is not an instruction in the expression with only one use, it
  // cannot be demoted.
  auto *I = dyn_cast<Instruction>(V);
  if (!I || !I->hasOneUse() || !Expr.count(I))
    return false;

  switch (I->getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    Roots.push_back(I->getOperand(0));
    break;
  case Instruction::ZExt:
  case Instruction::SExt:
    break;

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
        !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
      return false;
    break;

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
        !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
      return false;
    break;
  }

  // We can demote phis if we can demote all their incoming operands. Note that
  // we don't need to worry about cycles since we ensure single use above.
  case Instruction::PHI: {
    PHINode *PN = cast<PHINode>(I);
    for (Value *IncValue : PN->incoming_values())
      if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
        return false;
    break;
  }

  // Otherwise, conservatively give up.
  default:
    return false;
  }

  // Record the value that we can demote.
  ToDemote.push_back(V);
  return true;
}

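// A minimal standalone sketch (not part of this file) of the recursion above:
// a value can be computed in a narrower type if it is a constant, or a
// single-use trunc/ext/add/... whose operands can all be demoted as well.
// For instance, in
//   %a = zext i8 %x to i32
//   %b = zext i8 %y to i32
//   %c = add i32 %a, %b
// the add is demotable to i8 because both zexts are. The Val type below is a
// hypothetical stand-in for llvm::Value.
//
// #include <vector>
//
// namespace sketch {
//
// enum class Op { Const, Trunc, ZExt, SExt, Add, Other };
//
// struct Val {
//   Op Kind;
//   unsigned NumUses;
//   std::vector<Val *> Operands;
// };
//
// inline bool canDemote(Val *V, std::vector<Val *> &ToDemote,
//                       std::vector<Val *> &Roots) {
//   if (V->Kind == Op::Const) { // constants always fit after truncation
//     ToDemote.push_back(V);
//     return true;
//   }
//   if (V->NumUses != 1) // a second user would still need the wide value
//     return false;
//   switch (V->Kind) {
//   case Op::Trunc:
//     Roots.push_back(V->Operands[0]); // may seed further demotion
//     break;
//   case Op::ZExt:
//   case Op::SExt:
//     break; // extensions simply disappear when narrowed
//   case Op::Add:
//     for (Val *Operand : V->Operands)
//       if (!canDemote(Operand, ToDemote, Roots))
//         return false;
//     break;
//   default:
//     return false; // conservatively give up
//   }
//   ToDemote.push_back(V);
//   return true;
// }
//
// } // namespace sketch
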
void BoUpSLP::computeMinimumValueSizes() {
  // If there are no external uses, the expression tree must be rooted by a
  // store. We can't demote in-memory values, so there is nothing to do here.
  if (ExternalUses.empty())
    return;

  // We only attempt to truncate integer expressions.
  auto &TreeRoot = VectorizableTree[0]->Scalars;
  auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
  if (!TreeRootIT)
    return;

  // If the expression is not rooted by a store, these roots should have
  // external uses. We will rely on InstCombine to rewrite the expression in
  // the narrower type. However, InstCombine only rewrites single-use values.
  // This means that if a tree entry other than a root is used externally, it
  // must have multiple uses and InstCombine will not rewrite it. The code
  // below ensures that only the roots are used externally.
  SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
  for (auto &EU : ExternalUses)
    if (!Expr.erase(EU.Scalar))
      return;
  if (!Expr.empty())
    return;

  // Collect the scalar values of the vectorizable expression. We will use this
  // context to determine which values can be demoted. If we see a truncation,
  // we mark it as seeding another demotion.
  for (auto &EntryPtr : VectorizableTree)
    Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());

  // Ensure the roots of the vectorizable tree don't form a cycle. They must
  // have a single external user that is not in the vectorizable tree.
  for (auto *Root : TreeRoot)
    if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
      return;

  // Conservatively determine if we can actually truncate the roots of the
  // expression. Collect the values that can be demoted in ToDemote and
  // additional roots that require investigating in Roots.
  SmallVector<Value *, 32> ToDemote;
  SmallVector<Value *, 4> Roots;
  for (auto *Root : TreeRoot)
    if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
      return;

  // The maximum bit width required to represent all the values that can be
  // demoted without loss of precision. It would be safe to truncate the roots
  // of the expression to this width.
  auto MaxBitWidth = 8u;

  // We first check if all the bits of the roots are demanded. If they're not,
  // we can truncate the roots to this narrower type.
  for (auto *Root : TreeRoot) {
    auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
    MaxBitWidth = std::max<unsigned>(
        Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
  }

  // True if the roots can be zero-extended back to their original type, rather
  // than sign-extended. We know that if the leading bits are not demanded, we
  // can safely zero-extend. So we initialize IsKnownPositive to True.
  bool IsKnownPositive = true;

  // If all the bits of the roots are demanded, we can try a little harder to
  // compute a narrower type. This can happen, for example, if the roots are
  // getelementptr indices. InstCombine promotes these indices to the pointer
  // width. Thus, all their bits are technically demanded even though the
  // address computation might be vectorized in a smaller type.
  //
  // We start by looking at each entry that can be demoted. We compute the
  // maximum bit width required to store the scalar by using ValueTracking to
  // compute the number of high-order bits we can truncate.
  if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&